diff --git a/.config/hakari.toml b/.config/hakari.toml index 9913ecc9c0..b5990d090e 100644 --- a/.config/hakari.toml +++ b/.config/hakari.toml @@ -23,10 +23,30 @@ platforms = [ ] [final-excludes] -# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but -# it is built primarly in separate repo neondatabase/autoscaling and thus is excluded -# from depending on workspace-hack because most of the dependencies are not used. -workspace-members = ["vm_monitor"] +workspace-members = [ + # vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but + # it is built primarily in separate repo neondatabase/autoscaling and thus is excluded + # from depending on workspace-hack because most of the dependencies are not used. + "vm_monitor", + # All of these exist in libs and are not usually built independently. + # Putting workspace hack there adds a bottleneck for cargo builds. + "compute_api", + "consumption_metrics", + "desim", + "metrics", + "pageserver_api", + "postgres_backend", + "postgres_connection", + "postgres_ffi", + "pq_proto", + "remote_storage", + "safekeeper_api", + "tenant_size_model", + "tracing-utils", + "utils", + "wal_craft", + "walproposer", +] # Write out exact versions rather than a semver range. (Defaults to false.) 
# exact-versions = true diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000..6ba6b3c887 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +# allows for nicer hunk headers with git show +*.rs diff=rust diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 37983798b7..4ad8a7b460 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -1,13 +1,15 @@ self-hosted-runner: labels: - arm64 - - gen3 - large - large-arm64 - small - small-arm64 - us-east-2 config-variables: + - BENCHMARK_PROJECT_ID_PUB + - BENCHMARK_PROJECT_ID_SUB - REMOTE_STORAGE_AZURE_CONTAINER - REMOTE_STORAGE_AZURE_REGION - SLACK_UPCOMING_RELEASE_CHANNEL_ID + - DEV_AWS_OIDC_ROLE_ARN diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index f84beff20c..11adc8df86 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -183,7 +183,7 @@ runs: uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} - name: Store Allure test stat in the DB (new) if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }} diff --git a/.github/actions/download/action.yml b/.github/actions/download/action.yml index ce26e7825b..01c216b1ac 100644 --- a/.github/actions/download/action.yml +++ b/.github/actions/download/action.yml @@ -26,7 +26,7 @@ runs: TARGET: ${{ inputs.path }} ARCHIVE: /tmp/downloads/${{ inputs.name }}.tar.zst SKIP_IF_DOES_NOT_EXIST: ${{ inputs.skip-if-does-not-exist }} - PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}', github.run_id, github.run_attempt) }} + PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}/{2}', github.event.pull_request.head.sha || github.sha, github.run_id, github.run_attempt) }} run: | BUCKET=neon-github-public-dev 
FILENAME=$(basename $ARCHIVE) diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index 16759ad038..f4a194639f 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -9,16 +9,13 @@ inputs: description: 'Region ID, if not set the project will be created in the default region' default: aws-us-east-2 postgres_version: - description: 'Postgres version; default is 15' - default: '15' + description: 'Postgres version; default is 16' + default: '16' api_host: description: 'Neon API host' default: console-stage.neon.build - provisioner: - description: 'k8s-pod or k8s-neonvm' - default: 'k8s-pod' compute_units: - description: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal' + description: '[Min, Max] compute units' default: '[1, 1]' outputs: @@ -37,10 +34,6 @@ runs: # A shell without `set -x` to not to expose password/dsn in logs shell: bash -euo pipefail {0} run: | - if [ "${PROVISIONER}" == "k8s-pod" ] && [ "${MIN_CU}" != "${MAX_CU}" ]; then - echo >&2 "For k8s-pod provisioner MIN_CU should be equal to MAX_CU" - fi - project=$(curl \ "https://${API_HOST}/api/v2/projects" \ --fail \ @@ -52,7 +45,7 @@ runs: \"name\": \"Created by actions/neon-project-create; GITHUB_RUN_ID=${GITHUB_RUN_ID}\", \"pg_version\": ${POSTGRES_VERSION}, \"region_id\": \"${REGION_ID}\", - \"provisioner\": \"${PROVISIONER}\", + \"provisioner\": \"k8s-neonvm\", \"autoscaling_limit_min_cu\": ${MIN_CU}, \"autoscaling_limit_max_cu\": ${MAX_CU}, \"settings\": { } @@ -75,6 +68,5 @@ runs: API_KEY: ${{ inputs.api_key }} REGION_ID: ${{ inputs.region_id }} POSTGRES_VERSION: ${{ inputs.postgres_version }} - PROVISIONER: ${{ inputs.provisioner }} MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }} MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }} diff --git a/.github/actions/run-python-test-set/action.yml 
b/.github/actions/run-python-test-set/action.yml index d9e543d4bb..6c2cee0971 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -43,7 +43,7 @@ inputs: pg_version: description: 'Postgres version to use for tests' required: false - default: 'v14' + default: 'v16' benchmark_durations: description: 'benchmark durations JSON' required: false @@ -56,14 +56,14 @@ runs: if: inputs.build_type != 'remote' uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact path: /tmp/neon - name: Download Neon binaries for the previous release if: inputs.build_type != 'remote' uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact path: /tmp/neon-previous prefix: latest @@ -83,13 +83,12 @@ runs: uses: actions/checkout@v4 with: submodules: true - fetch-depth: 1 - name: Cache poetry deps uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} - name: Install Python deps shell: bash -euxo pipefail {0} @@ -114,6 +113,8 @@ runs: export PLATFORM=${PLATFORM:-github-actions-selfhosted} export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install} export DEFAULT_PG_VERSION=${PG_VERSION#v} + export LD_LIBRARY_PATH=${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/lib + export BENCHMARK_CONNSTR=${BENCHMARK_CONNSTR:-} if [ "${BUILD_TYPE}" = "remote" ]; then export REMOTE_ENV=1 @@ -129,8 +130,8 @@ runs: exit 1 fi if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then - # -n16 uses sixteen processes to run tests via pytest-xdist - EXTRA_PARAMS="-n16 $EXTRA_PARAMS" + # -n sets the number of parallel 
processes that pytest-xdist will run + EXTRA_PARAMS="-n12 $EXTRA_PARAMS" # --dist=loadgroup points tests marked with @pytest.mark.xdist_group # to the same worker to make @pytest.mark.order work with xdist @@ -168,23 +169,28 @@ runs: EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS" fi - if [[ "${{ inputs.build_type }}" == "debug" ]]; then + if [[ $BUILD_TYPE == "debug" && $RUNNER_ARCH == 'X64' ]]; then cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run) - elif [[ "${{ inputs.build_type }}" == "release" ]]; then - cov_prefix=() else cov_prefix=() fi # Wake up the cluster if we use remote neon instance if [ "${{ inputs.build_type }}" = "remote" ] && [ -n "${BENCHMARK_CONNSTR}" ]; then - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();" + QUERIES=("SELECT version()") + if [[ "${PLATFORM}" = "neon"* ]]; then + QUERIES+=("SHOW neon.tenant_id") + QUERIES+=("SHOW neon.timeline_id") + fi + + for q in "${QUERIES[@]}"; do + ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "${q}" + done fi # Run the tests. # - # The junit.xml file allows CI tools to display more fine-grained test information - # in its "Tests" tab in the results page. 
+ # --alluredir saves test results in Allure format (in a specified directory) # --verbose prints name of each test (helpful when there are # multiple tests in one file) # -rA prints summary in the end @@ -193,7 +199,6 @@ runs: # mkdir -p $TEST_OUTPUT/allure/results "${cov_prefix[@]}" ./scripts/pytest \ - --junitxml=$TEST_OUTPUT/junit.xml \ --alluredir=$TEST_OUTPUT/allure/results \ --tb=short \ --verbose \ diff --git a/.github/actions/set-docker-config-dir/action.yml b/.github/actions/set-docker-config-dir/action.yml new file mode 100644 index 0000000000..3ee8bec8c6 --- /dev/null +++ b/.github/actions/set-docker-config-dir/action.yml @@ -0,0 +1,36 @@ +name: "Set custom docker config directory" +description: "Create a directory for docker config and set DOCKER_CONFIG" + +# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings +runs: + using: "composite" + steps: + - name: Show warning on GitHub-hosted runners + if: runner.environment == 'github-hosted' + shell: bash -euo pipefail {0} + run: | + # Using the following environment variables to find a path to the workflow file + # ${GITHUB_WORKFLOW_REF} - octocat/hello-world/.github/workflows/my-workflow.yml@refs/heads/my_branch + # ${GITHUB_REPOSITORY} - octocat/hello-world + # ${GITHUB_REF} - refs/heads/my_branch + # From https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/variables + + filename_with_ref=${GITHUB_WORKFLOW_REF#"$GITHUB_REPOSITORY/"} + filename=${filename_with_ref%"@$GITHUB_REF"} + + # https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#setting-a-warning-message + title='Unnecessary usage of `.github/actions/set-docker-config-dir`' + message='No need to use `.github/actions/set-docker-config-dir` action on GitHub-hosted runners' + echo "::warning file=${filename},title=${title}::${message}" + + - uses: pyTooling/Actions/with-post-step@74afc5a42a17a046c90c68cb5cfa627e5c6c5b6b # 
v1.0.7 + env: + DOCKER_CONFIG: .docker-custom-${{ github.run_id }}-${{ github.run_attempt }} + with: + main: | + mkdir -p "${DOCKER_CONFIG}" + echo DOCKER_CONFIG=${DOCKER_CONFIG} | tee -a $GITHUB_ENV + post: | + if [ -d "${DOCKER_CONFIG}" ]; then + rm -r "${DOCKER_CONFIG}" + fi diff --git a/.github/actions/upload/action.yml b/.github/actions/upload/action.yml index 63973dfbe7..edcece7d2b 100644 --- a/.github/actions/upload/action.yml +++ b/.github/actions/upload/action.yml @@ -8,7 +8,7 @@ inputs: description: "A directory or file to upload" required: true prefix: - description: "S3 prefix. Default is '${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'" + description: "S3 prefix. Default is '${GITHUB_SHA}/${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'" required: false runs: @@ -45,7 +45,7 @@ runs: env: SOURCE: ${{ inputs.path }} ARCHIVE: /tmp/uploads/${{ inputs.name }}.tar.zst - PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}', github.run_id, github.run_attempt) }} + PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}/{2}', github.event.pull_request.head.sha || github.sha, github.run_id , github.run_attempt) }} run: | BUCKET=neon-github-public-dev FILENAME=$(basename $ARCHIVE) diff --git a/.github/workflows/_benchmarking_preparation.yml b/.github/workflows/_benchmarking_preparation.yml new file mode 100644 index 0000000000..a52e43b4da --- /dev/null +++ b/.github/workflows/_benchmarking_preparation.yml @@ -0,0 +1,154 @@ +name: Prepare benchmarking databases by restoring dumps + +on: + workflow_call: + # no inputs needed + +defaults: + run: + shell: bash -euxo pipefail {0} + +jobs: + setup-databases: + strategy: + fail-fast: false + matrix: + platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres, neon ] + database: [ clickbench, tpch, userexample ] + + env: + LD_LIBRARY_PATH: /tmp/neon/pg_install/v16/lib + PLATFORM: ${{ matrix.platform }} + PG_BINARIES: /tmp/neon/pg_install/v16/bin + + runs-on: [ self-hosted, us-east-2, x64 ] + container: + image: 
369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + options: --init + + steps: + - name: Set up Connection String + id: set-up-prep-connstr + run: | + case "${PLATFORM}" in + neon) + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} + ;; + aws-rds-postgres) + CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }} + ;; + aws-aurora-serverless-v2-postgres) + CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CONNSTR }} + ;; + *) + echo >&2 "Unknown PLATFORM=${PLATFORM}" + exit 1 + ;; + esac + + echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT + + - uses: actions/checkout@v4 + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + + # we create a table that has one row for each database that we want to restore with the status whether the restore is done + - name: Create benchmark_restore_status table if it does not exist + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }} + DATABASE_NAME: ${{ matrix.database }} + # to avoid a race condition of multiple jobs trying to create the table at the same time, + # we use an advisory lock + run: | + ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c " + SELECT pg_advisory_lock(4711); + CREATE TABLE IF NOT EXISTS benchmark_restore_status ( + databasename text primary key, + restore_done boolean + ); + SELECT pg_advisory_unlock(4711); + " + + - name: Check if restore is already done + id: check-restore-done + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }} + DATABASE_NAME: ${{ matrix.database }} + run: | + skip=false + if ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -tAc "SELECT 1 FROM benchmark_restore_status WHERE databasename='${{ env.DATABASE_NAME }}' AND restore_done=true;" | grep -q 1; then + echo "Restore already done for database ${{ env.DATABASE_NAME }} on platform ${{ env.PLATFORM }}. Skipping this database." 
+ skip=true + fi + echo "skip=${skip}" | tee -a $GITHUB_OUTPUT + + - name: Check and create database if it does not exist + if: steps.check-restore-done.outputs.skip != 'true' + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }} + DATABASE_NAME: ${{ matrix.database }} + run: | + DB_EXISTS=$(${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -tAc "SELECT 1 FROM pg_database WHERE datname='${{ env.DATABASE_NAME }}'") + if [ "$DB_EXISTS" != "1" ]; then + echo "Database ${{ env.DATABASE_NAME }} does not exist. Creating it..." + ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c "CREATE DATABASE \"${{ env.DATABASE_NAME }}\";" + else + echo "Database ${{ env.DATABASE_NAME }} already exists." + fi + + - name: Download dump from S3 to /tmp/dumps + if: steps.check-restore-done.outputs.skip != 'true' + env: + DATABASE_NAME: ${{ matrix.database }} + run: | + mkdir -p /tmp/dumps + aws s3 cp s3://neon-github-dev/performance/pgdumps/$DATABASE_NAME/$DATABASE_NAME.pg_dump /tmp/dumps/ + + - name: Replace database name in connection string + if: steps.check-restore-done.outputs.skip != 'true' + id: replace-dbname + env: + DATABASE_NAME: ${{ matrix.database }} + BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }} + run: | + # Extract the part before the database name + base_connstr="${BENCHMARK_CONNSTR%/*}" + # Extract the query parameters (if any) after the database name + query_params="${BENCHMARK_CONNSTR#*\?}" + # Reconstruct the new connection string + if [ "$query_params" != "$BENCHMARK_CONNSTR" ]; then + new_connstr="${base_connstr}/${DATABASE_NAME}?${query_params}" + else + new_connstr="${base_connstr}/${DATABASE_NAME}" + fi + echo "database_connstr=${new_connstr}" >> $GITHUB_OUTPUT + + - name: Restore dump + if: steps.check-restore-done.outputs.skip != 'true' + env: + DATABASE_NAME: ${{ matrix.database }} + DATABASE_CONNSTR: ${{ steps.replace-dbname.outputs.database_connstr }} + # the following works only with larger computes: + # 
PGOPTIONS: "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7" + # we add the || true because: + # the dumps were created with Neon and contain neon extensions that are not + # available in RDS, so we will always report an error, but we can ignore it + run: | + ${PG_BINARIES}/pg_restore --clean --if-exists --no-owner --jobs=4 \ + -d "${DATABASE_CONNSTR}" /tmp/dumps/${DATABASE_NAME}.pg_dump || true + + - name: Update benchmark_restore_status table + if: steps.check-restore-done.outputs.skip != 'true' + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }} + DATABASE_NAME: ${{ matrix.database }} + run: | + ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c " + INSERT INTO benchmark_restore_status (databasename, restore_done) VALUES ('${{ env.DATABASE_NAME }}', true) + ON CONFLICT (databasename) DO UPDATE SET restore_done = true; + " diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml new file mode 100644 index 0000000000..5e9fff0e6a --- /dev/null +++ b/.github/workflows/_build-and-test-locally.yml @@ -0,0 +1,297 @@ +name: Build and Test Locally + +on: + workflow_call: + inputs: + arch: + description: 'x64 or arm64' + required: true + type: string + build-tag: + description: 'build tag' + required: true + type: string + build-tools-image: + description: 'build-tools image' + required: true + type: string + build-type: + description: 'debug or release' + required: true + type: string + pg-versions: + description: 'a json array of postgres versions to run regression tests on' + required: true + type: string + +defaults: + run: + shell: bash -euxo pipefail {0} + +env: + RUST_BACKTRACE: 1 + COPT: '-Werror' + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + +jobs: + build-neon: + runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }} + container: + image: 
${{ inputs.build-tools-image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + # Raise locked memory limit for tokio-epoll-uring. + # On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12), + # io_uring will account the memory of the CQ and SQ as locked. + # More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391 + options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 + env: + BUILD_TYPE: ${{ inputs.build-type }} + GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }} + BUILD_TAG: ${{ inputs.build-tag }} + + steps: + - name: Fix git ownership + run: | + # Workaround for `fatal: detected dubious ownership in repository at ...` + # + # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers + # Ref https://github.com/actions/checkout/issues/785 + # + git config --global --add safe.directory ${{ github.workspace }} + git config --global --add safe.directory ${GITHUB_WORKSPACE} + for r in 14 15 16; do + git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" + git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" + done + + - uses: actions/checkout@v4 + with: + submodules: true + + - name: Set pg 14 revision for caching + id: pg_v14_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT + + - name: Set pg 15 revision for caching + id: pg_v15_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT + + - name: Set pg 16 revision for caching + id: pg_v16_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT + + # Set some environment variables used by all the steps. + # + # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc. 
+ # It also includes --features, if any + # + # CARGO_FEATURES is passed to "cargo metadata". It is separate from CARGO_FLAGS, + # because "cargo metadata" doesn't accept --release or --debug options + # + # We run tests with additional features that are turned off by default (e.g. in release builds), see + # corresponding Cargo.toml files for their descriptions. + - name: Set env variables + env: + ARCH: ${{ inputs.arch }} + run: | + CARGO_FEATURES="--features testing" + if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then + cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run" + CARGO_FLAGS="--locked" + elif [[ $BUILD_TYPE == "debug" ]]; then + cov_prefix="" + CARGO_FLAGS="--locked" + elif [[ $BUILD_TYPE == "release" ]]; then + cov_prefix="" + CARGO_FLAGS="--locked --release" + fi + { + echo "cov_prefix=${cov_prefix}" + echo "CARGO_FEATURES=${CARGO_FEATURES}" + echo "CARGO_FLAGS=${CARGO_FLAGS}" + echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" + } >> $GITHUB_ENV + + - name: Cache postgres v14 build + id: cache_pg_14 + uses: actions/cache@v4 + with: + path: pg_install/v14 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + + - name: Cache postgres v15 build + id: cache_pg_15 + uses: actions/cache@v4 + with: + path: pg_install/v15 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + + - name: Cache postgres v16 build + id: cache_pg_16 + uses: actions/cache@v4 + with: + path: pg_install/v16 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + + - name: Build postgres v14 + if: steps.cache_pg_14.outputs.cache-hit != 'true' + run: mold -run make postgres-v14 -j$(nproc) + + - name: Build 
postgres v15 + if: steps.cache_pg_15.outputs.cache-hit != 'true' + run: mold -run make postgres-v15 -j$(nproc) + + - name: Build postgres v16 + if: steps.cache_pg_16.outputs.cache-hit != 'true' + run: mold -run make postgres-v16 -j$(nproc) + + - name: Build neon extensions + run: mold -run make neon-pg-ext -j$(nproc) + + - name: Build walproposer-lib + run: mold -run make walproposer-lib -j$(nproc) + + - name: Run cargo build + run: | + PQ_LIB_DIR=$(pwd)/pg_install/v16/lib + export PQ_LIB_DIR + ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests + + # Do install *before* running rust tests because they might recompile the + # binaries with different features/flags. + - name: Install rust binaries + env: + ARCH: ${{ inputs.arch }} + run: | + # Install target binaries + mkdir -p /tmp/neon/bin/ + binaries=$( + ${cov_prefix} cargo metadata $CARGO_FEATURES --format-version=1 --no-deps | + jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name' + ) + for bin in $binaries; do + SRC=target/$BUILD_TYPE/$bin + DST=/tmp/neon/bin/$bin + cp "$SRC" "$DST" + done + + # Install test executables and write list of all binaries (for code coverage) + if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then + # Keep bloated coverage data files away from the rest of the artifact + mkdir -p /tmp/coverage/ + + mkdir -p /tmp/neon/test_bin/ + + test_exe_paths=$( + ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES --message-format=json --no-run | + jq -r '.executable | select(. != null)' + ) + for bin in $test_exe_paths; do + SRC=$bin + DST=/tmp/neon/test_bin/$(basename $bin) + + # We don't need debug symbols for code coverage, so strip them out to make + # the artifact smaller. 
+ strip "$SRC" -o "$DST" + echo "$DST" >> /tmp/coverage/binaries.list + done + + for bin in $binaries; do + echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list + done + fi + + - name: Run rust tests + env: + NEXTEST_RETRIES: 3 + run: | + PQ_LIB_DIR=$(pwd)/pg_install/v16/lib + export PQ_LIB_DIR + LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib + export LD_LIBRARY_PATH + + #nextest does not yet support running doctests + ${cov_prefix} cargo test --doc $CARGO_FLAGS $CARGO_FEATURES + + for io_engine in std-fs tokio-epoll-uring ; do + NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES + done + + # Run separate tests for real S3 + export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty + export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests + export REMOTE_STORAGE_S3_REGION=eu-central-1 + ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_s3)' + + # Run separate tests for real Azure Blob Storage + # XXX: replace region with `eu-central-1`-like region + export ENABLE_REAL_AZURE_REMOTE_STORAGE=y + export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}" + export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}" + export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" + export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" + ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)' + + - name: Install postgres binaries + run: cp -a pg_install /tmp/neon/pg_install + + - name: Upload Neon artifact + uses: ./.github/actions/upload + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-artifact + path: /tmp/neon + + # XXX: keep this after the binaries.list is formed, so the coverage can properly work later + - name: Merge and upload coverage data + if: inputs.build-type == 'debug' + uses: 
./.github/actions/save-coverage-data + + regress-tests: + # Don't run regression tests on debug arm64 builds + if: inputs.build-type != 'debug' || inputs.arch != 'arm64' + needs: [ build-neon ] + runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }} + container: + image: ${{ inputs.build-tools-image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + # for changed limits, see comments on `options:` earlier in this file + options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 + strategy: + fail-fast: false + matrix: + pg_version: ${{ fromJson(inputs.pg-versions) }} + steps: + - uses: actions/checkout@v4 + with: + submodules: true + + - name: Pytest regression tests + uses: ./.github/actions/run-python-test-set + timeout-minutes: 60 + with: + build_type: ${{ inputs.build-type }} + test_selection: regress + needs_postgres_source: true + run_with_real_s3: true + real_s3_bucket: neon-github-ci-tests + real_s3_region: eu-central-1 + rerun_flaky: true + pg_version: ${{ matrix.pg_version }} + env: + TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} + CHECK_ONDISK_DATA_COMPATIBILITY: nonempty + BUILD_TAG: ${{ inputs.build-tag }} + PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring + + # Temporarily disable this step until we figure out why it's so flaky + # Ref https://github.com/neondatabase/neon/issues/4540 + - name: Merge and upload coverage data + if: | + false && + inputs.build-type == 'debug' && matrix.pg_version == 'v16' + uses: ./.github/actions/save-coverage-data diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml index 078c7f88c4..85cfe7446e 100644 --- a/.github/workflows/actionlint.yml +++ b/.github/workflows/actionlint.yml @@ -36,15 +36,16 @@ jobs: fail_on_error: true filter_mode: nofilter level: error - - run: | + + - name: Disallow 'ubuntu-latest' runners + run: | 
PAT='^\s*runs-on:.*-latest' - if grep -ERq $PAT .github/workflows - then + if grep -ERq $PAT .github/workflows; then grep -ERl $PAT .github/workflows |\ while read -r f do - l=$(grep -nE $PAT .github/workflows/release.yml | awk -F: '{print $1}' | head -1) - echo "::error file=$f,line=$l::Please, do not use ubuntu-latest images to run on, use LTS instead." + l=$(grep -nE $PAT $f | awk -F: '{print $1}' | head -1) + echo "::error file=$f,line=$l::Please use 'ubuntu-22.04' instead of 'ubuntu-latest'" done exit 1 fi diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 9eff483680..a4a597acde 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -56,28 +56,53 @@ concurrency: jobs: bench: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} + permissions: + contents: write + statuses: write + id-token: write # Required for OIDC authentication in azure runners + strategy: + fail-fast: false + matrix: + include: + - DEFAULT_PG_VERSION: 16 + PLATFORM: "neon-staging" + region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} + RUNNER: [ self-hosted, us-east-2, x64 ] + IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + - DEFAULT_PG_VERSION: 16 + PLATFORM: "azure-staging" + region_id: 'azure-eastus2' + RUNNER: [ self-hosted, eastus2, x64 ] + IMAGE: neondatabase/build-tools:pinned env: TEST_PG_BENCH_DURATIONS_MATRIX: "300" TEST_PG_BENCH_SCALES_MATRIX: "10,100" POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 14 + DEFAULT_PG_VERSION: ${{ matrix.DEFAULT_PG_VERSION }} TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} - PLATFORM: "neon-staging" + PLATFORM: ${{ matrix.PLATFORM }} - runs-on: [ self-hosted, us-east-2, x64 ] + runs-on: ${{ matrix.RUNNER }} container: - image: 
369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + image: ${{ matrix.IMAGE }} options: --init steps: - uses: actions/checkout@v4 + - name: Configure AWS credentials # necessary on Azure runners + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours + - name: Download Neon artifact uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-release-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest @@ -85,7 +110,7 @@ jobs: id: create-neon-project uses: ./.github/actions/neon-project-create with: - region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} + region_id: ${{ matrix.region_id }} postgres_version: ${{ env.DEFAULT_PG_VERSION }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} @@ -96,10 +121,18 @@ jobs: test_selection: performance run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} + pg_version: ${{ env.DEFAULT_PG_VERSION }} # Set --sparse-ordering option of pytest-order plugin # to ensure tests are running in order of appears in the file. 
# It's important for test_perf_pgbench.py::test_pgbench_remote_* tests - extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py --ignore test_runner/performance/test_perf_pgvector_queries.py + extra_params: + -m remote_cluster + --sparse-ordering + --timeout 14400 + --ignore test_runner/performance/test_perf_olap.py + --ignore test_runner/performance/test_perf_pgvector_queries.py + --ignore test_runner/performance/test_logical_replication.py + --ignore test_runner/performance/test_physical_replication.py env: BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -113,6 +146,7 @@ jobs: api_key: ${{ secrets.NEON_STAGING_API_KEY }} - name: Create Allure report + id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate @@ -121,7 +155,87 @@ jobs: uses: slackapi/slack-github-action@v1 with: channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Periodic perf testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + slack-message: | + Periodic perf testing: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + <${{ steps.create-allure-report.outputs.report-url }}|Allure report> + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + + replication-tests: + if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} + env: + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + DEFAULT_PG_VERSION: 16 + TEST_OUTPUT: /tmp/test_output + BUILD_TYPE: remote + SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} + PLATFORM: "neon-staging" + + runs-on: [ self-hosted, us-east-2, x64 ] + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + options: 
--init + + steps: + - uses: actions/checkout@v4 + + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + + - name: Run Logical Replication benchmarks + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance/test_logical_replication.py + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster --timeout 5400 + pg_version: ${{ env.DEFAULT_PG_VERSION }} + env: + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} + BENCHMARK_PROJECT_ID_PUB: ${{ vars.BENCHMARK_PROJECT_ID_PUB }} + BENCHMARK_PROJECT_ID_SUB: ${{ vars.BENCHMARK_PROJECT_ID_SUB }} + + - name: Run Physical Replication benchmarks + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance/test_physical_replication.py + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster --timeout 5400 + pg_version: ${{ env.DEFAULT_PG_VERSION }} + env: + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Create Allure report + id: create-allure-report + if: ${{ !cancelled() }} + uses: ./.github/actions/allure-report-generate + with: + store-test-results-into-db: true + env: + REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} + + - name: Post to a Slack channel + if: ${{ github.event.schedule && failure() }} + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C06T9AMNDQQ" # on-call-compute-staging-stream + slack-message: | + Periodic replication testing: ${{ job.status }} + 
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + <${{ steps.create-allure-report.outputs.report-url }}|Allure report> env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} @@ -130,13 +244,16 @@ jobs: # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday) # # Available platforms: - # - neon-captest-new: Freshly created project (1 CU) - # - neon-captest-freetier: Use freetier-sized compute (0.25 CU) - # - neon-captest-reuse: Reusing existing project + # - neonvm-captest-new: Freshly created project (1 CU) + # - neonvm-captest-freetier: Use freetier-sized compute (0.25 CU) + # - neonvm-azure-captest-new: Freshly created project (1 CU) in azure region + # - neonvm-azure-captest-freetier: Use freetier-sized compute (0.25 CU) in azure region + # - neonvm-captest-reuse: Reusing existing project # - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs # - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage env: RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }} + DEFAULT_REGION_ID: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} runs-on: ubuntu-22.04 outputs: pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }} @@ -147,23 +264,37 @@ jobs: - name: Generate matrix for pgbench benchmark id: pgbench-compare-matrix run: | + region_id_default=${{ env.DEFAULT_REGION_ID }} + runner_default='["self-hosted", "us-east-2", "x64"]' + runner_azure='["self-hosted", "eastus2", "x64"]' + image_default="369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned" matrix='{ + "pg_version" : [ + 16 + ], + "region_id" : [ + "'"$region_id_default"'" + ], "platform": [ - "neon-captest-new", - "neon-captest-reuse", + "neonvm-captest-new", + "neonvm-captest-reuse", "neonvm-captest-new" ], "db_size": [ "10gb" ], - "include": [{ "platform": "neon-captest-freetier", "db_size":
"3gb" }, - { "platform": "neon-captest-new", "db_size": "50gb" }, - { "platform": "neonvm-captest-freetier", "db_size": "3gb" }, - { "platform": "neonvm-captest-new", "db_size": "50gb" }, - { "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }] + "runner": ['"$runner_default"'], + "image": [ "'"$image_default"'" ], + "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }] }' - if [ "$(date +%A)" = "Saturday" ]; then - matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"}, - { "platform": "rds-aurora", "db_size": "50gb"}]') + if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then + matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 16, "region_id": 
"'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "rds-aurora", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -173,7 +304,7 @@ jobs: run: | matrix='{ "platform": [ - "neon-captest-reuse" + "neonvm-captest-reuse" ] }' @@ -189,7 +320,7 @@ jobs: run: | matrix='{ "platform": [ - "neon-captest-reuse" + "neonvm-captest-reuse" ], "scale": [ "10" @@ -203,9 +334,17 @@ jobs: echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT + prepare_AWS_RDS_databases: + uses: ./.github/workflows/_benchmarking_preparation.yml + secrets: inherit + pgbench-compare: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} - needs: [ generate-matrices ] + needs: [ generate-matrices, prepare_AWS_RDS_databases ] + permissions: + contents: write + statuses: write + id-token: write # Required for OIDC authentication in azure runners strategy: fail-fast: false @@ -215,15 +354,15 @@ jobs: TEST_PG_BENCH_DURATIONS_MATRIX: "60m" TEST_PG_BENCH_SCALES_MATRIX: ${{ matrix.db_size }} POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 14 + DEFAULT_PG_VERSION: ${{ matrix.pg_version }} TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.platform }} - runs-on: [ self-hosted, us-east-2, x64 ] + runs-on: ${{ matrix.runner }} container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + image: ${{ matrix.image }} options: --init # Increase timeout to 8h, default timeout is 6h @@ -232,40 +371,41 @@ jobs: steps: - uses: actions/checkout@v4 + - name: Configure AWS credentials # necessary on Azure runners + 
uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours + - name: Download Neon artifact uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-release-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - - name: Add Postgres binaries to PATH - run: | - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version - echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - - name: Create Neon Project - if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier"]'), matrix.platform) + if: contains(fromJson('["neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform) id: create-neon-project uses: ./.github/actions/neon-project-create with: - region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} + region_id: ${{ matrix.region_id }} postgres_version: ${{ env.DEFAULT_PG_VERSION }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} - compute_units: ${{ (matrix.platform == 'neon-captest-freetier' && '[0.25, 0.25]') || '[1, 1]' }} - provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }} + compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }} - name: Set up Connection String id: set-up-connstr run: | case "${PLATFORM}" in - neon-captest-reuse) + neonvm-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} ;; neonvm-captest-sharding-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }} ;; - neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier) + neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier) CONNSTR=${{ steps.create-neon-project.outputs.dsn }} 
;; rds-aurora) @@ -282,16 +422,6 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERIES=("SELECT version()") - if [[ "${PLATFORM}" = "neon"* ]]; then - QUERIES+=("SHOW neon.tenant_id") - QUERIES+=("SHOW neon.timeline_id") - fi - - for q in "${QUERIES[@]}"; do - psql ${CONNSTR} -c "${q}" - done - - name: Benchmark init uses: ./.github/actions/run-python-test-set with: @@ -300,6 +430,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -313,6 +444,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -326,6 +458,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -339,6 +472,7 @@ jobs: api_key: ${{ secrets.NEON_STAGING_API_KEY }} - name: Create Allure report + id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate @@ -347,11 +481,29 @@ jobs: uses: slackapi/slack-github-action@v1 with: channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Periodic perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + slack-message: | + Periodic perf testing on ${{ matrix.platform }}: ${{ job.status }} + 
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + <${{ steps.create-allure-report.outputs.report-url }}|Allure report> env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} pgbench-pgvector: + permissions: + contents: write + statuses: write + id-token: write # Required for OIDC authentication in azure runners + strategy: + fail-fast: false + matrix: + include: + - PLATFORM: "neonvm-captest-pgvector" + RUNNER: [ self-hosted, us-east-2, x64 ] + IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + - PLATFORM: "azure-captest-pgvector" + RUNNER: [ self-hosted, eastus2, x64 ] + IMAGE: neondatabase/build-tools:pinned + env: TEST_PG_BENCH_DURATIONS_MATRIX: "15m" TEST_PG_BENCH_SCALES_MATRIX: "1" @@ -359,43 +511,60 @@ jobs: DEFAULT_PG_VERSION: 16 TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote + LD_LIBRARY_PATH: /home/nonroot/pg/usr/lib/x86_64-linux-gnu SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} - PLATFORM: "neon-captest-pgvector" + PLATFORM: ${{ matrix.PLATFORM }} - runs-on: [ self-hosted, us-east-2, x64 ] + runs-on: ${{ matrix.RUNNER }} container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + image: ${{ matrix.IMAGE }} options: --init steps: - uses: actions/checkout@v4 - - name: Download Neon artifact - uses: ./.github/actions/download - with: - name: neon-${{ runner.os }}-release-artifact - path: /tmp/neon/ - prefix: latest - - - name: Add Postgres binaries to PATH + # until https://github.com/neondatabase/neon/issues/8275 is fixed we temporarily install postgresql-16 + # instead of using Neon artifacts containing pgbench + - name: Install postgresql-16 where pytest expects it run: | - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version - echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH + cd /home/nonroot + wget -q 
https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.4-1.pgdg110%2B1_amd64.deb + wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.4-1.pgdg110%2B1_amd64.deb + wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.4-1.pgdg110%2B1_amd64.deb + dpkg -x libpq5_16.4-1.pgdg110+1_amd64.deb pg + dpkg -x postgresql-client-16_16.4-1.pgdg110+1_amd64.deb pg + dpkg -x postgresql-16_16.4-1.pgdg110+1_amd64.deb pg + mkdir -p /tmp/neon/pg_install/v16/bin + ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench + ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql + ln -s /home/nonroot/pg/usr/lib/x86_64-linux-gnu /tmp/neon/pg_install/v16/lib + /tmp/neon/pg_install/v16/bin/pgbench --version + /tmp/neon/pg_install/v16/bin/psql --version - name: Set up Connection String id: set-up-connstr run: | - CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }} - + case "${PLATFORM}" in + neonvm-captest-pgvector) + CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }} + ;; + azure-captest-pgvector) + CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR_AZURE }} + ;; + *) + echo >&2 "Unknown PLATFORM=${PLATFORM}" + exit 1 + ;; + esac + echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERIES=("SELECT version()") - QUERIES+=("SHOW neon.tenant_id") - QUERIES+=("SHOW neon.timeline_id") - - for q in "${QUERIES[@]}"; do - psql ${CONNSTR} -c "${q}" - done + - name: Configure AWS credentials # necessary on Azure runners to read/write from/to S3 + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours - name: Benchmark pgvector hnsw indexing uses: ./.github/actions/run-python-test-set @@ -405,6 +574,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 
21600 -k test_pgvector_indexing + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -417,13 +587,15 @@ jobs: test_selection: performance/test_perf_pgvector_queries.py run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} - extra_params: -m remote_cluster --timeout 21600 + extra_params: -m remote_cluster --timeout 21600 + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" - + - name: Create Allure report + id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate @@ -432,11 +604,13 @@ jobs: uses: slackapi/slack-github-action@v1 with: channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Periodic perf testing neon-captest-pgvector: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + slack-message: | + Periodic perf testing on ${{ env.PLATFORM }}: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + <${{ steps.create-allure-report.outputs.report-url }}|Allure report> env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} - clickbench-compare: # ClichBench DB for rds-aurora and rds-Postgres deployed to the same clusters # we use for performance testing in pgbench-compare. 
@@ -446,7 +620,7 @@ jobs: # *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows # *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} - needs: [ generate-matrices, pgbench-compare ] + needs: [ generate-matrices, pgbench-compare, prepare_AWS_RDS_databases ] strategy: fail-fast: false @@ -454,7 +628,7 @@ jobs: env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 14 + DEFAULT_PG_VERSION: 16 TEST_OUTPUT: /tmp/test_output TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain }} TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements }} @@ -473,20 +647,15 @@ jobs: - name: Download Neon artifact uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-release-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - - name: Add Postgres binaries to PATH - run: | - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version - echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - - name: Set up Connection String id: set-up-connstr run: | case "${PLATFORM}" in - neon-captest-reuse) + neonvm-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }} ;; rds-aurora) @@ -496,23 +665,13 @@ jobs: CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CLICKBENCH_10M_CONNSTR }} ;; *) - echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'" + echo >&2 "Unknown PLATFORM=${PLATFORM}. 
Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'" exit 1 ;; esac echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERIES=("SELECT version()") - if [[ "${PLATFORM}" = "neon"* ]]; then - QUERIES+=("SHOW neon.tenant_id") - QUERIES+=("SHOW neon.timeline_id") - fi - - for q in "${QUERIES[@]}"; do - psql ${CONNSTR} -c "${q}" - done - - name: ClickBench benchmark uses: ./.github/actions/run-python-test-set with: @@ -521,6 +680,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_clickbench + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -530,6 +690,7 @@ jobs: TEST_OLAP_SCALE: 10 - name: Create Allure report + id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate @@ -538,7 +699,10 @@ jobs: uses: slackapi/slack-github-action@v1 with: channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Periodic OLAP perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + slack-message: | + Periodic OLAP perf testing on ${{ matrix.platform }}: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + <${{ steps.create-allure-report.outputs.report-url }}|Allure report> env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} @@ -550,7 +714,7 @@ jobs: # # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB) if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} - needs: [ generate-matrices, clickbench-compare ] + needs: [ generate-matrices, clickbench-compare, prepare_AWS_RDS_databases ] strategy: fail-fast: false @@ -558,7 +722,7 @@ jobs: env: POSTGRES_DISTRIB_DIR: 
/tmp/neon/pg_install - DEFAULT_PG_VERSION: 14 + DEFAULT_PG_VERSION: 16 TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} @@ -576,29 +740,24 @@ jobs: - name: Download Neon artifact uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-release-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - - name: Add Postgres binaries to PATH - run: | - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version - echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - - name: Get Connstring Secret Name run: | case "${PLATFORM}" in - neon-captest-reuse) + neonvm-captest-reuse) ENV_PLATFORM=CAPTEST_TPCH ;; rds-aurora) ENV_PLATFORM=RDS_AURORA_TPCH ;; rds-postgres) - ENV_PLATFORM=RDS_AURORA_TPCH + ENV_PLATFORM=RDS_POSTGRES_TPCH ;; *) - echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'" + echo >&2 "Unknown PLATFORM=${PLATFORM}. 
Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'" exit 1 ;; esac @@ -613,16 +772,6 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERIES=("SELECT version()") - if [[ "${PLATFORM}" = "neon"* ]]; then - QUERIES+=("SHOW neon.tenant_id") - QUERIES+=("SHOW neon.timeline_id") - fi - - for q in "${QUERIES[@]}"; do - psql ${CONNSTR} -c "${q}" - done - - name: Run TPC-H benchmark uses: ./.github/actions/run-python-test-set with: @@ -631,6 +780,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_tpch + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -638,6 +788,7 @@ jobs: TEST_OLAP_SCALE: ${{ matrix.scale }} - name: Create Allure report + id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate @@ -646,13 +797,16 @@ jobs: uses: slackapi/slack-github-action@v1 with: channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Periodic TPC-H perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + slack-message: | + Periodic TPC-H perf testing on ${{ matrix.platform }}: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + <${{ steps.create-allure-report.outputs.report-url }}|Allure report> env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} user-examples-compare: if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} - needs: [ generate-matrices, tpch-compare ] + needs: [ generate-matrices, tpch-compare, prepare_AWS_RDS_databases ] strategy: fail-fast: false @@ -660,7 +814,7 @@ jobs: env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 14 + 
DEFAULT_PG_VERSION: 16 TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} @@ -677,20 +831,15 @@ jobs: - name: Download Neon artifact uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-release-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - - name: Add Postgres binaries to PATH - run: | - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version - echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - - name: Set up Connection String id: set-up-connstr run: | case "${PLATFORM}" in - neon-captest-reuse) + neonvm-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }} ;; rds-aurora) @@ -700,23 +849,13 @@ jobs: CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_POSTGRES_CONNSTR }} ;; *) - echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'" + echo >&2 "Unknown PLATFORM=${PLATFORM}. 
Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'" exit 1 ;; esac echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERIES=("SELECT version()") - if [[ "${PLATFORM}" = "neon"* ]]; then - QUERIES+=("SHOW neon.tenant_id") - QUERIES+=("SHOW neon.timeline_id") - fi - - for q in "${QUERIES[@]}"; do - psql ${CONNSTR} -c "${q}" - done - - name: Run user examples uses: ./.github/actions/run-python-test-set with: @@ -725,12 +864,14 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_user_examples + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} - name: Create Allure report + id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate @@ -739,6 +880,10 @@ jobs: uses: slackapi/slack-github-action@v1 with: channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Periodic User example perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + slack-message: | + Periodic User example perf testing on ${{ matrix.platform }}: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + <${{ steps.create-allure-report.outputs.report-url }}|Allure report> + env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index 6e90a80ab7..ca5ff573e1 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -38,7 +38,7 @@ jobs: matrix: arch: [ x64, arm64 ] - runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} +
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} env: IMAGE_TAG: ${{ inputs.image-tag }} @@ -56,35 +56,33 @@ jobs: - uses: actions/checkout@v4 - # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings - # The default value is ~/.docker - - name: Set custom docker config directory - run: | - mkdir -p /tmp/.docker-custom - echo DOCKER_CONFIG=/tmp/.docker-custom >> $GITHUB_ENV + - uses: ./.github/actions/set-docker-config-dir + - uses: docker/setup-buildx-action@v3 + with: + cache-binary: false - - uses: docker/setup-buildx-action@v2 - - - uses: docker/login-action@v2 + - uses: docker/login-action@v3 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - uses: docker/build-push-action@v4 + - uses: docker/login-action@v3 + with: + registry: cache.neon.build + username: ${{ secrets.NEON_CI_DOCKERCACHE_USERNAME }} + password: ${{ secrets.NEON_CI_DOCKERCACHE_PASSWORD }} + + - uses: docker/build-push-action@v6 with: context: . 
provenance: false push: true pull: true file: Dockerfile.build-tools - cache-from: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }} - cache-to: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }},mode=max + cache-from: type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0},mode=max', matrix.arch) || '' }} tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }} - - name: Remove custom docker config directory - run: | - rm -rf /tmp/.docker-custom - merge-images: needs: [ build-image ] runs-on: ubuntu-22.04 diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 8c8500260c..1e7f3598c2 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -30,7 +30,7 @@ jobs: if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} uses: ./.github/workflows/check-permissions.yml with: - github-event-name: ${{ github.event_name}} + github-event-name: ${{ github.event_name }} cancel-previous-e2e-tests: needs: [ check-permissions ] @@ -48,7 +48,7 @@ jobs: tag: needs: [ check-permissions ] - runs-on: [ self-hosted, gen3, small ] + runs-on: [ self-hosted, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned outputs: build-tag: ${{steps.build-tag.outputs.tag}} @@ -90,7 +90,7 @@ jobs: check-codestyle-python: needs: [ check-permissions, build-build-tools-image ] - runs-on: [ self-hosted, gen3, small ] + runs-on: [ self-hosted, small ] container: image: ${{ needs.build-build-tools-image.outputs.image }} credentials: @@ -101,15 +101,12 @@ jobs: steps: - name: Checkout uses: actions/checkout@v4 - with: - submodules: false - fetch-depth: 1 - name: Cache poetry deps uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} 
+ key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} - name: Install Python deps run: ./scripts/pysync @@ -125,7 +122,11 @@ jobs: check-codestyle-rust: needs: [ check-permissions, build-build-tools-image ] - runs-on: [ self-hosted, gen3, small ] + strategy: + matrix: + arch: [ x64, arm64 ] + runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} + container: image: ${{ needs.build-build-tools-image.outputs.image }} credentials: @@ -138,7 +139,6 @@ jobs: uses: actions/checkout@v4 with: submodules: true - fetch-depth: 1 # Disabled for now # - name: Restore cargo deps cache @@ -149,7 +149,7 @@ jobs: # !~/.cargo/registry/src # ~/.cargo/git/ # target/ -# key: v1-${{ runner.os }}-cargo-clippy-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} +# key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-clippy-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} # Some of our rust modules use FFI and need those to be checked - name: Get postgres headers @@ -193,295 +193,40 @@ jobs: if: ${{ !cancelled() }} run: cargo deny check --hide-inclusion-graph - build-neon: - needs: [ check-permissions, tag, build-build-tools-image ] - runs-on: [ self-hosted, gen3, large ] - container: - image: ${{ needs.build-build-tools-image.outputs.image }} - credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - # Raise locked memory limit for tokio-epoll-uring. - # On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12), - # io_uring will account the memory of the CQ and SQ as locked. 
- # More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391 - options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 + build-and-test-locally: + needs: [ tag, build-build-tools-image ] strategy: fail-fast: false matrix: - build_type: [ debug, release ] - env: - BUILD_TYPE: ${{ matrix.build_type }} - GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }} - BUILD_TAG: ${{ needs.tag.outputs.build-tag }} - - steps: - - name: Fix git ownership - run: | - # Workaround for `fatal: detected dubious ownership in repository at ...` - # - # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers - # Ref https://github.com/actions/checkout/issues/785 - # - git config --global --add safe.directory ${{ github.workspace }} - git config --global --add safe.directory ${GITHUB_WORKSPACE} - for r in 14 15 16; do - git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" - git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" - done - - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: true - fetch-depth: 1 - - - name: Set pg 14 revision for caching - id: pg_v14_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT - - - name: Set pg 15 revision for caching - id: pg_v15_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT - - - name: Set pg 16 revision for caching - id: pg_v16_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT - - # Set some environment variables used by all the steps. - # - # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc. - # It also includes --features, if any - # - # CARGO_FEATURES is passed to "cargo metadata". 
It is separate from CARGO_FLAGS, - # because "cargo metadata" doesn't accept --release or --debug options - # - # We run tests with addtional features, that are turned off by default (e.g. in release builds), see - # corresponding Cargo.toml files for their descriptions. - - name: Set env variables - run: | - CARGO_FEATURES="--features testing" - if [[ $BUILD_TYPE == "debug" ]]; then - cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run" - CARGO_FLAGS="--locked" - elif [[ $BUILD_TYPE == "release" ]]; then - cov_prefix="" - CARGO_FLAGS="--locked --release" - fi - { - echo "cov_prefix=${cov_prefix}" - echo "CARGO_FEATURES=${CARGO_FEATURES}" - echo "CARGO_FLAGS=${CARGO_FLAGS}" - echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" - } >> $GITHUB_ENV - - # Disabled for now - # Don't include the ~/.cargo/registry/src directory. It contains just - # uncompressed versions of the crates in ~/.cargo/registry/cache - # directory, and it's faster to let 'cargo' to rebuild it from the - # compressed crates. 
-# - name: Cache cargo deps -# id: cache_cargo -# uses: actions/cache@v4 -# with: -# path: | -# ~/.cargo/registry/ -# !~/.cargo/registry/src -# ~/.cargo/git/ -# target/ -# # Fall back to older versions of the key, if no cache for current Cargo.lock was found -# key: | -# v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} -# v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}- - - - name: Cache postgres v14 build - id: cache_pg_14 - uses: actions/cache@v4 - with: - path: pg_install/v14 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - - - name: Cache postgres v15 build - id: cache_pg_15 - uses: actions/cache@v4 - with: - path: pg_install/v15 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - - - name: Cache postgres v16 build - id: cache_pg_16 - uses: actions/cache@v4 - with: - path: pg_install/v16 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - - - name: Build postgres v14 - if: steps.cache_pg_14.outputs.cache-hit != 'true' - run: mold -run make postgres-v14 -j$(nproc) - - - name: Build postgres v15 - if: steps.cache_pg_15.outputs.cache-hit != 'true' - run: mold -run make postgres-v15 -j$(nproc) - - - name: Build postgres v16 - if: steps.cache_pg_16.outputs.cache-hit != 'true' - run: mold -run make postgres-v16 -j$(nproc) - - - name: Build neon extensions - run: mold -run make neon-pg-ext -j$(nproc) - - - name: Build walproposer-lib - run: mold -run make walproposer-lib -j$(nproc) - - - name: Run cargo build - run: | - ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests - - # Do install *before* running rust tests because they 
might recompile the - # binaries with different features/flags. - - name: Install rust binaries - run: | - # Install target binaries - mkdir -p /tmp/neon/bin/ - binaries=$( - ${cov_prefix} cargo metadata $CARGO_FEATURES --format-version=1 --no-deps | - jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name' - ) - for bin in $binaries; do - SRC=target/$BUILD_TYPE/$bin - DST=/tmp/neon/bin/$bin - cp "$SRC" "$DST" - done - - # Install test executables and write list of all binaries (for code coverage) - if [[ $BUILD_TYPE == "debug" ]]; then - # Keep bloated coverage data files away from the rest of the artifact - mkdir -p /tmp/coverage/ - - mkdir -p /tmp/neon/test_bin/ - - test_exe_paths=$( - ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES --message-format=json --no-run | - jq -r '.executable | select(. != null)' - ) - for bin in $test_exe_paths; do - SRC=$bin - DST=/tmp/neon/test_bin/$(basename $bin) - - # We don't need debug symbols for code coverage, so strip them out to make - # the artifact smaller. 
- strip "$SRC" -o "$DST" - echo "$DST" >> /tmp/coverage/binaries.list - done - - for bin in $binaries; do - echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list - done - fi - - - name: Run rust tests - env: - NEXTEST_RETRIES: 3 - run: | - #nextest does not yet support running doctests - cargo test --doc $CARGO_FLAGS $CARGO_FEATURES - - for io_engine in std-fs tokio-epoll-uring ; do - NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES - done - - # Run separate tests for real S3 - export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty - export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests - export REMOTE_STORAGE_S3_REGION=eu-central-1 - ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_s3)' - - # Run separate tests for real Azure Blob Storage - # XXX: replace region with `eu-central-1`-like region - export ENABLE_REAL_AZURE_REMOTE_STORAGE=y - export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}" - export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}" - export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" - export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" - ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)' - - - name: Install postgres binaries - run: cp -a pg_install /tmp/neon/pg_install - - - name: Upload Neon artifact - uses: ./.github/actions/upload - with: - name: neon-${{ runner.os }}-${{ matrix.build_type }}-artifact - path: /tmp/neon - - # XXX: keep this after the binaries.list is formed, so the coverage can properly work later - - name: Merge and upload coverage data - if: matrix.build_type == 'debug' - uses: ./.github/actions/save-coverage-data - - regress-tests: - needs: [ check-permissions, build-neon, build-build-tools-image, tag ] - runs-on: [ self-hosted, gen3, large ] 
- container: - image: ${{ needs.build-build-tools-image.outputs.image }} - credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - # for changed limits, see comments on `options:` earlier in this file - options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 - strategy: - fail-fast: false - matrix: - build_type: [ debug, release ] - pg_version: [ v14, v15, v16 ] - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: true - fetch-depth: 1 - - - name: Pytest regression tests - uses: ./.github/actions/run-python-test-set - timeout-minutes: 60 - with: - build_type: ${{ matrix.build_type }} - test_selection: regress - needs_postgres_source: true - run_with_real_s3: true - real_s3_bucket: neon-github-ci-tests - real_s3_region: eu-central-1 - rerun_flaky: true - pg_version: ${{ matrix.pg_version }} - env: - TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} - CHECK_ONDISK_DATA_COMPATIBILITY: nonempty - BUILD_TAG: ${{ needs.tag.outputs.build-tag }} - PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring - PAGESERVER_GET_VECTORED_IMPL: vectored - PAGESERVER_GET_IMPL: vectored - PAGESERVER_VALIDATE_VEC_GET: true - - # Temporary disable this step until we figure out why it's so flaky - # Ref https://github.com/neondatabase/neon/issues/4540 - - name: Merge and upload coverage data - if: | - false && - matrix.build_type == 'debug' && matrix.pg_version == 'v14' - uses: ./.github/actions/save-coverage-data + arch: [ x64, arm64 ] + # Do not build or run tests in debug for release branches + build-type: ${{ fromJson((startsWith(github.ref_name, 'release') && github.event_name == 'push') && '["release"]' || '["debug", "release"]') }} + include: + - build-type: release + arch: arm64 + uses: ./.github/workflows/_build-and-test-locally.yml + with: + arch: ${{ matrix.arch }} + build-tools-image: ${{ needs.build-build-tools-image.outputs.image }} + build-tag: ${{ 
needs.tag.outputs.build-tag }} + build-type: ${{ matrix.build-type }} + # Run tests on all Postgres versions in release builds and only on the latest version in debug builds + pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16"]' || '["v16"]' }} + secrets: inherit + # Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking get-benchmarks-durations: + if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') outputs: json: ${{ steps.get-benchmark-durations.outputs.json }} needs: [ check-permissions, build-build-tools-image ] - runs-on: [ self-hosted, gen3, small ] + runs-on: [ self-hosted, small ] container: image: ${{ needs.build-build-tools-image.outputs.image }} credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init - if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') steps: - name: Checkout uses: actions/checkout@v4 @@ -490,7 +235,7 @@ jobs: uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} - name: Install Python deps run: ./scripts/pysync @@ -506,8 +251,9 @@ jobs: echo "json=$(jq --compact-output '.' 
/tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT benchmarks: - needs: [ check-permissions, build-neon, build-build-tools-image, get-benchmarks-durations ] - runs-on: [ self-hosted, gen3, small ] + if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') + needs: [ check-permissions, build-and-test-locally, build-build-tools-image, get-benchmarks-durations ] + runs-on: [ self-hosted, small ] container: image: ${{ needs.build-build-tools-image.outputs.image }} credentials: @@ -515,7 +261,6 @@ jobs: password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} # for changed limits, see comments on `options:` earlier in this file options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 - if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') strategy: fail-fast: false matrix: @@ -535,14 +280,12 @@ jobs: save_perf_report: ${{ github.ref_name == 'main' }} extra_params: --splits 5 --group ${{ matrix.pytest_split_group }} benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }} + pg_version: v16 env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}" PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring - PAGESERVER_GET_VECTORED_IMPL: vectored - PAGESERVER_GET_IMPL: vectored - PAGESERVER_VALIDATE_VEC_GET: false # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones @@ -556,19 +299,18 @@ jobs: with: channel-id: C060CNA47S9 # on-call-staging-storage-stream slack-message: | - Benchmarks failed on main: ${{ github.event.head_commit.url }} - - Allure report: ${{ needs.create-test-report.outputs.report-url }} + Benchmarks failed on main <${{ github.event.head_commit.url }}|${{ github.sha }}> + <${{ needs.create-test-report.outputs.report-url 
}}|Allure report> env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} create-test-report: - needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-build-tools-image ] + needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image, benchmarks ] if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }} outputs: report-url: ${{ steps.create-allure-report.outputs.report-url }} - runs-on: [ self-hosted, gen3, small ] + runs-on: [ self-hosted, small ] container: image: ${{ needs.build-build-tools-image.outputs.image }} credentials: @@ -614,8 +356,8 @@ jobs: }) coverage-report: - needs: [ check-permissions, regress-tests, build-build-tools-image ] - runs-on: [ self-hosted, gen3, small ] + needs: [ check-permissions, build-build-tools-image, build-and-test-locally ] + runs-on: [ self-hosted, small ] container: image: ${{ needs.build-build-tools-image.outputs.image }} credentials: @@ -639,7 +381,7 @@ jobs: - name: Get Neon artifact uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-${{ matrix.build_type }}-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-artifact path: /tmp/neon - name: Get coverage artifact @@ -729,7 +471,7 @@ jobs: matrix: arch: [ x64, arm64 ] - runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} + runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} steps: - name: Checkout @@ -738,23 +480,29 @@ jobs: submodules: true fetch-depth: 0 - # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings - # The default value is ~/.docker - - name: Set custom docker config directory - run: | - mkdir -p .docker-custom - echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV - - uses: docker/setup-buildx-action@v2 + - uses: ./.github/actions/set-docker-config-dir + - uses: 
docker/setup-buildx-action@v3 + with: + cache-binary: false - uses: docker/login-action@v3 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - uses: docker/build-push-action@v5 + - uses: docker/login-action@v3 + with: + registry: cache.neon.build + username: ${{ secrets.NEON_CI_DOCKERCACHE_USERNAME }} + password: ${{ secrets.NEON_CI_DOCKERCACHE_PASSWORD }} + + - uses: docker/build-push-action@v6 with: context: . + # ARM-specific flags are recommended for Graviton ≥ 2, these flags are also supported by Ampere Altra (Azure) + # https://github.com/aws/aws-graviton-getting-started/blob/57dc813626d0266f1cc12ef83474745bb1f31fb4/rust.md build-args: | + ADDITIONAL_RUSTFLAGS=${{ matrix.arch == 'arm64' && '-Ctarget-feature=+lse -Ctarget-cpu=neoverse-n1' || '' }} GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} BUILD_TAG=${{ needs.tag.outputs.build-tag }} TAG=${{ needs.build-build-tools-image.outputs.image-tag }} @@ -762,16 +510,11 @@ jobs: push: true pull: true file: Dockerfile - cache-from: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }} - cache-to: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }},mode=max + cache-from: type=registry,ref=cache.neon.build/neon:cache-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon:cache-{0},mode=max', matrix.arch) || '' }} tags: | neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} - - name: Remove custom docker config directory - if: always() - run: | - rm -rf .docker-custom - neon-image: needs: [ neon-image-arch, tag ] runs-on: ubuntu-22.04 @@ -807,7 +550,7 @@ jobs: version: [ v14, v15, v16 ] arch: [ x64, arm64 ] - runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} + runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} steps: - name: 
Checkout @@ -816,17 +559,13 @@ jobs: submodules: true fetch-depth: 0 - # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings - # The default value is ~/.docker - - name: Set custom docker config directory - run: | - mkdir -p .docker-custom - echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV - - uses: docker/setup-buildx-action@v2 + - uses: ./.github/actions/set-docker-config-dir + - uses: docker/setup-buildx-action@v3 with: + cache-binary: false # Disable parallelism for docker buildkit. # As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner. - config-inline: | + buildkitd-config-inline: | [worker.oci] max-parallelism = 1 @@ -841,8 +580,14 @@ jobs: username: ${{ secrets.AWS_ACCESS_KEY_DEV }} password: ${{ secrets.AWS_SECRET_KEY_DEV }} + - uses: docker/login-action@v3 + with: + registry: cache.neon.build + username: ${{ secrets.NEON_CI_DOCKERCACHE_USERNAME }} + password: ${{ secrets.NEON_CI_DOCKERCACHE_PASSWORD }} + - name: Build compute-node image - uses: docker/build-push-action@v5 + uses: docker/build-push-action@v6 with: context: . 
build-args: | @@ -854,14 +599,14 @@ jobs: push: true pull: true file: Dockerfile.compute-node - cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }} - cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max + cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }} tags: | neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} - name: Build neon extensions test image if: matrix.version == 'v16' - uses: docker/build-push-action@v5 + uses: docker/build-push-action@v6 with: context: . build-args: | @@ -874,15 +619,15 @@ jobs: pull: true file: Dockerfile.compute-node target: neon-pg-ext-test - cache-from: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }} - cache-to: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max + cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }} tags: | neondatabase/neon-test-extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }} - name: Build compute-tools image # compute-tools are Postgres independent, so build it only once if: matrix.version == 'v16' - uses: docker/build-push-action@v5 + uses: docker/build-push-action@v6 with: target: compute-tools-image context: . 
@@ -897,11 +642,6 @@ jobs: tags: | neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} - - name: Remove custom docker config directory - if: always() - run: | - rm -rf .docker-custom - compute-node-image: needs: [ compute-node-image-arch, tag ] runs-on: ubuntu-22.04 @@ -955,7 +695,7 @@ jobs: vm-compute-node-image: needs: [ check-permissions, tag, compute-node-image ] - runs-on: [ self-hosted, gen3, large ] + runs-on: [ self-hosted, large ] strategy: fail-fast: false matrix: @@ -974,13 +714,7 @@ jobs: curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder chmod +x vm-builder - # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings - # The default value is ~/.docker - - name: Set custom docker config directory - run: | - mkdir -p .docker-custom - echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV - + - uses: ./.github/actions/set-docker-config-dir - uses: docker/login-action@v3 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} @@ -1003,11 +737,6 @@ jobs: run: | docker push neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} - - name: Remove custom docker config directory - if: always() - run: | - rm -rf .docker-custom - test-images: needs: [ check-permissions, tag, neon-image, compute-node-image ] strategy: @@ -1015,7 +744,7 @@ jobs: matrix: arch: [ x64, arm64 ] - runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} + runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} steps: - name: Checkout @@ -1023,6 +752,12 @@ jobs: with: fetch-depth: 0 + - uses: ./.github/actions/set-docker-config-dir + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + # `neondatabase/neon` contains multiple binaries, all of 
them use the same input for the version into the same version formatting library. # Pick pageserver as currently the only binary with extra "version" features printed in the string to verify. # Regular pageserver version string looks like @@ -1058,6 +793,9 @@ jobs: docker compose -f ./docker-compose/docker-compose.yml down promote-images: + permissions: + contents: read # This is required for actions/checkout + id-token: write # This is required for Azure Login to work. needs: [ check-permissions, tag, test-images, vm-compute-node-image ] runs-on: ubuntu-22.04 @@ -1084,6 +822,28 @@ jobs: neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} done + - name: Azure login + if: github.ref_name == 'main' + uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1 + with: + client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }} + tenant-id: ${{ secrets.AZURE_TENANT_ID }} + subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }} + + - name: Login to ACR + if: github.ref_name == 'main' + run: | + az acr login --name=neoneastus2 + + - name: Copy docker images to ACR-dev + if: github.ref_name == 'main' + run: | + for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do + docker buildx imagetools create \ + -t neoneastus2.azurecr.io/neondatabase/${image}:${{ needs.tag.outputs.build-tag }} \ + neondatabase/${image}:${{ needs.tag.outputs.build-tag }} + done + - name: Add latest tag to images if: github.ref_name == 'main' run: | @@ -1196,10 +956,10 @@ jobs: exit 1 deploy: - needs: [ check-permissions, promote-images, tag, regress-tests, trigger-custom-extensions-build-and-wait ] + needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ] if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy' - runs-on: [ self-hosted, gen3, small ] + runs-on: [ self-hosted, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest steps: 
- name: Fix git ownership @@ -1219,7 +979,6 @@ jobs: - name: Checkout uses: actions/checkout@v4 with: - submodules: false fetch-depth: 0 - name: Trigger deploy workflow @@ -1227,9 +986,10 @@ jobs: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} run: | if [[ "$GITHUB_REF_NAME" == "main" ]]; then - gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false + gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false + gh workflow --repo neondatabase/azure run deploy.yml -f dockerTag=${{needs.tag.outputs.build-tag}} elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \ + gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \ -f deployPgSniRouter=false \ -f deployProxy=false \ -f deployStorage=true \ @@ -1239,14 +999,14 @@ jobs: -f dockerTag=${{needs.tag.outputs.build-tag}} \ -f deployPreprodRegion=true - gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \ + gh workflow --repo neondatabase/infra run deploy-prod.yml --ref main \ -f deployStorage=true \ -f deployStorageBroker=true \ -f deployStorageController=true \ -f branch=main \ -f dockerTag=${{needs.tag.outputs.build-tag}} elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then - gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \ + gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \ -f deployPgSniRouter=true \ -f deployProxy=true \ -f deployStorage=false \ @@ -1256,7 +1016,7 @@ jobs: -f dockerTag=${{needs.tag.outputs.build-tag}} \ -f deployPreprodRegion=true - gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \ + gh workflow --repo neondatabase/infra run deploy-proxy-prod.yml --ref main \ -f deployPgSniRouter=true \ -f deployProxy=true \ -f branch=main \ @@ -1296,10 +1056,10 @@ 
jobs: }) promote-compatibility-data: - needs: [ check-permissions, promote-images, tag, regress-tests ] + needs: [ check-permissions, promote-images, tag, build-and-test-locally ] if: github.ref_name == 'release' - runs-on: [ self-hosted, gen3, small ] + runs-on: [ self-hosted, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned options: --init @@ -1308,6 +1068,7 @@ jobs: env: BUCKET: neon-github-public-dev PREFIX: artifacts/latest + COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} run: | # Update compatibility snapshot for the release for pg_version in v14 v15 v16; do @@ -1321,8 +1082,8 @@ jobs: # Update Neon artifact for the release (reuse already uploaded artifact) for build_type in debug release; do - OLD_PREFIX=artifacts/${GITHUB_RUN_ID} - FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst + OLD_PREFIX=artifacts/${COMMIT_SHA}/${GITHUB_RUN_ID} + FILENAME=neon-${{ runner.os }}-${{ runner.arch }}-${build_type}-artifact.tar.zst S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true) if [ -z "${S3_KEY}" ]; then @@ -1334,9 +1095,39 @@ jobs: done pin-build-tools-image: - needs: [ build-build-tools-image, promote-images, regress-tests ] + needs: [ build-build-tools-image, promote-images, build-and-test-locally ] if: github.ref_name == 'main' uses: ./.github/workflows/pin-build-tools-image.yml with: from-tag: ${{ needs.build-build-tools-image.outputs.image-tag }} secrets: inherit + + # This job simplifies setting branch protection rules (in GitHub UI) + # by allowing to set only this job instead of listing many others. + # It also makes it easier to rename or parametrise jobs (using matrix) + # which requires changes in branch protection rules + # + # Note, that we can't add external check (like `neon-cloud-e2e`) we still need to use GitHub UI for that. 
+ # + # https://github.com/neondatabase/neon/settings/branch_protection_rules + conclusion: + if: always() + # Format `needs` differently to make the list more readable. + # Usually we do `needs: [...]` + needs: + - build-and-test-locally + - check-codestyle-python + - check-codestyle-rust + - promote-images + - test-images + - trigger-custom-extensions-build-and-wait + runs-on: ubuntu-22.04 + steps: + # The list of possible results: + # https://docs.github.com/en/actions/learn-github-actions/contexts#needs-context + - name: Fail the job if any of the dependencies do not succeed + run: exit 1 + if: | + contains(needs.*.result, 'failure') + || contains(needs.*.result, 'cancelled') + || contains(needs.*.result, 'skipped') diff --git a/.github/workflows/label-for-external-users.yml b/.github/workflows/label-for-external-users.yml new file mode 100644 index 0000000000..585d118dfb --- /dev/null +++ b/.github/workflows/label-for-external-users.yml @@ -0,0 +1,54 @@ +name: Add `external` label to issues and PRs created by external users + +on: + issues: + types: + - opened + pull_request_target: + types: + - opened + +# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. 
+permissions: {} + +env: + LABEL: external + +jobs: + check-user: + runs-on: ubuntu-22.04 + + outputs: + is-member: ${{ steps.check-user.outputs.is-member }} + + steps: + - name: Check whether `${{ github.actor }}` is a member of `${{ github.repository_owner }}` + id: check-user + env: + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + run: | + if gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" "/orgs/${GITHUB_REPOSITORY_OWNER}/members/${GITHUB_ACTOR}"; then + is_member=true + else + is_member=false + fi + + echo "is-member=${is_member}" | tee -a ${GITHUB_OUTPUT} + + add-label: + if: needs.check-user.outputs.is-member == 'false' + needs: [ check-user ] + + runs-on: ubuntu-22.04 + permissions: + pull-requests: write # for `gh pr edit` + issues: write # for `gh issue edit` + + steps: + - name: Add `${{ env.LABEL }}` label + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + ITEM_NUMBER: ${{ github.event[github.event_name == 'pull_request_target' && 'pull_request' || 'issue'].number }} + GH_CLI_COMMAND: ${{ github.event_name == 'pull_request_target' && 'pr' || 'issue' }} + run: | + gh ${GH_CLI_COMMAND} --repo ${GITHUB_REPOSITORY} edit --add-label=${LABEL} ${ITEM_NUMBER} diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 7d2187e59c..7fecdbde8c 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -56,7 +56,6 @@ jobs: uses: actions/checkout@v4 with: submodules: true - fetch-depth: 1 - name: Install macOS postgres dependencies run: brew install flex bison openssl protobuf icu4c pkg-config @@ -133,214 +132,6 @@ jobs: - name: Check that no warnings are produced run: ./run_clippy.sh - check-linux-arm-build: - needs: [ check-permissions, build-build-tools-image ] - timeout-minutes: 90 - runs-on: [ self-hosted, small-arm64 ] - - env: - # Use release build only, to have less debug info around - # Hence keeping target/ (and general cache size) smaller - 
BUILD_TYPE: release - CARGO_FEATURES: --features testing - CARGO_FLAGS: --release - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} - - container: - image: ${{ needs.build-build-tools-image.outputs.image }} - credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - options: --init - - steps: - - name: Fix git ownership - run: | - # Workaround for `fatal: detected dubious ownership in repository at ...` - # - # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers - # Ref https://github.com/actions/checkout/issues/785 - # - git config --global --add safe.directory ${{ github.workspace }} - git config --global --add safe.directory ${GITHUB_WORKSPACE} - for r in 14 15 16; do - git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" - git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" - done - - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: true - fetch-depth: 1 - - - name: Set pg 14 revision for caching - id: pg_v14_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT - - - name: Set pg 15 revision for caching - id: pg_v15_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT - - - name: Set pg 16 revision for caching - id: pg_v16_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT - - - name: Set env variables - run: | - echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV - - - name: Cache postgres v14 build - id: cache_pg_14 - uses: actions/cache@v4 - with: - path: pg_install/v14 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - - name: Cache postgres v15 build - id: cache_pg_15 - uses: actions/cache@v4 - with: - path: 
pg_install/v15 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - - name: Cache postgres v16 build - id: cache_pg_16 - uses: actions/cache@v4 - with: - path: pg_install/v16 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - - name: Build postgres v14 - if: steps.cache_pg_14.outputs.cache-hit != 'true' - run: mold -run make postgres-v14 -j$(nproc) - - - name: Build postgres v15 - if: steps.cache_pg_15.outputs.cache-hit != 'true' - run: mold -run make postgres-v15 -j$(nproc) - - - name: Build postgres v16 - if: steps.cache_pg_16.outputs.cache-hit != 'true' - run: mold -run make postgres-v16 -j$(nproc) - - - name: Build neon extensions - run: mold -run make neon-pg-ext -j$(nproc) - - - name: Build walproposer-lib - run: mold -run make walproposer-lib -j$(nproc) - - - name: Run cargo build - run: | - mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests -j$(nproc) - - - name: Run cargo test - env: - NEXTEST_RETRIES: 3 - run: | - cargo nextest run $CARGO_FEATURES -j$(nproc) - - # Run separate tests for real S3 - export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty - export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests - export REMOTE_STORAGE_S3_REGION=eu-central-1 - # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - cargo nextest run --package remote_storage --test test_real_s3 -j$(nproc) - - # Run separate tests for real Azure Blob Storage - # XXX: replace region with `eu-central-1`-like region - export ENABLE_REAL_AZURE_REMOTE_STORAGE=y - export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}" - export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}" - export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" - export REMOTE_STORAGE_AZURE_REGION="${{ 
vars.REMOTE_STORAGE_AZURE_REGION }}" - # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - cargo nextest run --package remote_storage --test test_real_azure -j$(nproc) - - check-codestyle-rust-arm: - needs: [ check-permissions, build-build-tools-image ] - timeout-minutes: 90 - runs-on: [ self-hosted, small-arm64 ] - - container: - image: ${{ needs.build-build-tools-image.outputs.image }} - credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - options: --init - - strategy: - fail-fast: false - matrix: - build_type: [ debug, release ] - - steps: - - name: Fix git ownership - run: | - # Workaround for `fatal: detected dubious ownership in repository at ...` - # - # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers - # Ref https://github.com/actions/checkout/issues/785 - # - git config --global --add safe.directory ${{ github.workspace }} - git config --global --add safe.directory ${GITHUB_WORKSPACE} - for r in 14 15 16; do - git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" - git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" - done - - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: true - fetch-depth: 1 - - # Some of our rust modules use FFI and need those to be checked - - name: Get postgres headers - run: make postgres-headers -j$(nproc) - - # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations. - # This will catch compiler & clippy warnings in all feature combinations. - # TODO: use cargo hack for build and test as well, but, that's quite expensive. 
- # NB: keep clippy args in sync with ./run_clippy.sh - - run: | - CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")" - if [ "$CLIPPY_COMMON_ARGS" = "" ]; then - echo "No clippy args found in .neon_clippy_args" - exit 1 - fi - echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV - - - name: Run cargo clippy (debug) - if: matrix.build_type == 'debug' - run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS - - name: Run cargo clippy (release) - if: matrix.build_type == 'release' - run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS - - - name: Check documentation generation - if: matrix.build_type == 'release' - run: cargo doc --workspace --no-deps --document-private-items -j$(nproc) - env: - RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links" - - # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run - - name: Check formatting - if: ${{ !cancelled() && matrix.build_type == 'release' }} - run: cargo fmt --all -- --check - - # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci - - name: Check rust dependencies - if: ${{ !cancelled() && matrix.build_type == 'release' }} - run: | - cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date - cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack - - # https://github.com/EmbarkStudios/cargo-deny - - name: Check rust licenses/bans/advisories/sources - if: ${{ !cancelled() && matrix.build_type == 'release' }} - run: cargo deny check - gather-rust-build-stats: needs: [ check-permissions, build-build-tools-image ] if: | @@ -357,8 +148,6 @@ jobs: env: BUILD_TYPE: release - # remove the cachepot wrapper and build without crate caches - RUSTC_WRAPPER: "" # build with incremental compilation produce partial results # so do not attempt to cache this build, also disable the incremental 
compilation CARGO_INCREMENTAL: 0 @@ -368,7 +157,6 @@ jobs: uses: actions/checkout@v4 with: submodules: true - fetch-depth: 1 # Some of our rust modules use FFI and need those to be checked - name: Get postgres headers @@ -378,7 +166,7 @@ jobs: run: make walproposer-lib -j$(nproc) - name: Produce the build stats - run: cargo build --all --release --timings -j$(nproc) + run: PQ_LIB_DIR=$(pwd)/pg_install/v16/lib cargo build --all --release --timings -j$(nproc) - name: Upload the build stats id: upload-stats diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml new file mode 100644 index 0000000000..615937b5a1 --- /dev/null +++ b/.github/workflows/periodic_pagebench.yml @@ -0,0 +1,155 @@ +name: Periodic pagebench performance test on dedicated EC2 machine in eu-central-1 region + +on: + schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '0 18 * * *' # Runs at 6 PM UTC every day + workflow_dispatch: # Allows manual triggering of the workflow + inputs: + commit_hash: + type: string + description: 'The long neon repo commit hash for the system under test (pageserver) to be tested.' 
+ required: false + default: '' + +defaults: + run: + shell: bash -euo pipefail {0} + +concurrency: + group: ${{ github.workflow }} + cancel-in-progress: false + +jobs: + trigger_bench_on_ec2_machine_in_eu_central_1: + runs-on: [ self-hosted, small ] + container: + image: neondatabase/build-tools:pinned + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + timeout-minutes: 360 # Set the timeout to 6 hours + env: + API_KEY: ${{ secrets.PERIODIC_PAGEBENCH_EC2_RUNNER_API_KEY }} + RUN_ID: ${{ github.run_id }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY : ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_SECRET }} + AWS_DEFAULT_REGION : "eu-central-1" + AWS_INSTANCE_ID : "i-02a59a3bf86bc7e74" + steps: + # we don't need the neon source code because we run everything remotely + # however we still need the local github actions to run the allure step below + - uses: actions/checkout@v4 + + - name: Show my own (github runner) external IP address - usefull for IP allowlisting + run: curl https://ifconfig.me + + - name: Start EC2 instance and wait for the instance to boot up + run: | + aws ec2 start-instances --instance-ids $AWS_INSTANCE_ID + aws ec2 wait instance-running --instance-ids $AWS_INSTANCE_ID + sleep 60 # sleep some time to allow cloudinit and our API server to start up + + - name: Determine public IP of the EC2 instance and set env variable EC2_MACHINE_URL_US + run: | + public_ip=$(aws ec2 describe-instances --instance-ids $AWS_INSTANCE_ID --query 'Reservations[*].Instances[*].PublicIpAddress' --output text) + echo "Public IP of the EC2 instance: $public_ip" + echo "EC2_MACHINE_URL_US=https://${public_ip}:8443" >> $GITHUB_ENV + + - name: Determine commit hash + env: + INPUT_COMMIT_HASH: ${{ github.event.inputs.commit_hash }} + run: | + if [ -z "$INPUT_COMMIT_HASH" ]; then + echo "COMMIT_HASH=$(curl -s 
https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')" >> $GITHUB_ENV + else + echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV + fi + + - name: Start Bench with run_id + run: | + curl -k -X 'POST' \ + "${EC2_MACHINE_URL_US}/start_test/${GITHUB_RUN_ID}" \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -H "Authorization: Bearer $API_KEY" \ + -d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\"}" + + - name: Poll Test Status + id: poll_step + run: | + status="" + while [[ "$status" != "failure" && "$status" != "success" ]]; do + response=$(curl -k -X 'GET' \ + "${EC2_MACHINE_URL_US}/test_status/${GITHUB_RUN_ID}" \ + -H 'accept: application/json' \ + -H "Authorization: Bearer $API_KEY") + echo "Response: $response" + set +x + status=$(echo $response | jq -r '.status') + echo "Test status: $status" + if [[ "$status" == "failure" ]]; then + echo "Test failed" + exit 1 # Fail the job step if status is failure + elif [[ "$status" == "success" || "$status" == "null" ]]; then + break + elif [[ "$status" == "too_many_runs" ]]; then + echo "Too many runs already running" + echo "too_many_runs=true" >> "$GITHUB_OUTPUT" + exit 1 + fi + + sleep 60 # Poll every 60 seconds + done + + - name: Retrieve Test Logs + if: always() && steps.poll_step.outputs.too_many_runs != 'true' + run: | + curl -k -X 'GET' \ + "${EC2_MACHINE_URL_US}/test_log/${GITHUB_RUN_ID}" \ + -H 'accept: application/gzip' \ + -H "Authorization: Bearer $API_KEY" \ + --output "test_log_${GITHUB_RUN_ID}.gz" + + - name: Unzip Test Log and Print it into this job's log + if: always() && steps.poll_step.outputs.too_many_runs != 'true' + run: | + gzip -d "test_log_${GITHUB_RUN_ID}.gz" + cat "test_log_${GITHUB_RUN_ID}" + + - name: Create Allure report + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + if: ${{ !cancelled() }} + uses: ./.github/actions/allure-report-generate + + - name: Post to a Slack 
channel + if: ${{ github.event.schedule && failure() }} + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C033QLM5P7D" # dev-staging-stream + slack-message: "Periodic pagebench testing on dedicated hardware: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + + - name: Cleanup Test Resources + if: always() + run: | + curl -k -X 'POST' \ + "${EC2_MACHINE_URL_US}/cleanup_test/${GITHUB_RUN_ID}" \ + -H 'accept: application/json' \ + -H "Authorization: Bearer $API_KEY" \ + -d '' + + - name: Stop EC2 instance and wait for the instance to be stopped + if: always() && steps.poll_step.outputs.too_many_runs != 'true' + run: | + aws ec2 stop-instances --instance-ids $AWS_INSTANCE_ID + aws ec2 wait instance-stopped --instance-ids $AWS_INSTANCE_ID diff --git a/.github/workflows/pg-clients.yml b/.github/workflows/pg-clients.yml new file mode 100644 index 0000000000..23a2e3876c --- /dev/null +++ b/.github/workflows/pg-clients.yml @@ -0,0 +1,211 @@ +name: Test Postgres client libraries + +on: + schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '23 02 * * *' # run once a day, timezone is utc + pull_request: + paths: + - '.github/workflows/pg-clients.yml' + - 'test_runner/pg_clients/**' + - 'test_runner/logical_repl/**' + - 'poetry.lock' + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref_name }} + cancel-in-progress: ${{ github.event_name == 'pull_request' }} + +defaults: + run: + shell: bash -euxo pipefail {0} + +env: + DEFAULT_PG_VERSION: 16 + PLATFORM: neon-captest-new + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ 
secrets.AWS_SECRET_KEY_DEV }} + AWS_DEFAULT_REGION: eu-central-1 + +jobs: + check-permissions: + if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} + uses: ./.github/workflows/check-permissions.yml + with: + github-event-name: ${{ github.event_name }} + + check-build-tools-image: + needs: [ check-permissions ] + uses: ./.github/workflows/check-build-tools-image.yml + + build-build-tools-image: + needs: [ check-build-tools-image ] + uses: ./.github/workflows/build-build-tools-image.yml + with: + image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }} + secrets: inherit + + test-logical-replication: + needs: [ build-build-tools-image ] + runs-on: ubuntu-22.04 + + container: + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init --user root + services: + clickhouse: + image: clickhouse/clickhouse-server:24.6.3.64 + ports: + - 9000:9000 + - 8123:8123 + zookeeper: + image: quay.io/debezium/zookeeper:2.7 + ports: + - 2181:2181 + kafka: + image: quay.io/debezium/kafka:2.7 + env: + ZOOKEEPER_CONNECT: "zookeeper:2181" + KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092 + KAFKA_BROKER_ID: 1 + KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 + KAFKA_JMX_PORT: 9991 + ports: + - 9092:9092 + debezium: + image: quay.io/debezium/connect:2.7 + env: + BOOTSTRAP_SERVERS: kafka:9092 + GROUP_ID: 1 + CONFIG_STORAGE_TOPIC: debezium-config + OFFSET_STORAGE_TOPIC: debezium-offset + STATUS_STORAGE_TOPIC: debezium-status + DEBEZIUM_CONFIG_CONNECTOR_CLASS: io.debezium.connector.postgresql.PostgresConnector + ports: + - 8083:8083 + steps: + - uses: actions/checkout@v4 + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + + - name: Create Neon Project + id: create-neon-project + uses: 
./.github/actions/neon-project-create + with: + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + postgres_version: ${{ env.DEFAULT_PG_VERSION }} + + - name: Run tests + uses: ./.github/actions/run-python-test-set + with: + build_type: remote + test_selection: logical_repl + run_in_parallel: false + extra_params: -m remote_cluster + pg_version: ${{ env.DEFAULT_PG_VERSION }} + env: + BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} + + - name: Delete Neon Project + if: always() + uses: ./.github/actions/neon-project-delete + with: + project_id: ${{ steps.create-neon-project.outputs.project_id }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Create Allure report + if: ${{ !cancelled() }} + id: create-allure-report + uses: ./.github/actions/allure-report-generate + with: + store-test-results-into-db: true + env: + REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} + + - name: Post to a Slack channel + if: github.event.schedule && failure() + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream + slack-message: | + Testing the logical replication: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|${{ job.status }}> (<${{ steps.create-allure-report.outputs.report-url }}|test report>) + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + + test-postgres-client-libs: + needs: [ build-build-tools-image ] + runs-on: ubuntu-22.04 + + container: + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init --user root + + steps: + - uses: actions/checkout@v4 + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + + - name: Create Neon Project + id: create-neon-project + uses: 
./.github/actions/neon-project-create + with: + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + postgres_version: ${{ env.DEFAULT_PG_VERSION }} + + - name: Run tests + uses: ./.github/actions/run-python-test-set + with: + build_type: remote + test_selection: pg_clients + run_in_parallel: false + extra_params: -m remote_cluster + pg_version: ${{ env.DEFAULT_PG_VERSION }} + env: + BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} + + - name: Delete Neon Project + if: always() + uses: ./.github/actions/neon-project-delete + with: + project_id: ${{ steps.create-neon-project.outputs.project_id }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Create Allure report + if: ${{ !cancelled() }} + id: create-allure-report + uses: ./.github/actions/allure-report-generate + with: + store-test-results-into-db: true + env: + REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} + + - name: Post to a Slack channel + if: github.event.schedule && failure() + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream + slack-message: | + Testing Postgres clients: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|${{ job.status }}> (<${{ steps.create-allure-report.outputs.report-url }}|test report>) + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/.github/workflows/pg_clients.yml b/.github/workflows/pg_clients.yml deleted file mode 100644 index fef3aec754..0000000000 --- a/.github/workflows/pg_clients.yml +++ /dev/null @@ -1,98 +0,0 @@ -name: Test Postgres client libraries - -on: - schedule: - # * is a special character in YAML so you have to quote this string - # ┌───────────── minute (0 - 59) - # │ ┌───────────── hour (0 - 23) - # │ │ ┌───────────── day of the month (1 - 31) - # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) - # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - - cron: '23 02 * * *' # run once a day, timezone 
is utc - - workflow_dispatch: - -concurrency: - # Allow only one workflow per any non-`main` branch. - group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} - cancel-in-progress: true - -jobs: - test-postgres-client-libs: - # TODO: switch to gen2 runner, requires docker - runs-on: ubuntu-22.04 - - env: - DEFAULT_PG_VERSION: 14 - TEST_OUTPUT: /tmp/test_output - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - uses: actions/setup-python@v4 - with: - python-version: 3.9 - - - name: Install Poetry - uses: snok/install-poetry@v1 - - - name: Cache poetry deps - uses: actions/cache@v4 - with: - path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-python-deps-ubunutu-latest-${{ hashFiles('poetry.lock') }} - - - name: Install Python deps - shell: bash -euxo pipefail {0} - run: ./scripts/pysync - - - name: Create Neon Project - id: create-neon-project - uses: ./.github/actions/neon-project-create - with: - api_key: ${{ secrets.NEON_STAGING_API_KEY }} - postgres_version: ${{ env.DEFAULT_PG_VERSION }} - - - name: Run pytest - env: - REMOTE_ENV: 1 - BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} - POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - shell: bash -euxo pipefail {0} - run: | - # Test framework expects we have psql binary; - # but since we don't really need it in this test, let's mock it - mkdir -p "$POSTGRES_DISTRIB_DIR/v${DEFAULT_PG_VERSION}/bin" && touch "$POSTGRES_DISTRIB_DIR/v${DEFAULT_PG_VERSION}/bin/psql"; - ./scripts/pytest \ - --junitxml=$TEST_OUTPUT/junit.xml \ - --tb=short \ - --verbose \ - -m "remote_cluster" \ - -rA "test_runner/pg_clients" - - - name: Delete Neon Project - if: ${{ always() }} - uses: ./.github/actions/neon-project-delete - with: - project_id: ${{ steps.create-neon-project.outputs.project_id }} - api_key: ${{ secrets.NEON_STAGING_API_KEY }} - - # We use GitHub's action upload-artifact because `ubuntu-latest` doesn't have configured AWS CLI. 
- # It will be fixed after switching to gen2 runner - - name: Upload python test logs - if: always() - uses: actions/upload-artifact@v4 - with: - retention-days: 7 - name: python-test-pg_clients-${{ runner.os }}-stage-logs - path: ${{ env.TEST_OUTPUT }} - - - name: Post to a Slack channel - if: ${{ github.event.schedule && failure() }} - uses: slackapi/slack-github-action@v1 - with: - channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Testing Postgres clients: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" - env: - SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index 024594532f..2e79498fc4 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -7,12 +7,20 @@ on: description: 'Source tag' required: true type: string + force: + description: 'Force the image to be pinned' + default: false + type: boolean workflow_call: inputs: from-tag: description: 'Source tag' required: true type: string + force: + description: 'Force the image to be pinned' + default: false + type: boolean defaults: run: @@ -22,15 +30,18 @@ concurrency: group: pin-build-tools-image-${{ inputs.from-tag }} cancel-in-progress: false +# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. permissions: {} -jobs: - tag-image: - runs-on: ubuntu-22.04 +env: + FROM_TAG: ${{ inputs.from-tag }} + TO_TAG: pinned - env: - FROM_TAG: ${{ inputs.from-tag }} - TO_TAG: pinned +jobs: + check-manifests: + runs-on: ubuntu-22.04 + outputs: + skip: ${{ steps.check-manifests.outputs.skip }} steps: - name: Check if we really need to pin the image @@ -47,27 +58,44 @@ jobs: echo "skip=${skip}" | tee -a $GITHUB_OUTPUT + tag-image: + needs: check-manifests + + # use format(..) 
to catch both inputs.force = true AND inputs.force = 'true' + if: needs.check-manifests.outputs.skip == 'false' || format('{0}', inputs.force) == 'true' + + runs-on: ubuntu-22.04 + + permissions: + id-token: write # for `azure/login` + + steps: - uses: docker/login-action@v3 - if: steps.check-manifests.outputs.skip == 'false' + with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub - if: steps.check-manifests.outputs.skip == 'false' - run: | - docker buildx imagetools create -t neondatabase/build-tools:${TO_TAG} \ - neondatabase/build-tools:${FROM_TAG} - - uses: docker/login-action@v3 - if: steps.check-manifests.outputs.skip == 'false' with: registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com username: ${{ secrets.AWS_ACCESS_KEY_DEV }} password: ${{ secrets.AWS_SECRET_KEY_DEV }} - - name: Tag build-tools with `${{ env.TO_TAG }}` in ECR - if: steps.check-manifests.outputs.skip == 'false' + - name: Azure login + uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1 + with: + client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }} + tenant-id: ${{ secrets.AZURE_TENANT_ID }} + subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }} + + - name: Login to ACR + run: | + az acr login --name=neoneastus2 + + - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub, ECR, and ACR run: | docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \ + -t neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG} \ + -t neondatabase/build-tools:${TO_TAG} \ neondatabase/build-tools:${FROM_TAG} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 90a3aaaf2d..56ef6f4bbb 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -52,13 +52,15 @@ jobs: env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} run: | + TITLE="Storage & Compute release ${RELEASE_DATE}" + cat 
<< EOF > body.md - ## Storage & Compute release ${RELEASE_DATE} + ## ${TITLE} **Please merge this Pull Request using 'Create a merge commit' button** EOF - gh pr create --title "Release ${RELEASE_DATE}" \ + gh pr create --title "${TITLE}" \ --body-file "body.md" \ --head "${RELEASE_BRANCH}" \ --base "release" @@ -91,13 +93,15 @@ jobs: env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} run: | + TITLE="Proxy release ${RELEASE_DATE}" + cat << EOF > body.md - ## Proxy release ${RELEASE_DATE} + ## ${TITLE} **Please merge this Pull Request using 'Create a merge commit' button** EOF - gh pr create --title "Proxy release ${RELEASE_DATE}" \ + gh pr create --title "${TITLE}" \ --body-file "body.md" \ --head "${RELEASE_BRANCH}" \ --base "release-proxy" diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index 77928a343e..6fbe785c56 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -13,8 +13,6 @@ defaults: env: # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} jobs: cancel-previous-e2e-tests: @@ -64,19 +62,35 @@ jobs: needs: [ tag ] runs-on: ubuntu-22.04 env: + EVENT_ACTION: ${{ github.event.action }} + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} TAG: ${{ needs.tag.outputs.build-tag }} steps: - - name: check if ecr image are present - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + - name: Wait for `promote-images` job to finish + # It's important to have a timeout here, the script in the step can run infinitely + timeout-minutes: 60 run: | - for REPO in neon compute-tools compute-node-v14 
vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do - OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text) - if [ "$OUTPUT" == "" ]; then - echo "$REPO with image tag $TAG not found" >> $GITHUB_OUTPUT - exit 1 - fi + if [ "${GITHUB_EVENT_NAME}" != "pull_request" ] || [ "${EVENT_ACTION}" != "ready_for_review" ]; then + exit 0 + fi + + # For PRs we use the run id as the tag + BUILD_AND_TEST_RUN_ID=${TAG} + while true; do + conclusion=$(gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '.jobs[] | select(.name == "promote-images") | .conclusion') + case "$conclusion" in + success) + break + ;; + failure | cancelled | skipped) + echo "The 'promote-images' job didn't succeed: '${conclusion}'. Exiting..." + exit 1 + ;; + *) + echo "The 'promote-images' hasn't succeed yet. Waiting..." + sleep 60 + ;; + esac done - name: Set e2e-platforms diff --git a/.neon_clippy_args b/.neon_clippy_args index 25e09c61a6..4db32cf35c 100644 --- a/.neon_clippy_args +++ b/.neon_clippy_args @@ -1,4 +1,5 @@ # * `-A unknown_lints` – do not warn about unknown lint suppressions # that people with newer toolchains might use # * `-D warnings` - fail on any warnings (`cargo` returns non-zero exit status) -export CLIPPY_COMMON_ARGS="--locked --workspace --all-targets -- -A unknown_lints -D warnings" +# * `-D clippy::todo` - don't let `todo!()` slip into `main` +export CLIPPY_COMMON_ARGS="--locked --workspace --all-targets -- -A unknown_lints -D warnings -D clippy::todo" diff --git a/CODEOWNERS b/CODEOWNERS index af2fa6088e..606dbb4e22 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,13 +1,13 @@ /compute_tools/ @neondatabase/control-plane @neondatabase/compute /storage_controller @neondatabase/storage /libs/pageserver_api/ @neondatabase/storage -/libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers 
+/libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage /libs/remote_storage/ @neondatabase/storage -/libs/safekeeper_api/ @neondatabase/safekeepers +/libs/safekeeper_api/ @neondatabase/storage /libs/vm_monitor/ @neondatabase/autoscaling /pageserver/ @neondatabase/storage /pgxn/ @neondatabase/compute -/pgxn/neon/ @neondatabase/compute @neondatabase/safekeepers +/pgxn/neon/ @neondatabase/compute @neondatabase/storage /proxy/ @neondatabase/proxy -/safekeeper/ @neondatabase/safekeepers +/safekeeper/ @neondatabase/storage /vendor/ @neondatabase/compute diff --git a/Cargo.lock b/Cargo.lock index cf8a0b3286..a506da8c02 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -261,15 +261,6 @@ version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c59bdb34bc650a32731b31bd8f0829cc15d24a708ee31559e0bb34f2bc320cba" -[[package]] -name = "atomic-polyfill" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c314e70d181aa6053b26e3f7fbf86d1dfff84f816a6175b967666b3506ef7289" -dependencies = [ - "critical-section", -] - [[package]] name = "atomic-take" version = "1.1.0" @@ -493,7 +484,7 @@ dependencies = [ "http 0.2.9", "http 1.1.0", "once_cell", - "p256", + "p256 0.11.1", "percent-encoding", "ring 0.17.6", "sha2", @@ -857,6 +848,12 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "349a06037c7bf932dd7e7d1f653678b2038b9ad46a74102f1fc7bd7872678cce" +[[package]] +name = "base16ct" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c7f02d4ea65f2c1853089ffd8d2787bdbc63de2f0d29dedbcf8ccdfa0ccd4cf" + [[package]] name = "base64" version = "0.13.1" @@ -980,9 +977,9 @@ checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1" [[package]] name = "bytemuck" -version = "1.16.0" +version = "1.16.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"78834c15cb5d5efe3452d58b1e8ba890dd62d21907f867f383358198e56ebca5" +checksum = "102087e286b4677862ea56cf8fc58bb2cdfa8725c40ffb80fe3a008eb7f2fc83" [[package]] name = "byteorder" @@ -1014,6 +1011,9 @@ name = "camino" version = "1.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c59e92b5a388f549b863a7bea62612c09f24c8393560709a54558a9abdfb3b9c" +dependencies = [ + "serde", +] [[package]] name = "camino-tempfile" @@ -1208,7 +1208,6 @@ dependencies = [ "serde_json", "serde_with", "utils", - "workspace_hack", ] [[package]] @@ -1233,6 +1232,7 @@ dependencies = [ "regex", "remote_storage", "reqwest 0.12.4", + "rlimit", "rust-ini", "serde", "serde_json", @@ -1243,7 +1243,7 @@ dependencies = [ "tokio-postgres", "tokio-stream", "tokio-util", - "toml_edit", + "toml_edit 0.19.10", "tracing", "tracing-opentelemetry", "tracing-subscriber", @@ -1320,7 +1320,6 @@ dependencies = [ "serde", "serde_with", "utils", - "workspace_hack", ] [[package]] @@ -1359,11 +1358,12 @@ dependencies = [ "tokio", "tokio-postgres", "tokio-util", - "toml", - "toml_edit", + "toml 0.7.4", + "toml_edit 0.19.10", "tracing", "url", "utils", + "whoami", "workspace_hack", ] @@ -1394,9 +1394,9 @@ dependencies = [ [[package]] name = "crc32c" -version = "0.6.5" +version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89254598aa9b9fa608de44b3ae54c810f0f06d755e24c50177f1f8f31ff50ce2" +checksum = "3a47af21622d091a8f0fb295b88bc886ac74efcc613efc19f5d0b21de5c89e47" dependencies = [ "rustc_version", ] @@ -1422,7 +1422,7 @@ dependencies = [ "clap", "criterion-plot", "is-terminal", - "itertools", + "itertools 0.10.5", "num-traits", "once_cell", "oorandom", @@ -1443,15 +1443,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" dependencies = [ "cast", - "itertools", + "itertools 0.10.5", ] -[[package]] -name = "critical-section" -version = "1.1.1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "6548a0ad5d2549e111e1f6a11a6c2e2d00ce6a3dafe22948d67c2b443f775e52" - [[package]] name = "crossbeam-channel" version = "0.5.8" @@ -1536,8 +1530,10 @@ version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0dc92fb57ca44df6db8059111ab3af99a63d5d0f8375d9972e319a379c6bab76" dependencies = [ + "generic-array", "rand_core 0.6.4", "subtle", + "zeroize", ] [[package]] @@ -1631,6 +1627,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fffa369a668c8af7dbf8b5e56c9f744fbd399949ed171606040001947de40b1c" dependencies = [ "const-oid", + "pem-rfc7468", "zeroize", ] @@ -1648,6 +1645,16 @@ dependencies = [ "rusticata-macros", ] +[[package]] +name = "deranged" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4" +dependencies = [ + "powerfmt", + "serde", +] + [[package]] name = "desim" version = "0.1.0" @@ -1661,17 +1668,17 @@ dependencies = [ "smallvec", "tracing", "utils", - "workspace_hack", ] [[package]] name = "diesel" -version = "2.1.4" +version = "2.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62c6fcf842f17f8c78ecf7c81d75c5ce84436b41ee07e03f490fbb5f5a8731d8" +checksum = "62d6dcd069e7b5fe49a302411f759d4cf1cf2c27fe798ef46fb8baefc053dd2b" dependencies = [ "bitflags 2.4.1", "byteorder", + "chrono", "diesel_derives", "itoa", "pq-sys", @@ -1681,11 +1688,12 @@ dependencies = [ [[package]] name = "diesel_derives" -version = "2.1.2" +version = "2.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef8337737574f55a468005a83499da720f20c65586241ffea339db9ecdfd2b44" +checksum = "59de76a222c2b8059f789cbe07afbfd8deb8c31dd0bc2a21f85e256c1def8259" dependencies = [ "diesel_table_macro_syntax", + "dsl_auto_type", "proc-macro2", "quote", "syn 2.0.52", @@ -1693,9 +1701,9 @@ 
dependencies = [ [[package]] name = "diesel_migrations" -version = "2.1.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6036b3f0120c5961381b570ee20a02432d7e2d27ea60de9578799cf9156914ac" +checksum = "8a73ce704bad4231f001bff3314d91dce4aba0770cee8b233991859abc15c1f6" dependencies = [ "diesel", "migrations_internals", @@ -1704,9 +1712,9 @@ dependencies = [ [[package]] name = "diesel_table_macro_syntax" -version = "0.1.0" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc5557efc453706fed5e4fa85006fe9817c224c3f480a34c7e5959fd700921c5" +checksum = "209c735641a413bc68c4923a9d6ad4bcb3ca306b794edaa7eb0b3228a99ffb25" dependencies = [ "syn 2.0.52", ] @@ -1718,6 +1726,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ "block-buffer", + "const-oid", "crypto-common", "subtle", ] @@ -1742,6 +1751,20 @@ dependencies = [ "const-random", ] +[[package]] +name = "dsl_auto_type" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0892a17df262a24294c382f0d5997571006e7a4348b4327557c4ff1cd4a8bccc" +dependencies = [ + "darling", + "either", + "heck 0.5.0", + "proc-macro2", + "quote", + "syn 2.0.52", +] + [[package]] name = "dyn-clone" version = "1.0.14" @@ -1755,11 +1778,25 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "413301934810f597c1d19ca71c8710e99a3f1ba28a0d2ebc01551a2daeea3c5c" dependencies = [ "der 0.6.1", - "elliptic-curve", - "rfc6979", + "elliptic-curve 0.12.3", + "rfc6979 0.3.1", "signature 1.6.4", ] +[[package]] +name = "ecdsa" +version = "0.16.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee27f32b5c5292967d2d4a9d7f1e0b0aed2c15daded5a60300e4abb9d8020bca" +dependencies = [ + "der 0.7.8", + "digest", + "elliptic-curve 0.13.8", + "rfc6979 0.4.0", + 
"signature 2.2.0", + "spki 0.7.3", +] + [[package]] name = "either" version = "1.8.1" @@ -1772,16 +1809,36 @@ version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e7bb888ab5300a19b8e5bceef25ac745ad065f3c9f7efc6de1b91958110891d3" dependencies = [ - "base16ct", + "base16ct 0.1.1", "crypto-bigint 0.4.9", "der 0.6.1", "digest", - "ff", + "ff 0.12.1", "generic-array", - "group", - "pkcs8", + "group 0.12.1", + "pkcs8 0.9.0", "rand_core 0.6.4", - "sec1", + "sec1 0.3.0", + "subtle", + "zeroize", +] + +[[package]] +name = "elliptic-curve" +version = "0.13.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e6043086bf7973472e0c7dff2142ea0b680d30e18d9cc40f267efbf222bd47" +dependencies = [ + "base16ct 0.2.0", + "crypto-bigint 0.5.5", + "digest", + "ff 0.13.0", + "generic-array", + "group 0.13.0", + "pem-rfc7468", + "pkcs8 0.10.2", + "rand_core 0.6.4", + "sec1 0.7.3", "subtle", "zeroize", ] @@ -1935,6 +1992,16 @@ dependencies = [ "subtle", ] +[[package]] +name = "ff" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ded41244b729663b1e574f1b4fb731469f69f79c17667b5d776b16cda0479449" +dependencies = [ + "rand_core 0.6.4", + "subtle", +] + [[package]] name = "filetime" version = "0.2.22" @@ -1999,16 +2066,6 @@ dependencies = [ "tokio-util", ] -[[package]] -name = "fs2" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213" -dependencies = [ - "libc", - "winapi", -] - [[package]] name = "fsevent-sys" version = "4.1.0" @@ -2128,6 +2185,12 @@ dependencies = [ "slab", ] +[[package]] +name = "gen_ops" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "304de19db7028420975a296ab0fcbbc8e69438c4ed254a1e41e2a7f37d5f0e0a" + [[package]] name = "generic-array" version = "0.14.7" @@ -2136,6 +2199,7 @@ checksum = 
"85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" dependencies = [ "typenum", "version_check", + "zeroize", ] [[package]] @@ -2202,7 +2266,18 @@ version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5dfbfb3a6cfbd390d5c9564ab283a0349b9b9fcd46a706c1eb10e0db70bfbac7" dependencies = [ - "ff", + "ff 0.12.1", + "rand_core 0.6.4", + "subtle", +] + +[[package]] +name = "group" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0f9ef7462f7c099f518d754361858f86d8a07af53ba9af0fe635bbccb151a63" +dependencies = [ + "ff 0.13.0", "rand_core 0.6.4", "subtle", ] @@ -2262,15 +2337,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "hash32" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47d60b12902ba28e2730cd37e95b8c9223af2808df9e902d4df49588d1470606" -dependencies = [ - "byteorder", -] - [[package]] name = "hashbrown" version = "0.12.3" @@ -2319,18 +2385,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "heapless" -version = "0.8.0" -source = "git+https://github.com/japaric/heapless.git?rev=644653bf3b831c6bb4963be2de24804acf5e5001#644653bf3b831c6bb4963be2de24804acf5e5001" -dependencies = [ - "atomic-polyfill", - "hash32", - "rustc_version", - "spin 0.9.8", - "stable_deref_trait", -] - [[package]] name = "heck" version = "0.4.1" @@ -2364,16 +2418,6 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6fe2267d4ed49bc07b63801559be28c718ea06c4738b7a03c94df7386d2cde46" -[[package]] -name = "histogram" -version = "0.7.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e673d137229619d5c2c8903b6ed5852b43636c0017ff2e66b1aafb8ccf04b80b" -dependencies = [ - "serde", - "thiserror", -] - [[package]] name = "hmac" version = "0.12.1" @@ -2735,17 +2779,6 @@ version = "3.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" -[[package]] -name = "io-lifetimes" -version = "1.0.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2" -dependencies = [ - "hermit-abi", - "libc", - "windows-sys 0.48.0", -] - [[package]] name = "io-uring" version = "0.6.2" @@ -2764,14 +2797,13 @@ checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3" [[package]] name = "is-terminal" -version = "0.4.7" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f" +checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b" dependencies = [ "hermit-abi", - "io-lifetimes", - "rustix 0.37.25", - "windows-sys 0.48.0", + "libc", + "windows-sys 0.52.0", ] [[package]] @@ -2783,6 +2815,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.6" @@ -2798,6 +2839,42 @@ dependencies = [ "libc", ] +[[package]] +name = "jose-b64" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bec69375368709666b21c76965ce67549f2d2db7605f1f8707d17c9656801b56" +dependencies = [ + "base64ct", + "serde", + "subtle", + "zeroize", +] + +[[package]] +name = "jose-jwa" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ab78e053fe886a351d67cf0d194c000f9d0dcb92906eb34d853d7e758a4b3a7" +dependencies = [ + "serde", +] + +[[package]] +name = "jose-jwk" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"280fa263807fe0782ecb6f2baadc28dffc04e00558a58e33bfdb801d11fd58e7" +dependencies = [ + "jose-b64", + "jose-jwa", + "p256 0.13.2", + "p384", + "rsa", + "serde", + "zeroize", +] + [[package]] name = "js-sys" version = "0.3.69" @@ -2857,6 +2934,9 @@ name = "lazy_static" version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +dependencies = [ + "spin 0.5.2", +] [[package]] name = "lazycell" @@ -2897,18 +2977,6 @@ version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" -[[package]] -name = "linux-raw-sys" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" - -[[package]] -name = "linux-raw-sys" -version = "0.3.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" - [[package]] name = "linux-raw-sys" version = "0.4.13" @@ -2990,9 +3058,9 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "measured" -version = "0.0.21" +version = "0.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5" +checksum = "3051f3a030d55d680cdef6ca50e80abd1182f8da29f2344a7c9cb575721138f0" dependencies = [ "bytes", "crossbeam-utils", @@ -3008,9 +3076,9 @@ dependencies = [ [[package]] name = "measured-derive" -version = "0.0.21" +version = "0.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ea497f33e1e856a376c32ad916f69a0bd3c597db1f912a399f842b01a4a685d" +checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94" dependencies = [ "heck 0.5.0", "proc-macro2", @@ -3020,13 +3088,13 @@ dependencies 
= [ [[package]] name = "measured-process" -version = "0.0.21" +version = "0.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b364ccb66937a814b6b2ad751d1a2f7a9d5a78c761144036825fb36bb0771000" +checksum = "7c4b80445aeb08e832d87bf1830049a924cdc1d6b7ef40b6b9b365bff17bf8ec" dependencies = [ "libc", "measured", - "procfs 0.16.0", + "procfs", ] [[package]] @@ -3071,29 +3139,28 @@ dependencies = [ "measured", "measured-process", "once_cell", - "procfs 0.14.2", + "procfs", "prometheus", "rand 0.8.5", "rand_distr", "twox-hash", - "workspace_hack", ] [[package]] name = "migrations_internals" -version = "2.1.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f23f71580015254b020e856feac3df5878c2c7a8812297edd6c0a485ac9dada" +checksum = "fd01039851e82f8799046eabbb354056283fb265c8ec0996af940f4e85a380ff" dependencies = [ "serde", - "toml", + "toml 0.8.14", ] [[package]] name = "migrations_macros" -version = "2.1.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cce3325ac70e67bbab5bd837a31cae01f1a6db64e0e744a33cb03a543469ef08" +checksum = "ffb161cc72176cb37aa47f1fc520d3ef02263d67d661f44f05d05a079e1237fd" dependencies = [ "migrations_internals", "proc-macro2", @@ -3214,16 +3281,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "nu-ansi-term" -version = "0.46.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" -dependencies = [ - "overload", - "winapi", -] - [[package]] name = "num" version = "0.4.1" @@ -3248,6 +3305,23 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-bigint-dig" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc84195820f291c7697304f3cbdadd1cb7199c0efc917ff5eafd71225c136151" +dependencies = [ + "byteorder", + "lazy_static", + "libm", + "num-integer", + "num-iter", + 
"num-traits", + "rand 0.8.5", + "smallvec", + "zeroize", +] + [[package]] name = "num-complex" version = "0.4.4" @@ -3257,6 +3331,12 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-conv" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" + [[package]] name = "num-integer" version = "0.1.45" @@ -3513,23 +3593,39 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a" -[[package]] -name = "overload" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" - [[package]] name = "p256" version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51f44edd08f51e2ade572f141051021c5af22677e42b7dd28a88155151c33594" dependencies = [ - "ecdsa", - "elliptic-curve", + "ecdsa 0.14.8", + "elliptic-curve 0.12.3", "sha2", ] +[[package]] +name = "p256" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c9863ad85fa8f4460f9c48cb909d38a0d689dba1f6f6988a5e3e0d31071bcd4b" +dependencies = [ + "ecdsa 0.16.9", + "elliptic-curve 0.13.8", + "primeorder", + "sha2", +] + +[[package]] +name = "p384" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70786f51bcc69f6a4c0360e063a4cac5419ef7c5cd5b3c99ad70f3be5ba79209" +dependencies = [ + "elliptic-curve 0.13.8", + "primeorder", +] + [[package]] name = "pagebench" version = "0.1.0" @@ -3573,7 +3669,7 @@ dependencies = [ "thiserror", "tokio", "tokio-util", - "toml_edit", + "toml_edit 0.19.10", "utils", "workspace_hack", ] @@ -3610,7 +3706,7 @@ dependencies = [ "humantime", "humantime-serde", "hyper 0.14.26", - "itertools", + "itertools 0.10.5", "leaky-bucket", "md5", "metrics", @@ -3628,8 
+3724,9 @@ dependencies = [ "postgres_connection", "postgres_ffi", "pq_proto", - "procfs 0.14.2", + "procfs", "rand 0.8.5", + "range-set-blaze", "regex", "remote_storage", "reqwest 0.12.4", @@ -3649,6 +3746,7 @@ dependencies = [ "sysinfo", "tenant_size_model", "thiserror", + "tikv-jemallocator", "tokio", "tokio-epoll-uring", "tokio-io-timeout", @@ -3656,7 +3754,7 @@ dependencies = [ "tokio-stream", "tokio-tar", "tokio-util", - "toml_edit", + "toml_edit 0.19.10", "tracing", "twox-hash", "url", @@ -3679,7 +3777,7 @@ dependencies = [ "hex", "humantime", "humantime-serde", - "itertools", + "itertools 0.10.5", "postgres_ffi", "rand 0.8.5", "serde", @@ -3689,7 +3787,6 @@ dependencies = [ "strum_macros", "thiserror", "utils", - "workspace_hack", ] [[package]] @@ -3737,7 +3834,7 @@ dependencies = [ "hex-literal", "humantime", "humantime-serde", - "itertools", + "itertools 0.10.5", "metrics", "once_cell", "pageserver_api", @@ -3889,6 +3986,15 @@ dependencies = [ "serde", ] +[[package]] +name = "pem-rfc7468" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412" +dependencies = [ + "base64ct", +] + [[package]] name = "percent-encoding" version = "2.2.0" @@ -3955,6 +4061,17 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "pkcs1" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8ffb9f10fa047879315e6625af03c164b16962a5368d724ed16323b68ace47f" +dependencies = [ + "der 0.7.8", + "pkcs8 0.10.2", + "spki 0.7.3", +] + [[package]] name = "pkcs8" version = "0.9.0" @@ -3965,6 +4082,16 @@ dependencies = [ "spki 0.6.0", ] +[[package]] +name = "pkcs8" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7" +dependencies = [ + "der 0.7.8", + "spki 0.7.3", +] + [[package]] name = "pkg-config" version = "0.3.27" @@ -4059,8 +4186,8 @@ dependencies = [ "tokio-postgres", "tokio-postgres-rustls", "tokio-rustls 0.25.0", + "tokio-util", "tracing", - "workspace_hack", ] [[package]] @@ -4068,12 +4195,11 @@ name = "postgres_connection" version = "0.1.0" dependencies = [ "anyhow", - "itertools", + "itertools 0.10.5", "once_cell", "postgres", "tokio-postgres", "url", - "workspace_hack", ] [[package]] @@ -4096,9 +4222,14 @@ dependencies = [ "serde", "thiserror", "utils", - "workspace_hack", ] +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + [[package]] name = "ppv-lite86" version = "0.2.17" @@ -4120,7 +4251,7 @@ version = "0.1.0" dependencies = [ "byteorder", "bytes", - "itertools", + "itertools 0.10.5", "pin-project-lite", "postgres-protocol", "rand 0.8.5", @@ -4128,7 +4259,6 @@ dependencies = [ "thiserror", "tokio", "tracing", - "workspace_hack", ] [[package]] @@ -4151,6 +4281,15 @@ dependencies = [ "syn 2.0.52", ] +[[package]] +name = "primeorder" +version = "0.13.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "353e1ca18966c16d9deb1c69278edbc5f194139612772bd9537af60ac231e1e6" +dependencies = [ + "elliptic-curve 0.13.8", +] + [[package]] name = "proc-macro-hack" version = "0.5.20+deprecated" @@ -4166,21 +4305,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "procfs" -version = "0.14.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1de8dacb0873f77e6aefc6d71e044761fcc68060290f5b1089fcdf84626bb69" -dependencies = [ - "bitflags 1.3.2", - "byteorder", - "chrono", - "flate2", - "hex", - "lazy_static", - "rustix 0.36.16", -] - [[package]] name = "procfs" version = "0.16.0" @@ -4188,10 +4312,12 
@@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4" dependencies = [ "bitflags 2.4.1", + "chrono", + "flate2", "hex", "lazy_static", "procfs-core", - "rustix 0.38.28", + "rustix", ] [[package]] @@ -4201,14 +4327,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29" dependencies = [ "bitflags 2.4.1", + "chrono", "hex", ] [[package]] name = "prometheus" -version = "0.13.3" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "449811d15fbdf5ceb5c1144416066429cf82316e2ec8ce0c1f6f8a02e7bbcf8c" +checksum = "3d33c28a30771f7f96db69893f78b857f7450d7e0237e9c8fc6427a81bae7ed1" dependencies = [ "cfg-if", "fnv", @@ -4216,7 +4343,7 @@ dependencies = [ "libc", "memchr", "parking_lot 0.12.1", - "procfs 0.14.2", + "procfs", "thiserror", ] @@ -4238,7 +4365,7 @@ checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270" dependencies = [ "bytes", "heck 0.4.1", - "itertools", + "itertools 0.10.5", "lazy_static", "log", "multimap", @@ -4259,7 +4386,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5d2d8d10f3c6ded6da8b05b5fb3b8a5082514344d56c9f871412d29b4e075b4" dependencies = [ "anyhow", - "itertools", + "itertools 0.10.5", "proc-macro2", "quote", "syn 1.0.109", @@ -4280,6 +4407,7 @@ version = "0.1.0" dependencies = [ "ahash", "anyhow", + "arc-swap", "async-compression", "async-trait", "atomic-take", @@ -4297,6 +4425,7 @@ dependencies = [ "consumption_metrics", "crossbeam-deque", "dashmap", + "ecdsa 0.16.9", "env_logger", "fallible-iterator", "framed-websockets", @@ -4316,13 +4445,16 @@ dependencies = [ "hyper-util", "indexmap 2.0.1", "ipnet", - "itertools", + "itertools 0.10.5", + "jose-jwa", + "jose-jwk", "lasso", "md5", "measured", "metrics", "once_cell", "opentelemetry", + "p256 0.13.2", "parking_lot 
0.12.1", "parquet", "parquet_derive", @@ -4343,6 +4475,7 @@ dependencies = [ "reqwest-retry", "reqwest-tracing", "routerify", + "rsa", "rstest", "rustc-hash", "rustls 0.22.4", @@ -4352,6 +4485,7 @@ dependencies = [ "serde", "serde_json", "sha2", + "signature 2.2.0", "smallvec", "smol_str", "socket2 0.5.5", @@ -4371,6 +4505,8 @@ dependencies = [ "tracing-opentelemetry", "tracing-subscriber", "tracing-utils", + "try-lock", + "typed-json", "url", "urlencoding", "utils", @@ -4491,6 +4627,18 @@ dependencies = [ "rand_core 0.5.1", ] +[[package]] +name = "range-set-blaze" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8421b5d459262eabbe49048d362897ff3e3830b44eac6cfe341d6acb2f0f13d2" +dependencies = [ + "gen_ops", + "itertools 0.12.1", + "num-integer", + "num-traits", +] + [[package]] name = "rayon" version = "1.7.0" @@ -4569,6 +4717,15 @@ dependencies = [ "bitflags 1.3.2", ] +[[package]] +name = "redox_syscall" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" +dependencies = [ + "bitflags 1.3.2", +] + [[package]] name = "regex" version = "1.10.2" @@ -4630,6 +4787,7 @@ name = "remote_storage" version = "0.1.0" dependencies = [ "anyhow", + "async-stream", "async-trait", "aws-config", "aws-credential-types", @@ -4647,8 +4805,9 @@ dependencies = [ "futures-util", "http-types", "humantime", + "humantime-serde", "hyper 0.14.26", - "itertools", + "itertools 0.10.5", "metrics", "once_cell", "pin-project-lite", @@ -4661,10 +4820,9 @@ dependencies = [ "tokio", "tokio-stream", "tokio-util", - "toml_edit", + "toml_edit 0.19.10", "tracing", "utils", - "workspace_hack", ] [[package]] @@ -4829,6 +4987,16 @@ dependencies = [ "zeroize", ] +[[package]] +name = "rfc6979" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f8dd2a808d456c4a54e300a23e9f5a67e122c3024119acbfd73e3bf664491cb2" +dependencies = [ + "hmac", + "subtle", +] + [[package]] name = "ring" version = "0.16.20" @@ -4858,6 +5026,15 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "rlimit" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3560f70f30a0f16d11d01ed078a07740fe6b489667abc7c7b029155d9f21c3d8" +dependencies = [ + "libc", +] + [[package]] name = "routerify" version = "3.0.0" @@ -4880,6 +5057,26 @@ dependencies = [ "archery", ] +[[package]] +name = "rsa" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d0e5124fcb30e76a7e79bfee683a2746db83784b86289f6251b54b7950a0dfc" +dependencies = [ + "const-oid", + "digest", + "num-bigint-dig", + "num-integer", + "num-traits", + "pkcs1", + "pkcs8 0.10.2", + "rand_core 0.6.4", + "signature 2.2.0", + "spki 0.7.3", + "subtle", + "zeroize", +] + [[package]] name = "rstest" version = "0.18.2" @@ -4949,34 +5146,6 @@ dependencies = [ "nom", ] -[[package]] -name = "rustix" -version = "0.36.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6da3636faa25820d8648e0e31c5d519bbb01f72fdf57131f0f5f7da5fed36eab" -dependencies = [ - "bitflags 1.3.2", - "errno", - "io-lifetimes", - "libc", - "linux-raw-sys 0.1.4", - "windows-sys 0.45.0", -] - -[[package]] -name = "rustix" -version = "0.37.25" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4eb579851244c2c03e7c24f501c3432bed80b8f720af1d6e5b0e0f01555a035" -dependencies = [ - "bitflags 1.3.2", - "errno", - "io-lifetimes", - "libc", - "linux-raw-sys 0.3.8", - "windows-sys 0.48.0", -] - [[package]] name = "rustix" version = "0.38.28" @@ -5126,7 +5295,6 @@ dependencies = [ "crc32c", "desim", "fail", - "fs2", "futures", "git-version", "hex", @@ -5153,6 +5321,8 @@ dependencies = [ "sha2", "signal-hook", "storage_broker", + "strum", + "strum_macros", "thiserror", "tokio", 
"tokio-io-timeout", @@ -5160,7 +5330,7 @@ dependencies = [ "tokio-stream", "tokio-tar", "tokio-util", - "toml_edit", + "toml_edit 0.19.10", "tracing", "tracing-subscriber", "url", @@ -5177,7 +5347,6 @@ dependencies = [ "serde", "serde_with", "utils", - "workspace_hack", ] [[package]] @@ -5235,10 +5404,24 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3be24c1842290c45df0a7bf069e0c268a747ad05a192f2fd7dcfdbc1cba40928" dependencies = [ - "base16ct", + "base16ct 0.1.1", "der 0.6.1", "generic-array", - "pkcs8", + "pkcs8 0.9.0", + "subtle", + "zeroize", +] + +[[package]] +name = "sec1" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3e97a565f76233a6003f9f5c54be1d9c5bdfa3eccfb189469f11ec4901c47dc" +dependencies = [ + "base16ct 0.2.0", + "der 0.7.8", + "generic-array", + "pkcs8 0.10.2", "subtle", "zeroize", ] @@ -5377,9 +5560,9 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.183" +version = "1.0.203" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32ac8da02677876d532745a130fc9d8e6edfa81a269b107c5b00829b91d8eb3c" +checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094" dependencies = [ "serde_derive", ] @@ -5396,9 +5579,9 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.183" +version = "1.0.203" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816" +checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba" dependencies = [ "proc-macro2", "quote", @@ -5439,9 +5622,9 @@ dependencies = [ [[package]] name = "serde_spanned" -version = "0.6.2" +version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93107647184f6027e3b7dcb2e11034cf95ffa1e3a682c67951963ac69c1c007d" +checksum = 
"79e674e01f999af37c49f70a6ede167a8a60b2503e56c5599532a65baa5969a0" dependencies = [ "serde", ] @@ -5585,6 +5768,7 @@ version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" dependencies = [ + "digest", "rand_core 0.6.4", ] @@ -5661,9 +5845,6 @@ name = "spin" version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" -dependencies = [ - "lock_api", -] [[package]] name = "spki" @@ -5685,12 +5866,6 @@ dependencies = [ "der 0.7.8", ] -[[package]] -name = "stable_deref_trait" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" - [[package]] name = "static_assertions" version = "1.1.0" @@ -5733,6 +5908,7 @@ dependencies = [ "aws-config", "bytes", "camino", + "chrono", "clap", "control_plane", "diesel", @@ -5743,7 +5919,7 @@ dependencies = [ "hex", "humantime", "hyper 0.14.26", - "itertools", + "itertools 0.10.5", "lasso", "measured", "metrics", @@ -5752,6 +5928,7 @@ dependencies = [ "pageserver_client", "postgres_connection", "r2d2", + "rand 0.8.5", "reqwest 0.12.4", "routerify", "scopeguard", @@ -5767,6 +5944,28 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "storage_controller_client" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-trait", + "bytes", + "futures", + "pageserver_api", + "pageserver_client", + "postgres", + "reqwest 0.12.4", + "serde", + "thiserror", + "tokio", + "tokio-postgres", + "tokio-stream", + "tokio-util", + "utils", + "workspace_hack", +] + [[package]] name = "storage_scrubber" version = "0.1.0" @@ -5785,10 +5984,10 @@ dependencies = [ "either", "futures", "futures-util", + "git-version", "hex", - "histogram", "humantime", - "itertools", + "itertools 0.10.5", "once_cell", "pageserver", "pageserver_api", @@ 
-5801,6 +6000,7 @@ dependencies = [ "serde", "serde_json", "serde_with", + "storage_controller_client", "thiserror", "tokio", "tokio-postgres", @@ -5830,6 +6030,7 @@ dependencies = [ "reqwest 0.12.4", "serde", "serde_json", + "storage_controller_client", "thiserror", "tokio", "tracing", @@ -5963,15 +6164,15 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.5.0" +version = "3.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9fbec84f381d5795b08656e4912bec604d162bff9291d6189a78f4c8ab87998" +checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa" dependencies = [ "cfg-if", - "fastrand 1.9.0", - "redox_syscall 0.3.5", - "rustix 0.37.25", - "windows-sys 0.45.0", + "fastrand 2.0.0", + "redox_syscall 0.4.1", + "rustix", + "windows-sys 0.52.0", ] [[package]] @@ -5981,7 +6182,6 @@ dependencies = [ "anyhow", "serde", "serde_json", - "workspace_hack", ] [[package]] @@ -6088,12 +6288,15 @@ dependencies = [ [[package]] name = "time" -version = "0.3.21" +version = "0.3.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f3403384eaacbca9923fa06940178ac13e4edb725486d70e8e15881d0c836cc" +checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885" dependencies = [ + "deranged", "itoa", "js-sys", + "num-conv", + "powerfmt", "serde", "time-core", "time-macros", @@ -6101,16 +6304,17 @@ dependencies = [ [[package]] name = "time-core" -version = "0.1.1" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb" +checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" [[package]] name = "time-macros" -version = "0.2.9" +version = "0.2.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "372950940a5f07bf38dbe211d7283c9e6d7327df53794992d293e534c733d09b" +checksum = 
"3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf" dependencies = [ + "num-conv", "time-core", ] @@ -6326,14 +6530,26 @@ dependencies = [ "serde", "serde_spanned", "toml_datetime", - "toml_edit", + "toml_edit 0.19.10", +] + +[[package]] +name = "toml" +version = "0.8.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f49eb2ab21d2f26bd6db7bf383edc527a7ebaee412d17af4d40fdccd442f335" +dependencies = [ + "serde", + "serde_spanned", + "toml_datetime", + "toml_edit 0.22.14", ] [[package]] name = "toml_datetime" -version = "0.6.2" +version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a76a9312f5ba4c2dec6b9161fdf25d87ad8a09256ccea5a556fef03c706a10f" +checksum = "4badfd56924ae69bcc9039335b2e017639ce3f9b001c393c1b2d1ef846ce2cbf" dependencies = [ "serde", ] @@ -6348,7 +6564,20 @@ dependencies = [ "serde", "serde_spanned", "toml_datetime", - "winnow", + "winnow 0.4.6", +] + +[[package]] +name = "toml_edit" +version = "0.22.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f21c7aaf97f1bd9ca9d4f9e73b0a6c74bd5afef56f2bc931943a6e1c37e04e38" +dependencies = [ + "indexmap 2.0.1", + "serde", + "serde_spanned", + "toml_datetime", + "winnow 0.6.13", ] [[package]] @@ -6428,17 +6657,6 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" -[[package]] -name = "trace" -version = "0.1.0" -dependencies = [ - "anyhow", - "clap", - "pageserver_api", - "utils", - "workspace_hack", -] - [[package]] name = "tracing" version = "0.1.37" @@ -6538,7 +6756,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77" dependencies = [ "matchers", - "nu-ansi-term", "once_cell", "regex", "serde", @@ -6565,14 +6782,13 @@ dependencies = [ "tracing", "tracing-opentelemetry", 
"tracing-subscriber", - "workspace_hack", ] [[package]] name = "try-lock" -version = "0.2.4" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" [[package]] name = "tungstenite" @@ -6603,6 +6819,16 @@ dependencies = [ "static_assertions", ] +[[package]] +name = "typed-json" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6024a8d0025400b3f6b189366e9aa92012cf9c4fe1cd2620848dd61425c49eed" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "typenum" version = "1.16.0" @@ -6737,7 +6963,6 @@ dependencies = [ "criterion", "fail", "futures", - "heapless", "hex", "hex-literal", "humantime", @@ -6767,13 +6992,13 @@ dependencies = [ "tokio-stream", "tokio-tar", "tokio-util", + "toml_edit 0.19.10", "tracing", "tracing-error", "tracing-subscriber", "url", "uuid", "walkdir", - "workspace_hack", ] [[package]] @@ -6852,7 +7077,6 @@ dependencies = [ "postgres_ffi", "regex", "utils", - "workspace_hack", ] [[package]] @@ -6873,7 +7097,6 @@ dependencies = [ "bindgen", "postgres_ffi", "utils", - "workspace_hack", ] [[package]] @@ -6898,6 +7121,12 @@ version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +[[package]] +name = "wasite" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b" + [[package]] name = "wasm-bindgen" version = "0.2.92" @@ -7050,6 +7279,17 @@ dependencies = [ "once_cell", ] +[[package]] +name = "whoami" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a44ab49fad634e88f55bf8f9bb3abd2f27d7204172a112c7c9987e01c1c94ea9" +dependencies 
= [ + "redox_syscall 0.4.1", + "wasite", + "web-sys", +] + [[package]] name = "winapi" version = "0.3.9" @@ -7124,15 +7364,6 @@ dependencies = [ "windows_x86_64_msvc 0.42.2", ] -[[package]] -name = "windows-sys" -version = "0.45.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" -dependencies = [ - "windows-targets 0.42.2", -] - [[package]] name = "windows-sys" version = "0.48.0" @@ -7151,21 +7382,6 @@ dependencies = [ "windows-targets 0.52.4", ] -[[package]] -name = "windows-targets" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" -dependencies = [ - "windows_aarch64_gnullvm 0.42.2", - "windows_aarch64_msvc 0.42.2", - "windows_i686_gnu 0.42.2", - "windows_i686_msvc 0.42.2", - "windows_x86_64_gnu 0.42.2", - "windows_x86_64_gnullvm 0.42.2", - "windows_x86_64_msvc 0.42.2", -] - [[package]] name = "windows-targets" version = "0.48.0" @@ -7331,6 +7547,15 @@ dependencies = [ "memchr", ] +[[package]] +name = "winnow" +version = "0.6.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59b5e5f6c299a3c7890b876a2a587f3115162487e704907d9b6cd29473052ba1" +dependencies = [ + "memchr", +] + [[package]] name = "winreg" version = "0.50.0" @@ -7367,26 +7592,31 @@ dependencies = [ "base64 0.21.1", "base64ct", "bytes", + "camino", "cc", "chrono", "clap", "clap_builder", "crossbeam-utils", + "crypto-bigint 0.5.5", + "der 0.7.8", + "deranged", + "digest", "either", "fail", "futures-channel", - "futures-core", "futures-executor", "futures-io", - "futures-sink", "futures-util", + "generic-array", "getrandom 0.2.11", "hashbrown 0.14.5", "hex", "hmac", "hyper 0.14.26", "indexmap 1.9.3", - "itertools", + "itertools 0.10.5", + "lazy_static", "libc", "log", "memchr", @@ -7396,7 +7626,9 @@ dependencies = [ "num-traits", "once_cell", "parquet", + 
"proc-macro2", "prost", + "quote", "rand 0.8.5", "regex", "regex-automata 0.4.3", @@ -7408,18 +7640,19 @@ dependencies = [ "serde", "serde_json", "sha2", + "signature 2.2.0", "smallvec", + "spki 0.7.3", "subtle", "syn 1.0.109", "syn 2.0.52", "sync_wrapper", + "tikv-jemalloc-sys", "time", "time-macros", "tokio", "tokio-rustls 0.24.0", "tokio-util", - "toml_datetime", - "toml_edit", "tonic", "tower", "tracing", @@ -7518,6 +7751,7 @@ version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d" dependencies = [ + "serde", "zeroize_derive", ] diff --git a/Cargo.toml b/Cargo.toml index 8fddaaef12..963841e340 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,9 +13,9 @@ members = [ "safekeeper", "storage_broker", "storage_controller", + "storage_controller/client", "storage_scrubber", "workspace_hack", - "trace", "libs/compute_api", "libs/pageserver_api", "libs/postgres_ffi", @@ -84,7 +84,6 @@ enumset = "1.0.12" fail = "0.5.0" fallible-iterator = "0.2" framed-websockets = { version = "0.1.0", git = "https://github.com/neondatabase/framed-websockets" } -fs2 = "0.4.3" futures = "0.3" futures-core = "0.3" futures-util = "0.3" @@ -111,8 +110,8 @@ lasso = "0.7" leaky-bucket = "1.0.1" libc = "0.2" md5 = "0.7.0" -measured = { version = "0.0.21", features=["lasso"] } -measured-process = { version = "0.0.21" } +measured = { version = "0.0.22", features=["lasso"] } +measured-process = { version = "0.0.22" } memoffset = "0.8" nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] } notify = "6.0.0" @@ -127,7 +126,7 @@ parquet = { version = "51.0.0", default-features = false, features = ["zstd"] } parquet_derive = "51.0.0" pbkdf2 = { version = "0.12.1", features = ["simple", "std"] } pin-project-lite = "0.2" -procfs = "0.14" +procfs = "0.16" prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency prost = 
"0.11" rand = "0.8" @@ -184,14 +183,17 @@ tower-service = "0.3.2" tracing = "0.1" tracing-error = "0.2.0" tracing-opentelemetry = "0.21.0" -tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] } +tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } +try-lock = "0.2.5" twox-hash = { version = "1.6.3", default-features = false } +typed-json = "0.1" url = "2.2" urlencoding = "2.1" uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] } walkdir = "2.3.2" rustls-native-certs = "0.7" x509-parser = "0.15" +whoami = "1.5.1" ## TODO replace this with tracing env_logger = "0.10" @@ -203,9 +205,6 @@ postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } -## Other git libraries -heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending - ## Local libraries compute_api = { version = "0.1", path = "./libs/compute_api/" } consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" } @@ -221,6 +220,7 @@ remote_storage = { version = "0.1", path = "./libs/remote_storage/" } safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" } desim = { version = "0.1", path = "./libs/desim" } storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy. 
+storage_controller_client = { path = "./storage_controller/client" } tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" } tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" } utils = { version = "0.1", path = "./libs/utils/" } diff --git a/Dockerfile b/Dockerfile index b4900d4a94..d3d12330c6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -17,7 +17,7 @@ COPY --chown=nonroot pgxn pgxn COPY --chown=nonroot Makefile Makefile COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh -ENV BUILD_TYPE release +ENV BUILD_TYPE=release RUN set -e \ && mold -run make -j $(nproc) -s neon-pg-ext \ && rm -rf pg_install/build \ @@ -29,25 +29,15 @@ WORKDIR /home/nonroot ARG GIT_VERSION=local ARG BUILD_TAG -# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds. -# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations. -# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build -ARG RUSTC_WRAPPER=cachepot -ENV AWS_REGION=eu-central-1 -ENV CACHEPOT_S3_KEY_PREFIX=cachepot -ARG CACHEPOT_BUCKET=neon-github-dev -#ARG AWS_ACCESS_KEY_ID -#ARG AWS_SECRET_ACCESS_KEY - COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server +COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib COPY --chown=nonroot . . -# Show build caching stats to check if it was used in the end. -# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats. 
+ARG ADDITIONAL_RUSTFLAGS RUN set -e \ - && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \ + && PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment ${ADDITIONAL_RUSTFLAGS}" cargo build \ --bin pg_sni_router \ --bin pageserver \ --bin pagectl \ @@ -56,8 +46,8 @@ RUN set -e \ --bin storage_controller \ --bin proxy \ --bin neon_local \ - --locked --release \ - && cachepot -s + --bin storage_scrubber \ + --locked --release # Build final image # @@ -82,6 +72,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_controller /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubber /usr/local/bin COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/ COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/ @@ -90,20 +81,24 @@ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/ # By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config. # Now, when `docker run ... pageserver` is run, it can start without errors, yet will have some default dummy values. 
-RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \ - && /usr/local/bin/pageserver -D /data/.neon/ --init \ - -c "id=1234" \ - -c "broker_endpoint='http://storage_broker:50051'" \ - -c "pg_distrib_dir='/usr/local/'" \ - -c "listen_pg_addr='0.0.0.0:6400'" \ - -c "listen_http_addr='0.0.0.0:9898'" +RUN mkdir -p /data/.neon/ && \ + echo "id=1234" > "/data/.neon/identity.toml" && \ + echo "broker_endpoint='http://storage_broker:50051'\n" \ + "pg_distrib_dir='/usr/local/'\n" \ + "listen_pg_addr='0.0.0.0:6400'\n" \ + "listen_http_addr='0.0.0.0:9898'\n" \ + > /data/.neon/pageserver.toml && \ + chown -R neon:neon /data/.neon # When running a binary that links with libpq, default to using our most recent postgres version. Binaries # that want a particular postgres version will select it explicitly: this is just a default. -ENV LD_LIBRARY_PATH /usr/local/v16/lib +ENV LD_LIBRARY_PATH=/usr/local/v16/lib VOLUME ["/data"] USER neon EXPOSE 6400 EXPOSE 9898 + +CMD ["/usr/local/bin/pageserver", "-D", "/data/.neon"] + diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index 5dd2c13c0e..d6beb61369 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -1,5 +1,13 @@ FROM debian:bullseye-slim +# Use ARG as a build-time environment variable here to allow. +# It's not supposed to be set outside. +# Alternatively it can be obtained using the following command +# ``` +# . 
/etc/os-release && echo "${VERSION_CODENAME}" +# ``` +ARG DEBIAN_VERSION_CODENAME=bullseye + # Add nonroot user RUN useradd -ms /bin/bash nonroot -b /home SHELL ["/bin/bash", "-c"] @@ -26,7 +34,6 @@ RUN set -e \ liblzma-dev \ libncurses5-dev \ libncursesw5-dev \ - libpq-dev \ libreadline-dev \ libseccomp-dev \ libsqlite3-dev \ @@ -51,7 +58,7 @@ RUN set -e \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* # protobuf-compiler (protoc) -ENV PROTOC_VERSION 25.1 +ENV PROTOC_VERSION=25.1 RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \ && unzip -q protoc.zip -d protoc \ && mv protoc/bin/protoc /usr/local/bin/protoc \ @@ -67,19 +74,24 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/ # LLVM ENV LLVM_VERSION=18 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ - && echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ + && echo "deb http://apt.llvm.org/${DEBIAN_VERSION_CODENAME}/ llvm-toolchain-${DEBIAN_VERSION_CODENAME}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ && apt update \ && apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \ && bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* -# PostgreSQL 14 -RUN curl -fsSL 'https://www.postgresql.org/media/keys/ACCC4CF8.asc' | apt-key add - \ - && echo 'deb http://apt.postgresql.org/pub/repos/apt bullseye-pgdg main' > /etc/apt/sources.list.d/pgdg.list \ +# Install docker +RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg \ + && echo "deb [arch=$(dpkg --print-architecture) 
signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian ${DEBIAN_VERSION_CODENAME} stable" > /etc/apt/sources.list.d/docker.list \ && apt update \ - && apt install -y postgresql-client-14 \ + && apt install -y docker-ce docker-ce-cli \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* +# Configure sudo & docker +RUN usermod -aG sudo nonroot && \ + echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers && \ + usermod -aG docker nonroot + # AWS CLI RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \ && unzip -q awscliv2.zip \ @@ -87,7 +99,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws && rm awscliv2.zip # Mold: A Modern Linker -ENV MOLD_VERSION v2.31.0 +ENV MOLD_VERSION=v2.33.0 RUN set -e \ && git clone https://github.com/rui314/mold.git \ && mkdir mold/build \ @@ -113,10 +125,10 @@ RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JS && rm -rf ../lcov.tar.gz # Compile and install the static OpenSSL library -ENV OPENSSL_VERSION=3.2.2 +ENV OPENSSL_VERSION=1.1.1w ENV OPENSSL_PREFIX=/usr/local/openssl RUN wget -O /tmp/openssl-${OPENSSL_VERSION}.tar.gz https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz && \ - echo "197149c18d9e9f292c43f0400acaba12e5f52cacfe050f3d199277ea738ec2e7 /tmp/openssl-${OPENSSL_VERSION}.tar.gz" | sha256sum --check && \ + echo "cf3098950cb4d853ad95c0841f1f9c6d3dc102dccfcacd521d93925208b76ac8 /tmp/openssl-${OPENSSL_VERSION}.tar.gz" | sha256sum --check && \ cd /tmp && \ tar xzvf /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \ rm /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \ @@ -156,7 +168,7 @@ USER nonroot:nonroot WORKDIR /home/nonroot # Python -ENV PYTHON_VERSION=3.9.18 \ +ENV PYTHON_VERSION=3.9.19 \ PYENV_ROOT=/home/nonroot/.pyenv \ PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH RUN set -e \ @@ -180,9 +192,14 @@ WORKDIR /home/nonroot # Rust # Please keep 
the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.79.0 +ENV RUSTC_VERSION=1.80.1 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" +ARG RUSTFILT_VERSION=0.2.1 +ARG CARGO_HAKARI_VERSION=0.9.30 +ARG CARGO_DENY_VERSION=0.16.1 +ARG CARGO_HACK_VERSION=0.6.31 +ARG CARGO_NEXTEST_VERSION=0.9.72 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ chmod +x rustup-init && \ ./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \ @@ -191,15 +208,13 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux . "$HOME/.cargo/env" && \ cargo --version && rustup --version && \ rustup component add llvm-tools-preview rustfmt clippy && \ - cargo install --git https://github.com/paritytech/cachepot && \ - cargo install rustfilt && \ - cargo install cargo-hakari && \ - cargo install cargo-deny --locked && \ - cargo install cargo-hack && \ - cargo install cargo-nextest && \ + cargo install rustfilt --version ${RUSTFILT_VERSION} && \ + cargo install cargo-hakari --version ${CARGO_HAKARI_VERSION} && \ + cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \ + cargo install cargo-hack --version ${CARGO_HACK_VERSION} && \ + cargo install cargo-nextest --version ${CARGO_NEXTEST_VERSION} && \ rm -rf /home/nonroot/.cargo/registry && \ rm -rf /home/nonroot/.cargo/git -ENV RUSTC_WRAPPER=cachepot # Show versions RUN whoami \ diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 3a73ac71b0..7acaf2f2fd 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -94,7 +94,7 @@ RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \ make clean && cp -R /sfcgal/* / -ENV PATH "/usr/local/pgsql/bin:$PATH" +ENV PATH="/usr/local/pgsql/bin:$PATH" RUN wget 
https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \ echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \ @@ -311,9 +311,12 @@ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz FROM build-deps AS rum-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY patches/rum.patch /rum.patch + RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \ echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \ mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . && \ + patch -p1 < /rum.patch && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control @@ -408,7 +411,7 @@ FROM build-deps AS timescaledb-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION -ENV PATH "/usr/local/pgsql/bin:$PATH" +ENV PATH="/usr/local/pgsql/bin:$PATH" RUN case "${PG_VERSION}" in \ "v14" | "v15") \ @@ -441,7 +444,7 @@ FROM build-deps AS pg-hint-plan-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION -ENV PATH "/usr/local/pgsql/bin:$PATH" +ENV PATH="/usr/local/pgsql/bin:$PATH" RUN case "${PG_VERSION}" in \ "v14") \ @@ -467,31 +470,6 @@ RUN case "${PG_VERSION}" in \ make install -j $(getconf _NPROCESSORS_ONLN) && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_hint_plan.control -######################################################################################### -# -# Layer "kq-imcx-pg-build" -# compile kq_imcx extension -# -######################################################################################### -FROM build-deps AS kq-imcx-pg-build -COPY --from=pg-build 
/usr/local/pgsql/ /usr/local/pgsql/ - -ENV PATH "/usr/local/pgsql/bin/:$PATH" -RUN apt-get update && \ - apt-get install -y git libgtk2.0-dev libpq-dev libpam-dev libxslt-dev libkrb5-dev cmake && \ - wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \ - echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \ - mkdir kq_imcx-src && cd kq_imcx-src && tar xzf ../kq_imcx.tar.gz --strip-components=1 -C . && \ - find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ - mkdir build && cd build && \ - cmake -DCMAKE_BUILD_TYPE=Release .. && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control && \ - find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\ - mkdir -p /extensions/kq_imcx && cp /usr/local/pgsql/share/extension/kq_imcx.control /extensions/kq_imcx && \ - sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \ - comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/kq_imcx.tar.zst -T - ######################################################################################### # @@ -502,7 +480,7 @@ RUN apt-get update && \ FROM build-deps AS pg-cron-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \ echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \ mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . 
&& \ @@ -528,7 +506,7 @@ RUN apt-get update && \ libboost-system1.74-dev \ libeigen3-dev -ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \ echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \ mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \ @@ -568,7 +546,7 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar. FROM build-deps AS pg-uuidv7-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \ echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \ mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \ @@ -585,7 +563,7 @@ RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz FROM build-deps AS pg-roaringbitmap-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \ echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \ mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . 
&& \ @@ -602,7 +580,7 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4 FROM build-deps AS pg-semver-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \ echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \ mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . && \ @@ -620,7 +598,7 @@ FROM build-deps AS pg-embedding-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in \ "v14" | "v15") \ export PG_EMBEDDING_VERSION=0.3.5 \ @@ -644,7 +622,7 @@ RUN case "${PG_VERSION}" in \ FROM build-deps AS pg-anon-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \ mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . 
&& \ @@ -679,7 +657,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux chmod +x rustup-init && \ ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ rm rustup-init && \ - cargo install --locked --version 0.10.2 cargo-pgrx && \ + cargo install --locked --version 0.11.3 cargo-pgrx && \ /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' USER root @@ -694,10 +672,15 @@ USER root FROM rust-extensions-build AS pg-jsonschema-pg-build ARG PG_VERSION -RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.2.0.tar.gz -O pg_jsonschema.tar.gz && \ - echo "9118fc508a6e231e7a39acaa6f066fcd79af17a5db757b47d2eefbe14f7794f0 pg_jsonschema.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar.gz -O pg_jsonschema.tar.gz && \ + echo "61df3db1ed83cf24f6aa39c826f8818bfa4f0bd33b587fd6b2b1747985642297 pg_jsonschema.tar.gz" | sha256sum --check && \ mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \ - sed -i 's/pgrx = "0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + # see commit 252b3685a27a0f4c31a0f91e983c6314838e89e8 + # `unsafe-postgres` feature allows to build pgx extensions + # against postgres forks that decided to change their ABI name (like us). + # With that we can build extensions without forking them and using stock + # pgx. As this feature is new few manual version bumps were required. + sed -i 's/pgrx = "0.11.3"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control @@ -711,10 +694,10 @@ RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.2.0.tar. 
FROM rust-extensions-build AS pg-graphql-pg-build ARG PG_VERSION -RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.4.0.tar.gz -O pg_graphql.tar.gz && \ - echo "bd8dc7230282b3efa9ae5baf053a54151ed0e66881c7c53750e2d0c765776edc pg_graphql.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.7.tar.gz -O pg_graphql.tar.gz && \ + echo "2b3e567a5b31019cb97ae0e33263c1bcc28580be5a444ac4c8ece5c4be2aea41 pg_graphql.tar.gz" | sha256sum --check && \ mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \ - sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ # it's needed to enable extension because it uses untrusted C language sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \ @@ -734,6 +717,9 @@ ARG PG_VERSION RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \ echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \ mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . 
&& \ + # TODO update pgrx version in the pg_tiktoken repo and remove this line + sed -i 's/pgrx = { version = "=0.10.2",/pgrx = { version = "0.11.3",/g' Cargo.toml && \ + sed -i 's/pgrx-tests = "=0.10.2"/pgrx-tests = "0.11.3"/g' Cargo.toml && \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control @@ -747,14 +733,10 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6 FROM rust-extensions-build AS pg-pgx-ulid-build ARG PG_VERSION -RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz -O pgx_ulid.tar.gz && \ - echo "ee5db82945d2d9f2d15597a80cf32de9dca67b897f605beb830561705f12683c pgx_ulid.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \ + echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \ mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . 
&& \ - echo "******************* Apply a patch for Postgres 16 support; delete in the next release ******************" && \ - wget https://github.com/pksunkara/pgx_ulid/commit/f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \ - patch -p1 < f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \ - echo "********************************************************************************************************" && \ - sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "=0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control @@ -768,7 +750,7 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz - FROM build-deps AS wal2json-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \ echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \ mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \ @@ -784,7 +766,7 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar. FROM build-deps AS pg-ivm-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \ echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \ mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . 
&& \ @@ -801,7 +783,7 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_iv FROM build-deps AS pg-partman-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \ echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \ mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \ @@ -840,7 +822,6 @@ COPY --from=hll-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=plpgsql-check-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ @@ -952,7 +933,8 @@ COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src #COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src -#COPY --from=rum-pg-build /rum.tar.gz /ext-src +COPY --from=rum-pg-build /rum.tar.gz /ext-src +COPY patches/rum.patch /ext-src #COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src COPY --from=prefix-pg-build /prefix.tar.gz /ext-src @@ -961,11 +943,10 @@ COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src #COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src COPY patches/pg_hintplan.patch /ext-src -#COPY --from=kq-imcx-pg-build /kq_imcx.tar.gz 
/ext-src COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src COPY patches/pg_cron.patch /ext-src #COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src -COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src +#COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src @@ -980,6 +961,7 @@ RUN cd /ext-src/ && for f in *.tar.gz; \ rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \ || exit 1; rm -f $f; done RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch +RUN cd /ext-src/rum-src && patch -p1 <../rum.patch # cmake is required for the h3 test RUN apt-get update && apt-get install -y cmake RUN patch -p1 < /ext-src/pg_hintplan.patch @@ -1052,6 +1034,6 @@ RUN apt update && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 -ENV LANG en_US.utf8 +ENV LANG=en_US.utf8 USER postgres ENTRYPOINT ["/usr/local/bin/compute_ctl"] diff --git a/Makefile b/Makefile index 942867d81a..de298303e3 100644 --- a/Makefile +++ b/Makefile @@ -69,6 +69,8 @@ CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1 # Set PQ_LIB_DIR to make sure `storage_controller` get linked with bundled libpq (through diesel) CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib +CACHEDIR_TAG_CONTENTS := "Signature: 8a477f597d28d172789f06886806bc55" + # # Top level Makefile to build Neon and PostgreSQL # @@ -79,15 +81,24 @@ all: neon postgres neon-pg-ext # # The 'postgres_ffi' depends on the Postgres headers. 
.PHONY: neon -neon: postgres-headers walproposer-lib +neon: postgres-headers walproposer-lib cargo-target-dir +@echo "Compiling Neon" $(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS) +.PHONY: cargo-target-dir +cargo-target-dir: + # https://github.com/rust-lang/cargo/issues/14281 + mkdir -p target + test -e target/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > target/CACHEDIR.TAG ### PostgreSQL parts # Some rules are duplicated for Postgres v14 and 15. We may want to refactor # to avoid the duplication in the future, but it's tolerable for now. # $(POSTGRES_INSTALL_DIR)/build/%/config.status: + + mkdir -p $(POSTGRES_INSTALL_DIR) + test -e $(POSTGRES_INSTALL_DIR)/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > $(POSTGRES_INSTALL_DIR)/CACHEDIR.TAG + +@echo "Configuring Postgres $* build" @test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \ echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \ diff --git a/README.md b/README.md index ea0a289502..f01442da5d 100644 --- a/README.md +++ b/README.md @@ -262,7 +262,7 @@ By default, this runs both debug and release modes, and all supported postgres v testing locally, it is convenient to run just one set of permutations, like this: ```sh -DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest +DEFAULT_PG_VERSION=16 BUILD_TYPE=release ./scripts/pytest ``` ## Flamegraphs diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 8f96530a9d..8af0ed43ce 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -4,6 +4,11 @@ version = "0.1.0" edition.workspace = true license.workspace = true +[features] +default = [] +# Enables test specific features. 
+testing = [] + [dependencies] anyhow.workspace = true async-compression.workspace = true @@ -44,3 +49,4 @@ vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" } zstd = "0.13" bytes = "1.0" rust-ini = "0.20.0" +rlimit = "0.10.1" diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 7bf5db5a57..0ba2c1aeb4 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -6,7 +6,7 @@ //! - Every start is a fresh start, so the data directory is removed and //! initialized again on each run. //! - If remote_extension_config is provided, it will be used to fetch extensions list -//! and download `shared_preload_libraries` from the remote storage. +//! and download `shared_preload_libraries` from the remote storage. //! - Next it will put configuration files into the `PGDATA` directory. //! - Sync safekeepers and get commit LSN. //! - Get `basebackup` from pageserver using the returned on the previous step LSN. @@ -33,7 +33,6 @@ //! -b /usr/local/bin/postgres \ //! -r http://pg-ext-s3-gateway \ //! ``` -//! use std::collections::HashMap; use std::fs::File; use std::path::Path; @@ -64,6 +63,7 @@ use compute_tools::monitor::launch_monitor; use compute_tools::params::*; use compute_tools::spec::*; use compute_tools::swap::resize_swap; +use rlimit::{setrlimit, Resource}; // this is an arbitrary build tag. 
Fine as a default / for testing purposes // in-case of not-set environment var @@ -72,6 +72,9 @@ const BUILD_TAG_DEFAULT: &str = "latest"; fn main() -> Result<()> { let (build_tag, clap_args) = init()?; + // enable core dumping for all child processes + setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?; + let (pg_handle, start_pg_result) = { // Enter startup tracing context let _startup_context_guard = startup_context_from_env(); diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index a79b666409..5bd6897fe3 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -56,6 +56,7 @@ pub struct ComputeNode { /// - we push new spec and it does reconfiguration /// - but then something happens and compute pod / VM is destroyed, /// so k8s controller starts it again with the **old** spec + /// /// and the same for empty computes: /// - we started compute without any spec /// - we push spec and it does configuration @@ -399,7 +400,15 @@ impl ComputeNode { pub fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> { let mut retry_period_ms = 500.0; let mut attempts = 0; - let max_attempts = 10; + const DEFAULT_ATTEMPTS: u16 = 10; + #[cfg(feature = "testing")] + let max_attempts = if let Ok(v) = env::var("NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES") { + u16::from_str(&v).unwrap() + } else { + DEFAULT_ATTEMPTS + }; + #[cfg(not(feature = "testing"))] + let max_attempts = DEFAULT_ATTEMPTS; loop { let result = self.try_get_basebackup(compute_state, lsn); match result { @@ -798,7 +807,11 @@ impl ComputeNode { // In this case we need to connect with old `zenith_admin` name // and create new user. We cannot simply rename connected user, // but we can create a new one and grant it all privileges. 
- let connstr = self.connstr.clone(); + let mut connstr = self.connstr.clone(); + connstr + .query_pairs_mut() + .append_pair("application_name", "apply_config"); + let mut client = match Client::connect(connstr.as_str(), NoTls) { Err(e) => match e.code() { Some(&SqlState::INVALID_PASSWORD) @@ -867,15 +880,19 @@ impl ComputeNode { // Run migrations separately to not hold up cold starts thread::spawn(move || { + let mut connstr = connstr.clone(); + connstr + .query_pairs_mut() + .append_pair("application_name", "migrations"); + let mut client = Client::connect(connstr.as_str(), NoTls)?; handle_migrations(&mut client).context("apply_config handle_migrations") }); Ok(()) } - // We could've wrapped this around `pg_ctl reload`, but right now we don't use - // `pg_ctl` for start / stop, so this just seems much easier to do as we already - // have opened connection to Postgres and superuser access. + // Wrapped this around `pg_ctl reload`, but right now we don't use + // `pg_ctl` for start / stop. #[instrument(skip_all)] fn pg_reload_conf(&self) -> Result<()> { let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl"); @@ -1108,7 +1125,7 @@ impl ComputeNode { // EKS worker nodes have following core dump settings: // /proc/sys/kernel/core_pattern -> core // /proc/sys/kernel/core_uses_pid -> 1 - // ulimint -c -> unlimited + // ulimit -c -> unlimited // which results in core dumps being written to postgres data directory as core.. 
// // Use that as a default location and pattern, except macos where core dumps are written @@ -1387,7 +1404,9 @@ pub fn forward_termination_signal() { let pg_pid = PG_PID.load(Ordering::SeqCst); if pg_pid != 0 { let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32); - // use 'immediate' shutdown (SIGQUIT): https://www.postgresql.org/docs/current/server-shutdown.html - kill(pg_pid, Signal::SIGQUIT).ok(); + // Use 'fast' shutdown (SIGINT) because it also creates a shutdown checkpoint, which is important for + // ROs to get a list of running xacts faster instead of going through the CLOG. + // See https://www.postgresql.org/docs/current/server-shutdown.html for the list of modes and signals. + kill(pg_pid, Signal::SIGINT).ok(); } } diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 2c4aec4116..479100eb89 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -83,12 +83,6 @@ pub fn write_postgres_conf( ComputeMode::Replica => { // hot_standby is 'on' by default, but let's be explicit writeln!(file, "hot_standby=on")?; - - // Inform the replica about the primary state - // Default is 'false' - if let Some(primary_is_running) = spec.primary_is_running { - writeln!(file, "neon.primary_is_running={}", primary_is_running)?; - } } } diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index 18c228ba54..543d4462ed 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -11,6 +11,7 @@ pub mod logger; pub mod catalog; pub mod compute; pub mod extension_server; +mod migration; pub mod monitor; pub mod params; pub mod pg_helpers; diff --git a/compute_tools/src/migration.rs b/compute_tools/src/migration.rs new file mode 100644 index 0000000000..22ab145eda --- /dev/null +++ b/compute_tools/src/migration.rs @@ -0,0 +1,105 @@ +use anyhow::{Context, Result}; +use postgres::Client; +use tracing::info; + +pub(crate) struct MigrationRunner<'m> { + client: &'m mut Client, + migrations: &'m [&'m str], +} + 
+impl<'m> MigrationRunner<'m> { + pub fn new(client: &'m mut Client, migrations: &'m [&'m str]) -> Self { + // The neon_migration.migration_id::id column is a bigint, which is equivalent to an i64 + assert!(migrations.len() + 1 < i64::MAX as usize); + + Self { client, migrations } + } + + fn get_migration_id(&mut self) -> Result { + let query = "SELECT id FROM neon_migration.migration_id"; + let row = self + .client + .query_one(query, &[]) + .context("run_migrations get migration_id")?; + + Ok(row.get::<&str, i64>("id")) + } + + fn update_migration_id(&mut self, migration_id: i64) -> Result<()> { + let setval = format!("UPDATE neon_migration.migration_id SET id={}", migration_id); + + self.client + .simple_query(&setval) + .context("run_migrations update id")?; + + Ok(()) + } + + fn prepare_migrations(&mut self) -> Result<()> { + let query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; + self.client.simple_query(query)?; + + let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)"; + self.client.simple_query(query)?; + + let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING"; + self.client.simple_query(query)?; + + let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin"; + self.client.simple_query(query)?; + + let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC"; + self.client.simple_query(query)?; + + Ok(()) + } + + pub fn run_migrations(mut self) -> Result<()> { + self.prepare_migrations()?; + + let mut current_migration = self.get_migration_id()? as usize; + while current_migration < self.migrations.len() { + macro_rules! 
migration_id { + ($cm:expr) => { + ($cm + 1) as i64 + }; + } + + let migration = self.migrations[current_migration]; + + if migration.starts_with("-- SKIP") { + info!("Skipping migration id={}", migration_id!(current_migration)); + } else { + info!( + "Running migration id={}:\n{}\n", + migration_id!(current_migration), + migration + ); + + self.client + .simple_query("BEGIN") + .context("begin migration")?; + + self.client.simple_query(migration).with_context(|| { + format!( + "run_migrations migration id={}", + migration_id!(current_migration) + ) + })?; + + // Migration IDs start at 1 + self.update_migration_id(migration_id!(current_migration))?; + + self.client + .simple_query("COMMIT") + .context("commit migration")?; + + info!("Finished migration id={}", migration_id!(current_migration)); + } + + current_migration += 1; + } + + Ok(()) + } +} diff --git a/compute_tools/src/migrations/0000-neon_superuser_bypass_rls.sql b/compute_tools/src/migrations/0001-neon_superuser_bypass_rls.sql similarity index 100% rename from compute_tools/src/migrations/0000-neon_superuser_bypass_rls.sql rename to compute_tools/src/migrations/0001-neon_superuser_bypass_rls.sql diff --git a/compute_tools/src/migrations/0001-alter_roles.sql b/compute_tools/src/migrations/0002-alter_roles.sql similarity index 100% rename from compute_tools/src/migrations/0001-alter_roles.sql rename to compute_tools/src/migrations/0002-alter_roles.sql diff --git a/compute_tools/src/migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql b/compute_tools/src/migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql similarity index 100% rename from compute_tools/src/migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql rename to compute_tools/src/migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql diff --git a/compute_tools/src/migrations/0003-grant_pg_monitor_to_neon_superuser.sql b/compute_tools/src/migrations/0004-grant_pg_monitor_to_neon_superuser.sql 
similarity index 100% rename from compute_tools/src/migrations/0003-grant_pg_monitor_to_neon_superuser.sql rename to compute_tools/src/migrations/0004-grant_pg_monitor_to_neon_superuser.sql diff --git a/compute_tools/src/migrations/0004-grant_all_on_tables_to_neon_superuser.sql b/compute_tools/src/migrations/0005-grant_all_on_tables_to_neon_superuser.sql similarity index 100% rename from compute_tools/src/migrations/0004-grant_all_on_tables_to_neon_superuser.sql rename to compute_tools/src/migrations/0005-grant_all_on_tables_to_neon_superuser.sql diff --git a/compute_tools/src/migrations/0005-grant_all_on_sequences_to_neon_superuser.sql b/compute_tools/src/migrations/0006-grant_all_on_sequences_to_neon_superuser.sql similarity index 100% rename from compute_tools/src/migrations/0005-grant_all_on_sequences_to_neon_superuser.sql rename to compute_tools/src/migrations/0006-grant_all_on_sequences_to_neon_superuser.sql diff --git a/compute_tools/src/migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql b/compute_tools/src/migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql similarity index 100% rename from compute_tools/src/migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql rename to compute_tools/src/migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql diff --git a/compute_tools/src/migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql b/compute_tools/src/migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql similarity index 100% rename from compute_tools/src/migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql rename to compute_tools/src/migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql diff --git a/compute_tools/src/migrations/0008-revoke_replication_for_previously_allowed_roles.sql b/compute_tools/src/migrations/0009-revoke_replication_for_previously_allowed_roles.sql 
similarity index 100% rename from compute_tools/src/migrations/0008-revoke_replication_for_previously_allowed_roles.sql rename to compute_tools/src/migrations/0009-revoke_replication_for_previously_allowed_roles.sql diff --git a/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql b/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql new file mode 100644 index 0000000000..28750e00dd --- /dev/null +++ b/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql @@ -0,0 +1,7 @@ +DO $$ +BEGIN + IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN + EXECUTE 'GRANT EXECUTE ON FUNCTION pg_export_snapshot TO neon_superuser'; + EXECUTE 'GRANT EXECUTE ON FUNCTION pg_log_standby_snapshot TO neon_superuser'; + END IF; +END $$; diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index 872a3f7750..d7127aac32 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -17,7 +17,11 @@ const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500); // should be handled gracefully. fn watch_compute_activity(compute: &ComputeNode) { // Suppose that `connstr` doesn't change - let connstr = compute.connstr.as_str(); + let mut connstr = compute.connstr.clone(); + connstr + .query_pairs_mut() + .append_pair("application_name", "compute_activity_monitor"); + let connstr = connstr.as_str(); // During startup and configuration we connect to every Postgres database, // but we don't want to count this as some user activity. So wait until diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index fa0822748b..863fa9468f 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -489,7 +489,7 @@ pub fn handle_postgres_logs(stderr: std::process::ChildStderr) -> JoinHandle<()> /// Read Postgres logs from `stderr` until EOF. 
Buffer is flushed on one of the following conditions: /// - next line starts with timestamp /// - EOF -/// - no new lines were written for the last second +/// - no new lines were written for the last 100 milliseconds async fn handle_postgres_logs_async(stderr: tokio::process::ChildStderr) -> Result<()> { let mut lines = tokio::io::BufReader::new(stderr).lines(); let timeout_duration = Duration::from_millis(100); diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 143f6c1e5f..6a87263821 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -10,6 +10,7 @@ use tracing::{error, info, info_span, instrument, span_enabled, warn, Level}; use crate::config; use crate::logger::inlinify; +use crate::migration::MigrationRunner; use crate::params::PG_HBA_ALL_MD5; use crate::pg_helpers::*; @@ -776,84 +777,25 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> { // Add new migrations in numerical order. let migrations = [ - include_str!("./migrations/0000-neon_superuser_bypass_rls.sql"), - include_str!("./migrations/0001-alter_roles.sql"), - include_str!("./migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql"), - include_str!("./migrations/0003-grant_pg_monitor_to_neon_superuser.sql"), - include_str!("./migrations/0004-grant_all_on_tables_to_neon_superuser.sql"), - include_str!("./migrations/0005-grant_all_on_sequences_to_neon_superuser.sql"), + include_str!("./migrations/0001-neon_superuser_bypass_rls.sql"), + include_str!("./migrations/0002-alter_roles.sql"), + include_str!("./migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql"), + include_str!("./migrations/0004-grant_pg_monitor_to_neon_superuser.sql"), + include_str!("./migrations/0005-grant_all_on_tables_to_neon_superuser.sql"), + include_str!("./migrations/0006-grant_all_on_sequences_to_neon_superuser.sql"), include_str!( - "./migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql" + 
"./migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql" ), include_str!( - "./migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql" + "./migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql" + ), + include_str!("./migrations/0009-revoke_replication_for_previously_allowed_roles.sql"), + include_str!( + "./migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql" ), - include_str!("./migrations/0008-revoke_replication_for_previously_allowed_roles.sql"), ]; - let mut func = || { - let query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; - client.simple_query(query)?; - - let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)"; - client.simple_query(query)?; - - let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING"; - client.simple_query(query)?; - - let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin"; - client.simple_query(query)?; - - let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC"; - client.simple_query(query)?; - Ok::<_, anyhow::Error>(()) - }; - func().context("handle_migrations prepare")?; - - let query = "SELECT id FROM neon_migration.migration_id"; - let row = client - .query_one(query, &[]) - .context("handle_migrations get migration_id")?; - let mut current_migration: usize = row.get::<&str, i64>("id") as usize; - let starting_migration_id = current_migration; - - let query = "BEGIN"; - client - .simple_query(query) - .context("handle_migrations begin")?; - - while current_migration < migrations.len() { - let migration = &migrations[current_migration]; - if migration.starts_with("-- SKIP") { - info!("Skipping migration id={}", current_migration); - } else { - info!( - "Running migration id={}:\n{}\n", - current_migration, migration - ); - client.simple_query(migration).with_context(|| { - format!("handle_migrations 
current_migration={}", current_migration) - })?; - } - current_migration += 1; - } - let setval = format!( - "UPDATE neon_migration.migration_id SET id={}", - migrations.len() - ); - client - .simple_query(&setval) - .context("handle_migrations update id")?; - - let query = "COMMIT"; - client - .simple_query(query) - .context("handle_migrations commit")?; - - info!( - "Ran {} migrations", - (migrations.len() - starting_migration_id) - ); + MigrationRunner::new(client, &migrations).run_migrations()?; Ok(()) } diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index e62f3b8a47..487ac8f047 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -40,6 +40,7 @@ safekeeper_api.workspace = true postgres_connection.workspace = true storage_broker.workspace = true utils.workspace = true +whoami.workspace = true compute_api.workspace = true workspace_hack.workspace = true diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index 3f4ddbdb2b..619c5bce3e 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -36,11 +36,11 @@ use utils::pid_file::{self, PidFileRead}; // it's waiting. If the process hasn't started/stopped after 5 seconds, // it prints a notice that it's taking long, but keeps waiting. // -const RETRY_UNTIL_SECS: u64 = 10; -const RETRIES: u64 = (RETRY_UNTIL_SECS * 1000) / RETRY_INTERVAL_MILLIS; -const RETRY_INTERVAL_MILLIS: u64 = 100; -const DOT_EVERY_RETRIES: u64 = 10; -const NOTICE_AFTER_RETRIES: u64 = 50; +const STOP_RETRY_TIMEOUT: Duration = Duration::from_secs(10); +const STOP_RETRIES: u128 = STOP_RETRY_TIMEOUT.as_millis() / RETRY_INTERVAL.as_millis(); +const RETRY_INTERVAL: Duration = Duration::from_millis(100); +const DOT_EVERY_RETRIES: u128 = 10; +const NOTICE_AFTER_RETRIES: u128 = 50; /// Argument to `start_process`, to indicate whether it should create pidfile or if the process creates /// it itself. 
@@ -52,6 +52,7 @@ pub enum InitialPidFile { } /// Start a background child process using the parameters given. +#[allow(clippy::too_many_arguments)] pub async fn start_process( process_name: &str, datadir: &Path, @@ -59,6 +60,7 @@ pub async fn start_process( args: AI, envs: EI, initial_pid_file: InitialPidFile, + retry_timeout: &Duration, process_status_check: F, ) -> anyhow::Result<()> where @@ -69,6 +71,7 @@ where // Not generic AsRef, otherwise empty `envs` prevents type inference EI: IntoIterator, { + let retries: u128 = retry_timeout.as_millis() / RETRY_INTERVAL.as_millis(); if !datadir.metadata().context("stat datadir")?.is_dir() { anyhow::bail!("`datadir` must be a directory when calling this function: {datadir:?}"); } @@ -130,7 +133,7 @@ where .unwrap(); }); - for retries in 0..RETRIES { + for retries in 0..retries { match process_started(pid, pid_file_to_check, &process_status_check).await { Ok(true) => { println!("\n{process_name} started and passed status check, pid: {pid}"); @@ -148,7 +151,7 @@ where print!("."); io::stdout().flush().unwrap(); } - thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS)); + thread::sleep(RETRY_INTERVAL); } Err(e) => { println!("error starting process {process_name:?}: {e:#}"); @@ -157,9 +160,10 @@ where } } println!(); - anyhow::bail!( - "{process_name} did not start+pass status checks within {RETRY_UNTIL_SECS} seconds" - ); + anyhow::bail!(format!( + "{} did not start+pass status checks within {:?} seconds", + process_name, retry_timeout + )); } /// Stops the process, using the pid file given. Returns Ok also if the process is already not running. 
@@ -215,7 +219,7 @@ pub fn stop_process( } pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> { - for retries in 0..RETRIES { + for retries in 0..STOP_RETRIES { match process_has_stopped(pid) { Ok(true) => { println!("\n{process_name} stopped"); @@ -231,7 +235,7 @@ pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> { print!("."); io::stdout().flush().unwrap(); } - thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS)); + thread::sleep(RETRY_INTERVAL); } Err(e) => { println!("{process_name} with pid {pid} failed to stop: {e:#}"); @@ -240,7 +244,10 @@ pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> { } } println!(); - anyhow::bail!("{process_name} with pid {pid} did not stop in {RETRY_UNTIL_SECS} seconds"); + anyhow::bail!(format!( + "{} with pid {} did not stop in {:?} seconds", + process_name, pid, STOP_RETRY_TIMEOUT + )); } fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command { @@ -282,7 +289,7 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command { fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command { for (var, val) in std::env::vars() { - if var.starts_with("NEON_PAGESERVER_") { + if var.starts_with("NEON_") { cmd = cmd.env(var, val); } } @@ -372,7 +379,7 @@ where } } -fn process_has_stopped(pid: Pid) -> anyhow::Result { +pub(crate) fn process_has_stopped(pid: Pid) -> anyhow::Result { match kill(pid, None) { // Process exists, keep waiting Ok(_) => Ok(false), diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 8fe959792b..1d66532d49 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -15,16 +15,18 @@ use control_plane::local_env::{ }; use control_plane::pageserver::PageServerNode; use control_plane::safekeeper::SafekeeperNode; -use control_plane::storage_controller::StorageController; +use control_plane::storage_controller::{ + 
NeonStorageControllerStartArgs, NeonStorageControllerStopArgs, StorageController, +}; use control_plane::{broker, local_env}; use pageserver_api::config::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT, }; -use pageserver_api::controller_api::PlacementPolicy; -use pageserver_api::models::{ - ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo, +use pageserver_api::controller_api::{ + NodeAvailabilityWrapper, PlacementPolicy, TenantCreateRequest, }; +use pageserver_api::models::{ShardParameters, TimelineCreateRequest, TimelineInfo}; use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId}; use postgres_backend::AuthType; use postgres_connection::parse_host_port; @@ -36,6 +38,7 @@ use std::collections::{BTreeSet, HashMap}; use std::path::PathBuf; use std::process::exit; use std::str::FromStr; +use std::time::Duration; use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR; use url::Host; use utils::{ @@ -51,7 +54,7 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1); const DEFAULT_BRANCH_NAME: &str = "main"; project_git_version!(GIT_VERSION); -const DEFAULT_PG_VERSION: &str = "15"; +const DEFAULT_PG_VERSION: &str = "16"; const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/"; @@ -99,7 +102,7 @@ fn main() -> Result<()> { let subcommand_result = match sub_name { "tenant" => rt.block_on(handle_tenant(sub_args, &mut env)), "timeline" => rt.block_on(handle_timeline(sub_args, &mut env)), - "start" => rt.block_on(handle_start_all(&env)), + "start" => rt.block_on(handle_start_all(&env, get_start_timeout(sub_args))), "stop" => rt.block_on(handle_stop_all(sub_args, &env)), "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)), "storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)), @@ -599,13 +602,9 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local Some(("import", 
import_match)) => { let tenant_id = get_tenant_id(import_match, env)?; let timeline_id = parse_timeline_id(import_match)?.expect("No timeline id provided"); - let name = import_match - .get_one::("node-name") - .ok_or_else(|| anyhow!("No node name provided"))?; - let update_catalog = import_match - .get_one::("update-catalog") - .cloned() - .unwrap_or_default(); + let branch_name = import_match + .get_one::("branch-name") + .ok_or_else(|| anyhow!("No branch name provided"))?; // Parse base inputs let base_tarfile = import_match @@ -632,24 +631,11 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local .copied() .context("Failed to parse postgres version from the argument string")?; - let mut cplane = ComputeControlPlane::load(env.clone())?; println!("Importing timeline into pageserver ..."); pageserver .timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version) .await?; - env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?; - - println!("Creating endpoint for imported timeline ..."); - cplane.new_endpoint( - name, - tenant_id, - timeline_id, - None, - None, - pg_version, - ComputeMode::Primary, - !update_catalog, - )?; + env.register_branch_mapping(branch_name.to_string(), tenant_id, timeline_id)?; println!("Done"); } Some(("branch", branch_match)) => { @@ -864,20 +850,13 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re let allow_multiple = sub_args.get_flag("allow-multiple"); - // If --safekeepers argument is given, use only the listed safekeeper nodes. 
- let safekeepers = - if let Some(safekeepers_str) = sub_args.get_one::("safekeepers") { - let mut safekeepers: Vec = Vec::new(); - for sk_id in safekeepers_str.split(',').map(str::trim) { - let sk_id = NodeId(u64::from_str(sk_id).map_err(|_| { - anyhow!("invalid node ID \"{sk_id}\" in --safekeepers list") - })?); - safekeepers.push(sk_id); - } - safekeepers - } else { - env.safekeepers.iter().map(|sk| sk.id).collect() - }; + // If --safekeepers argument is given, use only the listed + // safekeeper nodes; otherwise all from the env. + let safekeepers = if let Some(safekeepers) = parse_safekeepers(sub_args)? { + safekeepers + } else { + env.safekeepers.iter().map(|sk| sk.id).collect() + }; let endpoint = cplane .endpoints @@ -981,7 +960,10 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re }) .collect::>() }; - endpoint.reconfigure(pageservers, None).await?; + // If --safekeepers argument is given, use only the listed + // safekeeper nodes; otherwise all from the env. + let safekeepers = parse_safekeepers(sub_args)?; + endpoint.reconfigure(pageservers, None, safekeepers).await?; } "stop" => { let endpoint_id = sub_args @@ -1003,6 +985,23 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re Ok(()) } +/// Parse --safekeepers as list of safekeeper ids. 
+fn parse_safekeepers(sub_args: &ArgMatches) -> Result>> { + if let Some(safekeepers_str) = sub_args.get_one::("safekeepers") { + let mut safekeepers: Vec = Vec::new(); + for sk_id in safekeepers_str.split(',').map(str::trim) { + let sk_id = NodeId( + u64::from_str(sk_id) + .map_err(|_| anyhow!("invalid node ID \"{sk_id}\" in --safekeepers list"))?, + ); + safekeepers.push(sk_id); + } + Ok(Some(safekeepers)) + } else { + Ok(None) + } +} + fn handle_mappings(sub_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> { let (sub_name, sub_args) = match sub_match.subcommand() { Some(ep_subcommand_data) => ep_subcommand_data, @@ -1048,10 +1047,50 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result &Duration { + let humantime_duration = args + .get_one::("start-timeout") + .expect("invalid value for start-timeout"); + humantime_duration.as_ref() +} + +fn storage_controller_start_args(args: &ArgMatches) -> NeonStorageControllerStartArgs { + let maybe_instance_id = args.get_one::("instance-id"); + + let base_port = args.get_one::("base-port"); + + if maybe_instance_id.is_some() && base_port.is_none() { + panic!("storage-controller start specificied instance-id but did not provide base-port"); + } + + let start_timeout = args + .get_one::("start-timeout") + .expect("invalid value for start-timeout"); + + NeonStorageControllerStartArgs { + instance_id: maybe_instance_id.copied().unwrap_or(1), + base_port: base_port.copied(), + start_timeout: *start_timeout, + } +} + +fn storage_controller_stop_args(args: &ArgMatches) -> NeonStorageControllerStopArgs { + let maybe_instance_id = args.get_one::("instance-id"); + let immediate = args.get_one::("stop-mode").map(|s| s.as_str()) == Some("immediate"); + + NeonStorageControllerStopArgs { + instance_id: maybe_instance_id.copied().unwrap_or(1), + immediate, + } +} + async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { match sub_match.subcommand() { 
Some(("start", subcommand_args)) => { - if let Err(e) = get_pageserver(env, subcommand_args)?.start().await { + if let Err(e) = get_pageserver(env, subcommand_args)? + .start(get_start_timeout(subcommand_args)) + .await + { eprintln!("pageserver start failed: {e}"); exit(1); } @@ -1077,7 +1116,7 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> exit(1); } - if let Err(e) = pageserver.start().await { + if let Err(e) = pageserver.start(get_start_timeout(sub_match)).await { eprintln!("pageserver start failed: {e}"); exit(1); } @@ -1105,20 +1144,15 @@ async fn handle_storage_controller( ) -> Result<()> { let svc = StorageController::from_env(env); match sub_match.subcommand() { - Some(("start", _start_match)) => { - if let Err(e) = svc.start().await { + Some(("start", start_match)) => { + if let Err(e) = svc.start(storage_controller_start_args(start_match)).await { eprintln!("start failed: {e}"); exit(1); } } Some(("stop", stop_match)) => { - let immediate = stop_match - .get_one::("stop-mode") - .map(|s| s.as_str()) - == Some("immediate"); - - if let Err(e) = svc.stop(immediate).await { + if let Err(e) = svc.stop(storage_controller_stop_args(stop_match)).await { eprintln!("stop failed: {}", e); exit(1); } @@ -1165,7 +1199,10 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> "start" => { let extra_opts = safekeeper_extra_opts(sub_args); - if let Err(e) = safekeeper.start(extra_opts).await { + if let Err(e) = safekeeper + .start(extra_opts, get_start_timeout(sub_args)) + .await + { eprintln!("safekeeper start failed: {}", e); exit(1); } @@ -1191,7 +1228,10 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> } let extra_opts = safekeeper_extra_opts(sub_args); - if let Err(e) = safekeeper.start(extra_opts).await { + if let Err(e) = safekeeper + .start(extra_opts, get_start_timeout(sub_args)) + .await + { eprintln!("safekeeper start failed: {}", e); exit(1); } @@ -1204,15 
+1244,23 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Ok(()) } -async fn handle_start_all(env: &local_env::LocalEnv) -> anyhow::Result<()> { +async fn handle_start_all( + env: &local_env::LocalEnv, + retry_timeout: &Duration, +) -> anyhow::Result<()> { // Endpoints are not started automatically - broker::start_broker_process(env).await?; + broker::start_broker_process(env, retry_timeout).await?; // Only start the storage controller if the pageserver is configured to need it if env.control_plane_api.is_some() { let storage_controller = StorageController::from_env(env); - if let Err(e) = storage_controller.start().await { + if let Err(e) = storage_controller + .start(NeonStorageControllerStartArgs::with_default_instance_id( + (*retry_timeout).into(), + )) + .await + { eprintln!("storage_controller start failed: {:#}", e); try_stop_all(env, true).await; exit(1); @@ -1221,7 +1269,7 @@ async fn handle_start_all(env: &local_env::LocalEnv) -> anyhow::Result<()> { for ps_conf in &env.pageservers { let pageserver = PageServerNode::from_env(env, ps_conf); - if let Err(e) = pageserver.start().await { + if let Err(e) = pageserver.start(retry_timeout).await { eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e); try_stop_all(env, true).await; exit(1); @@ -1230,15 +1278,76 @@ async fn handle_start_all(env: &local_env::LocalEnv) -> anyhow::Result<()> { for node in env.safekeepers.iter() { let safekeeper = SafekeeperNode::from_env(env, node); - if let Err(e) = safekeeper.start(vec![]).await { + if let Err(e) = safekeeper.start(vec![], retry_timeout).await { eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e); try_stop_all(env, false).await; exit(1); } } + + neon_start_status_check(env, retry_timeout).await?; + Ok(()) } +async fn neon_start_status_check( + env: &local_env::LocalEnv, + retry_timeout: &Duration, +) -> anyhow::Result<()> { + const RETRY_INTERVAL: Duration = Duration::from_millis(100); + const 
NOTICE_AFTER_RETRIES: Duration = Duration::from_secs(5); + + if env.control_plane_api.is_none() { + return Ok(()); + } + + let storcon = StorageController::from_env(env); + + let retries = retry_timeout.as_millis() / RETRY_INTERVAL.as_millis(); + let notice_after_retries = retry_timeout.as_millis() / NOTICE_AFTER_RETRIES.as_millis(); + + println!("\nRunning neon status check"); + + for retry in 0..retries { + if retry == notice_after_retries { + println!("\nNeon status check has not passed yet, continuing to wait") + } + + let mut passed = true; + let mut nodes = storcon.node_list().await?; + let mut pageservers = env.pageservers.clone(); + + if nodes.len() != pageservers.len() { + continue; + } + + nodes.sort_by_key(|ps| ps.id); + pageservers.sort_by_key(|ps| ps.id); + + for (idx, pageserver) in pageservers.iter().enumerate() { + let node = &nodes[idx]; + if node.id != pageserver.id { + passed = false; + break; + } + + if !matches!(node.availability, NodeAvailabilityWrapper::Active) { + passed = false; + break; + } + } + + if passed { + println!("\nNeon started and passed status check"); + return Ok(()); + } + + tokio::time::sleep(RETRY_INTERVAL).await; + } + + anyhow::bail!("\nNeon passed status check") +} + async fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { let immediate = sub_match.get_one::("stop-mode").map(|s| s.as_str()) == Some("immediate"); @@ -1281,15 +1390,35 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { eprintln!("neon broker stop failed: {e:#}"); } - if env.control_plane_api.is_some() { + // Stop all storage controller instances. In the most common case there's only one, + // but iterate though the base data directory in order to discover the instances. 
+ let storcon_instances = env + .storage_controller_instances() + .await + .expect("Must inspect data dir"); + for (instance_id, _instance_dir_path) in storcon_instances { let storage_controller = StorageController::from_env(env); - if let Err(e) = storage_controller.stop(immediate).await { - eprintln!("storage controller stop failed: {e:#}"); + let stop_args = NeonStorageControllerStopArgs { + instance_id, + immediate, + }; + + if let Err(e) = storage_controller.stop(stop_args).await { + eprintln!("Storage controller instance {instance_id} stop failed: {e:#}"); } } } fn cli() -> Command { + let timeout_arg = Arg::new("start-timeout") + .long("start-timeout") + .short('t') + .global(true) + .help("timeout until we fail the command, e.g. 30s") + .value_parser(value_parser!(humantime::Duration)) + .default_value("10s") + .required(false); + let branch_name_arg = Arg::new("branch-name") .long("branch-name") .help("Name of the branch to be created or used as an alias for other services") @@ -1415,6 +1544,18 @@ fn cli() -> Command { .action(ArgAction::SetTrue) .required(false); + let instance_id = Arg::new("instance-id") + .long("instance-id") + .help("Identifier used to distinguish storage controller instances (default 1)") + .value_parser(value_parser!(u8)) + .required(false); + + let base_port = Arg::new("base-port") + .long("base-port") + .help("Base port for the storage controller instance idenfified by instance-id (defaults to pagserver cplane api)") + .value_parser(value_parser!(u16)) + .required(false); + Command::new("Neon CLI") .arg_required_else_help(true) .version(GIT_VERSION) @@ -1458,8 +1599,7 @@ fn cli() -> Command { .about("Import timeline from basebackup directory") .arg(tenant_id_arg.clone()) .arg(timeline_id_arg.clone()) - .arg(Arg::new("node-name").long("node-name") - .help("Name to assign to the imported timeline")) + .arg(branch_name_arg.clone()) .arg(Arg::new("base-tarfile") .long("base-tarfile") .value_parser(value_parser!(PathBuf)) @@ -1475,7 
+1615,6 @@ fn cli() -> Command { .arg(Arg::new("end-lsn").long("end-lsn") .help("Lsn the basebackup ends at")) .arg(pg_version_arg.clone()) - .arg(update_catalog.clone()) ) ).subcommand( Command::new("tenant") @@ -1509,6 +1648,7 @@ fn cli() -> Command { .subcommand(Command::new("status")) .subcommand(Command::new("start") .about("Start local pageserver") + .arg(timeout_arg.clone()) ) .subcommand(Command::new("stop") .about("Stop local pageserver") @@ -1516,15 +1656,20 @@ fn cli() -> Command { ) .subcommand(Command::new("restart") .about("Restart local pageserver") + .arg(timeout_arg.clone()) ) ) .subcommand( Command::new("storage_controller") .arg_required_else_help(true) .about("Manage storage_controller") - .subcommand(Command::new("start").about("Start storage controller")) + .subcommand(Command::new("start").about("Start storage controller") + .arg(timeout_arg.clone()) + .arg(instance_id.clone()) + .arg(base_port)) .subcommand(Command::new("stop").about("Stop storage controller") - .arg(stop_mode_arg.clone())) + .arg(stop_mode_arg.clone()) + .arg(instance_id)) ) .subcommand( Command::new("safekeeper") @@ -1534,6 +1679,7 @@ fn cli() -> Command { .about("Start local safekeeper") .arg(safekeeper_id_arg.clone()) .arg(safekeeper_extra_opt_arg.clone()) + .arg(timeout_arg.clone()) ) .subcommand(Command::new("stop") .about("Stop local safekeeper") @@ -1545,6 +1691,7 @@ fn cli() -> Command { .arg(safekeeper_id_arg) .arg(stop_mode_arg.clone()) .arg(safekeeper_extra_opt_arg) + .arg(timeout_arg.clone()) ) ) .subcommand( @@ -1575,14 +1722,16 @@ fn cli() -> Command { .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.") .arg(endpoint_id_arg.clone()) .arg(endpoint_pageserver_id_arg.clone()) - .arg(safekeepers_arg) + .arg(safekeepers_arg.clone()) .arg(remote_ext_config_args) .arg(create_test_user) .arg(allow_multiple.clone()) + .arg(timeout_arg.clone()) ) .subcommand(Command::new("reconfigure") .about("Reconfigure the endpoint") 
.arg(endpoint_pageserver_id_arg) + .arg(safekeepers_arg) .arg(endpoint_id_arg.clone()) .arg(tenant_id_arg.clone()) ) @@ -1630,6 +1779,7 @@ fn cli() -> Command { .subcommand( Command::new("start") .about("Start page server and safekeepers") + .arg(timeout_arg.clone()) ) .subcommand( Command::new("stop") diff --git a/control_plane/src/broker.rs b/control_plane/src/broker.rs index f40705863b..c8ac5d8981 100644 --- a/control_plane/src/broker.rs +++ b/control_plane/src/broker.rs @@ -1,17 +1,22 @@ //! Code to manage the storage broker //! -//! In the local test environment, the data for each safekeeper is stored in +//! In the local test environment, the storage broker stores its data directly in //! //! ```text -//! .neon/safekeepers/ +//! .neon //! ``` +use std::time::Duration; + use anyhow::Context; use camino::Utf8PathBuf; use crate::{background_process, local_env}; -pub async fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { +pub async fn start_broker_process( + env: &local_env::LocalEnv, + retry_timeout: &Duration, +) -> anyhow::Result<()> { let broker = &env.broker; let listen_addr = &broker.listen_addr; @@ -27,6 +32,7 @@ pub async fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<( args, [], background_process::InitialPidFile::Create(storage_broker_pid_file_path(env)), + retry_timeout, || async { let url = broker.client_url(); let status_url = url.join("status").with_context(|| { diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 20371e1cb8..9f879c4b08 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -499,6 +499,23 @@ impl Endpoint { .join(",") } + /// Map safekeepers ids to the actual connection strings. 
+ fn build_safekeepers_connstrs(&self, sk_ids: Vec) -> Result> { + let mut safekeeper_connstrings = Vec::new(); + if self.mode == ComputeMode::Primary { + for sk_id in sk_ids { + let sk = self + .env + .safekeepers + .iter() + .find(|node| node.id == sk_id) + .ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?; + safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.get_compute_port())); + } + } + Ok(safekeeper_connstrings) + } + pub async fn start( &self, auth_token: &Option, @@ -523,18 +540,7 @@ impl Endpoint { let pageserver_connstring = Self::build_pageserver_connstr(&pageservers); assert!(!pageserver_connstring.is_empty()); - let mut safekeeper_connstrings = Vec::new(); - if self.mode == ComputeMode::Primary { - for sk_id in safekeepers { - let sk = self - .env - .safekeepers - .iter() - .find(|node| node.id == sk_id) - .ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?; - safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.get_compute_port())); - } - } + let safekeeper_connstrings = self.build_safekeepers_connstrs(safekeepers)?; // check for file remote_extensions_spec.json // if it is present, read it and pass to compute_ctl @@ -592,7 +598,6 @@ impl Endpoint { remote_extensions, pgbouncer_settings: None, shard_stripe_size: Some(shard_stripe_size), - primary_is_running: None, }; let spec_path = self.endpoint_path().join("spec.json"); std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?; @@ -741,6 +746,7 @@ impl Endpoint { &self, mut pageservers: Vec<(Host, u16)>, stripe_size: Option, + safekeepers: Option>, ) -> Result<()> { let mut spec: ComputeSpec = { let spec_path = self.endpoint_path().join("spec.json"); @@ -775,6 +781,12 @@ impl Endpoint { spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize); } + // If safekeepers are not specified, don't change them. 
+ if let Some(safekeepers) = safekeepers { + let safekeeper_connstrings = self.build_safekeepers_connstrs(safekeepers)?; + spec.safekeeper_connstrings = safekeeper_connstrings; + } + let client = reqwest::Client::builder() .timeout(Duration::from_secs(30)) .build() @@ -812,11 +824,12 @@ impl Endpoint { // cleanup work to do after postgres stops, like syncing safekeepers, // etc. // - // If destroying, send it SIGTERM before waiting. Sometimes we do *not* - // want this cleanup: tests intentionally do stop when majority of - // safekeepers is down, so sync-safekeepers would hang otherwise. This - // could be a separate flag though. - self.wait_for_compute_ctl_to_exit(destroy)?; + // If destroying or stop mode is immediate, send it SIGTERM before + // waiting. Sometimes we do *not* want this cleanup: tests intentionally + // do stop when majority of safekeepers is down, so sync-safekeepers + // would hang otherwise. This could be a separate flag though. + let send_sigterm = destroy || mode == "immediate"; + self.wait_for_compute_ctl_to_exit(send_sigterm)?; if destroy { println!( "Destroying postgres data directory '{}'", diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 6634274d2a..74caba2b56 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -27,7 +27,7 @@ use crate::pageserver::PageServerNode; use crate::pageserver::PAGESERVER_REMOTE_STORAGE_DIR; use crate::safekeeper::SafekeeperNode; -pub const DEFAULT_PG_VERSION: u32 = 15; +pub const DEFAULT_PG_VERSION: u32 = 16; // // This data structures represents neon_local CLI config @@ -151,23 +151,38 @@ pub struct NeonBroker { pub struct NeonStorageControllerConf { /// Heartbeat timeout before marking a node offline #[serde(with = "humantime_serde")] - pub max_unavailable: Duration, + pub max_offline: Duration, + + #[serde(with = "humantime_serde")] + pub max_warming_up: Duration, + + pub start_as_candidate: bool, + + /// Database url used when running 
multiple storage controller instances + pub database_url: Option, /// Threshold for auto-splitting a tenant into shards pub split_threshold: Option, + + pub max_secondary_lag_bytes: Option, } impl NeonStorageControllerConf { // Use a shorter pageserver unavailability interval than the default to speed up tests. - const DEFAULT_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = - std::time::Duration::from_secs(10); + const DEFAULT_MAX_OFFLINE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10); + + const DEFAULT_MAX_WARMING_UP_INTERVAL: std::time::Duration = std::time::Duration::from_secs(30); } impl Default for NeonStorageControllerConf { fn default() -> Self { Self { - max_unavailable: Self::DEFAULT_MAX_UNAVAILABLE_INTERVAL, + max_offline: Self::DEFAULT_MAX_OFFLINE_INTERVAL, + max_warming_up: Self::DEFAULT_MAX_WARMING_UP_INTERVAL, + start_as_candidate: false, + database_url: None, split_threshold: None, + max_secondary_lag_bytes: None, } } } @@ -325,11 +340,16 @@ impl LocalEnv { } } - pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result { - Ok(self.pg_distrib_dir(pg_version)?.join("bin")) + pub fn pg_dir(&self, pg_version: u32, dir_name: &str) -> anyhow::Result { + Ok(self.pg_distrib_dir(pg_version)?.join(dir_name)) } + + pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result { + self.pg_dir(pg_version, "bin") + } + pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result { - Ok(self.pg_distrib_dir(pg_version)?.join("lib")) + self.pg_dir(pg_version, "lib") } pub fn pageserver_bin(&self) -> PathBuf { @@ -379,6 +399,36 @@ impl LocalEnv { } } + /// Inspect the base data directory and extract the instance id and instance directory path + /// for all storage controller instances + pub async fn storage_controller_instances(&self) -> std::io::Result> { + let mut instances = Vec::default(); + + let dir = std::fs::read_dir(self.base_data_dir.clone())?; + for dentry in dir { + let dentry = dentry?; + let is_dir = dentry.metadata()?.is_dir(); + let 
filename = dentry.file_name().into_string().unwrap(); + let parsed_instance_id = match filename.strip_prefix("storage_controller_") { + Some(suffix) => suffix.parse::().ok(), + None => None, + }; + + let is_instance_dir = is_dir && parsed_instance_id.is_some(); + + if !is_instance_dir { + continue; + } + + instances.push(( + parsed_instance_id.expect("Checked previously"), + dentry.path(), + )); + } + + Ok(instances) + } + pub fn register_branch_mapping( &mut self, branch_name: String, @@ -504,7 +554,6 @@ impl LocalEnv { #[derive(serde::Serialize, serde::Deserialize)] // (allow unknown fields, unlike PageServerConf) struct PageserverConfigTomlSubset { - id: NodeId, listen_pg_addr: String, listen_http_addr: String, pg_auth_type: AuthType, @@ -516,18 +565,30 @@ impl LocalEnv { .with_context(|| format!("read {:?}", config_toml_path))?, ) .context("parse pageserver.toml")?; + let identity_toml_path = dentry.path().join("identity.toml"); + #[derive(serde::Serialize, serde::Deserialize)] + struct IdentityTomlSubset { + id: NodeId, + } + let identity_toml: IdentityTomlSubset = toml_edit::de::from_str( + &std::fs::read_to_string(&identity_toml_path) + .with_context(|| format!("read {:?}", identity_toml_path))?, + ) + .context("parse identity.toml")?; let PageserverConfigTomlSubset { - id: config_toml_id, listen_pg_addr, listen_http_addr, pg_auth_type, http_auth_type, } = config_toml; + let IdentityTomlSubset { + id: identity_toml_id, + } = identity_toml; let conf = PageServerConf { id: { anyhow::ensure!( - config_toml_id == id, - "id mismatch: config_toml.id={config_toml_id} id={id}", + identity_toml_id == id, + "id mismatch: identity.toml:id={identity_toml_id} pageserver_(.*) id={id}", ); id }, diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 13e684da24..399b1c2653 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -1,8 +1,10 @@ //! Code to manage pageservers //! -//! 
In the local test environment, the pageserver stores its data directly in +//! In the local test environment, the data for each pageserver is stored in //! -//! .neon/ +//! ```text +//! .neon/pageserver_ +//! ``` //! use std::collections::HashMap; @@ -15,16 +17,15 @@ use std::time::Duration; use anyhow::{bail, Context}; use camino::Utf8PathBuf; -use futures::SinkExt; use pageserver_api::models::{ - self, AuxFilePolicy, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, - TimelineInfo, + self, AuxFilePolicy, LocationConfig, TenantHistorySize, TenantInfo, TimelineInfo, }; use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; use postgres_backend::AuthType; use postgres_connection::{parse_host_port, PgConnectionConfig}; use utils::auth::{Claims, Scope}; +use utils::id::NodeId; use utils::{ id::{TenantId, TimelineId}, lsn::Lsn, @@ -74,6 +75,10 @@ impl PageServerNode { } } + fn pageserver_make_identity_toml(&self, node_id: NodeId) -> toml_edit::Document { + toml_edit::Document::from_str(&format!("id={node_id}")).unwrap() + } + fn pageserver_init_make_toml( &self, conf: NeonLocalInitPageserverConf, @@ -122,10 +127,13 @@ impl PageServerNode { } // Apply the user-provided overrides - overrides.push( - toml_edit::ser::to_string_pretty(&conf) - .expect("we deserialized this from toml earlier"), - ); + overrides.push({ + let mut doc = + toml_edit::ser::to_document(&conf).expect("we deserialized this from toml earlier"); + // `id` is written out to `identity.toml` instead of `pageserver.toml` + doc.remove("id").expect("it's part of the struct"); + doc.to_string() + }); // Turn `overrides` into a toml document. // TODO: above code is legacy code, it should be refactored to use toml_edit directly. 
@@ -158,8 +166,8 @@ impl PageServerNode { .expect("non-Unicode path") } - pub async fn start(&self) -> anyhow::Result<()> { - self.start_node().await + pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> { + self.start_node(retry_timeout).await } fn pageserver_init(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> { @@ -186,6 +194,19 @@ impl PageServerNode { .write_all(config.to_string().as_bytes()) .context("write pageserver toml")?; drop(config_file); + + let identity_file_path = datadir.join("identity.toml"); + let mut identity_file = std::fs::OpenOptions::new() + .create_new(true) + .write(true) + .open(identity_file_path) + .with_context(|| format!("open identity toml for write: {config_file_path:?}"))?; + let identity_toml = self.pageserver_make_identity_toml(node_id); + identity_file + .write_all(identity_toml.to_string().as_bytes()) + .context("write identity toml")?; + drop(identity_toml); + // TODO: invoke a TBD config-check command to validate that pageserver will start with the written config // Write metadata file, used by pageserver on startup to register itself with @@ -214,14 +235,15 @@ impl PageServerNode { Ok(()) } - async fn start_node(&self) -> anyhow::Result<()> { + async fn start_node(&self, retry_timeout: &Duration) -> anyhow::Result<()> { // TODO: using a thread here because start_process() is not async but we need to call check_status() let datadir = self.repo_path(); print!( - "Starting pageserver node {} at '{}' in {:?}", + "Starting pageserver node {} at '{}' in {:?}, retrying for {:?}", self.conf.id, self.pg_connection_config.raw_address(), - datadir + datadir, + retry_timeout ); io::stdout().flush().context("flush stdout")?; @@ -239,6 +261,7 @@ impl PageServerNode { args, self.pageserver_env_variables()?, background_process::InitialPidFile::Expect(self.pid_file()), + retry_timeout, || async { let st = self.check_status().await; match st { @@ -349,11 +372,6 @@ impl PageServerNode { .map(|x| 
x.parse::()) .transpose() .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?, - trace_read_requests: settings - .remove("trace_read_requests") - .map(|x| x.parse::()) - .transpose() - .context("Failed to parse 'trace_read_requests' as bool")?, eviction_policy: settings .remove("eviction_policy") .map(serde_json::from_str) @@ -395,28 +413,6 @@ impl PageServerNode { } } - pub async fn tenant_create( - &self, - new_tenant_id: TenantId, - generation: Option, - settings: HashMap<&str, &str>, - ) -> anyhow::Result { - let config = Self::parse_config(settings.clone())?; - - let request = models::TenantCreateRequest { - new_tenant_id: TenantShardId::unsharded(new_tenant_id), - generation, - config, - shard_parameters: ShardParameters::default(), - // Placement policy is not meaningful for creations not done via storage controller - placement_policy: None, - }; - if !settings.is_empty() { - bail!("Unrecognized tenant settings: {settings:?}") - } - Ok(self.http_client.tenant_create(&request).await?) - } - pub async fn tenant_config( &self, tenant_id: TenantId, @@ -476,11 +472,6 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?, - trace_read_requests: settings - .remove("trace_read_requests") - .map(|x| x.parse::()) - .transpose() - .context("Failed to parse 'trace_read_requests' as bool")?, eviction_policy: settings .remove("eviction_policy") .map(serde_json::from_str) @@ -587,60 +578,39 @@ impl PageServerNode { pg_wal: Option<(Lsn, PathBuf)>, pg_version: u32, ) -> anyhow::Result<()> { - let (client, conn) = self.page_server_psql_client().await?; - // The connection object performs the actual communication with the database, - // so spawn it off to run on its own. 
- tokio::spawn(async move { - if let Err(e) = conn.await { - eprintln!("connection error: {}", e); - } - }); - let client = std::pin::pin!(client); - // Init base reader let (start_lsn, base_tarfile_path) = base; let base_tarfile = tokio::fs::File::open(base_tarfile_path).await?; - let base_tarfile = tokio_util::io::ReaderStream::new(base_tarfile); + let base_tarfile = + mgmt_api::ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(base_tarfile)); // Init wal reader if necessary let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal { let wal_tarfile = tokio::fs::File::open(wal_tarfile_path).await?; - let wal_reader = tokio_util::io::ReaderStream::new(wal_tarfile); + let wal_reader = + mgmt_api::ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(wal_tarfile)); (end_lsn, Some(wal_reader)) } else { (start_lsn, None) }; - let copy_in = |reader, cmd| { - let client = &client; - async move { - let writer = client.copy_in(&cmd).await?; - let writer = std::pin::pin!(writer); - let mut writer = writer.sink_map_err(|e| { - std::io::Error::new(std::io::ErrorKind::Other, format!("{e}")) - }); - let mut reader = std::pin::pin!(reader); - writer.send_all(&mut reader).await?; - writer.into_inner().finish().await?; - anyhow::Ok(()) - } - }; - // Import base - copy_in( - base_tarfile, - format!( - "import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}" - ), - ) - .await?; - // Import wal if necessary - if let Some(wal_reader) = wal_reader { - copy_in( - wal_reader, - format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}"), + self.http_client + .import_basebackup( + tenant_id, + timeline_id, + start_lsn, + end_lsn, + pg_version, + base_tarfile, ) .await?; + + // Import wal if necessary + if let Some(wal_reader) = wal_reader { + self.http_client + .import_wal(tenant_id, timeline_id, start_lsn, end_lsn, wal_reader) + .await?; } Ok(()) diff --git a/control_plane/src/safekeeper.rs 
b/control_plane/src/safekeeper.rs index 4a320ce53d..a0a73f5609 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -7,6 +7,7 @@ //! ``` use std::io::Write; use std::path::PathBuf; +use std::time::Duration; use std::{io, result}; use anyhow::Context; @@ -111,11 +112,16 @@ impl SafekeeperNode { .expect("non-Unicode path") } - pub async fn start(&self, extra_opts: Vec) -> anyhow::Result<()> { + pub async fn start( + &self, + extra_opts: Vec, + retry_timeout: &Duration, + ) -> anyhow::Result<()> { print!( - "Starting safekeeper at '{}' in '{}'", + "Starting safekeeper at '{}' in '{}', retrying for {:?}", self.pg_connection_config.raw_address(), - self.datadir_path().display() + self.datadir_path().display(), + retry_timeout, ); io::stdout().flush().unwrap(); @@ -200,6 +206,7 @@ impl SafekeeperNode { &args, self.safekeeper_env_variables()?, background_process::InitialPidFile::Expect(self.pid_file()), + retry_timeout, || async { match self.check_status().await { Ok(()) => Ok(true), diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 4f9f0ba794..27d8e2de0c 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -3,14 +3,16 @@ use crate::{ local_env::{LocalEnv, NeonStorageControllerConf}, }; use camino::{Utf8Path, Utf8PathBuf}; +use hyper::Uri; +use nix::unistd::Pid; use pageserver_api::{ controller_api::{ - NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, TenantLocateResponse, - TenantShardMigrateRequest, TenantShardMigrateResponse, + NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest, + TenantCreateResponse, TenantLocateResponse, TenantShardMigrateRequest, + TenantShardMigrateResponse, }, models::{ - TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse, - TimelineCreateRequest, TimelineInfo, + TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, 
TimelineInfo, }, shard::{ShardStripeSize, TenantShardId}, }; @@ -18,7 +20,7 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt; use postgres_backend::AuthType; use reqwest::Method; use serde::{de::DeserializeOwned, Deserialize, Serialize}; -use std::{fs, str::FromStr}; +use std::{fs, net::SocketAddr, path::PathBuf, str::FromStr, sync::OnceLock}; use tokio::process::Command; use tracing::instrument; use url::Url; @@ -29,19 +31,52 @@ use utils::{ pub struct StorageController { env: LocalEnv, - listen: String, - path: Utf8PathBuf, private_key: Option>, public_key: Option, - postgres_port: u16, client: reqwest::Client, config: NeonStorageControllerConf, + + // The listen addresses is learned when starting the storage controller, + // hence the use of OnceLock to init it at the right time. + listen: OnceLock, } const COMMAND: &str = "storage_controller"; const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16; +const DB_NAME: &str = "storage_controller"; + +pub struct NeonStorageControllerStartArgs { + pub instance_id: u8, + pub base_port: Option, + pub start_timeout: humantime::Duration, +} + +impl NeonStorageControllerStartArgs { + pub fn with_default_instance_id(start_timeout: humantime::Duration) -> Self { + Self { + instance_id: 1, + base_port: None, + start_timeout, + } + } +} + +pub struct NeonStorageControllerStopArgs { + pub instance_id: u8, + pub immediate: bool, +} + +impl NeonStorageControllerStopArgs { + pub fn with_default_instance_id(immediate: bool) -> Self { + Self { + instance_id: 1, + immediate, + } + } +} + #[derive(Serialize, Deserialize)] pub struct AttachHookRequest { pub tenant_shard_id: TenantShardId, @@ -66,27 +101,6 @@ pub struct InspectResponse { impl StorageController { pub fn from_env(env: &LocalEnv) -> Self { - let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone()) - .unwrap() - .join("attachments.json"); - - // Makes no sense to construct this if pageservers aren't going to use it: assume - // pageservers have control plane 
API set - let listen_url = env.control_plane_api.clone().unwrap(); - - let listen = format!( - "{}:{}", - listen_url.host_str().unwrap(), - listen_url.port().unwrap() - ); - - // Convention: NeonEnv in python tests reserves the next port after the control_plane_api - // port, for use by our captive postgres. - let postgres_port = listen_url - .port() - .expect("Control plane API setting should always have a port") - + 1; - // Assume all pageservers have symmetric auth configuration: this service // expects to use one JWT token to talk to all of them. let ps_conf = env @@ -129,21 +143,28 @@ impl StorageController { Self { env: env.clone(), - path, - listen, private_key, public_key, - postgres_port, client: reqwest::ClientBuilder::new() .build() .expect("Failed to construct http client"), config: env.storage_controller.clone(), + listen: OnceLock::default(), } } - fn pid_file(&self) -> Utf8PathBuf { - Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("storage_controller.pid")) - .expect("non-Unicode path") + fn storage_controller_instance_dir(&self, instance_id: u8) -> PathBuf { + self.env + .base_data_dir + .join(format!("storage_controller_{}", instance_id)) + } + + fn pid_file(&self, instance_id: u8) -> Utf8PathBuf { + Utf8PathBuf::from_path_buf( + self.storage_controller_instance_dir(instance_id) + .join("storage_controller.pid"), + ) + .expect("non-Unicode path") } /// PIDFile for the postgres instance used to store storage controller state @@ -156,16 +177,16 @@ impl StorageController { .expect("non-Unicode path") } - /// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl` + /// Find the directory containing postgres subdirectories, such `bin` and `lib` /// /// This usually uses STORAGE_CONTROLLER_POSTGRES_VERSION of postgres, but will fall back /// to other versions if that one isn't found. Some automated tests create circumstances /// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`. 
- pub async fn get_pg_bin_dir(&self) -> anyhow::Result { + async fn get_pg_dir(&self, dir_name: &str) -> anyhow::Result { let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 15, 14]; for v in prefer_versions { - let path = Utf8PathBuf::from_path_buf(self.env.pg_bin_dir(v)?).unwrap(); + let path = Utf8PathBuf::from_path_buf(self.env.pg_dir(v, dir_name)?).unwrap(); if tokio::fs::try_exists(&path).await? { return Ok(path); } @@ -173,30 +194,38 @@ impl StorageController { // Fall through anyhow::bail!( - "Postgres binaries not found in {}", - self.env.pg_distrib_dir.display() + "Postgres directory '{}' not found in {}", + dir_name, + self.env.pg_distrib_dir.display(), ); } + pub async fn get_pg_bin_dir(&self) -> anyhow::Result { + self.get_pg_dir("bin").await + } + + pub async fn get_pg_lib_dir(&self) -> anyhow::Result { + self.get_pg_dir("lib").await + } + /// Readiness check for our postgres process - async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result { + async fn pg_isready(&self, pg_bin_dir: &Utf8Path, postgres_port: u16) -> anyhow::Result { let bin_path = pg_bin_dir.join("pg_isready"); - let args = ["-h", "localhost", "-p", &format!("{}", self.postgres_port)]; + let args = ["-h", "localhost", "-p", &format!("{}", postgres_port)]; let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?; Ok(exitcode.success()) } - /// Create our database if it doesn't exist, and run migrations. + /// Create our database if it doesn't exist /// /// This function is equivalent to the `diesel setup` command in the diesel CLI. We implement /// the same steps by hand to avoid imposing a dependency on installing diesel-cli for developers /// who just want to run `cargo neon_local` without knowing about diesel. 
/// /// Returns the database url - pub async fn setup_database(&self) -> anyhow::Result { - const DB_NAME: &str = "storage_controller"; - let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port); + pub async fn setup_database(&self, postgres_port: u16) -> anyhow::Result { + let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port); let pg_bin_dir = self.get_pg_bin_dir().await?; let createdb_path = pg_bin_dir.join("createdb"); @@ -205,7 +234,7 @@ impl StorageController { "-h", "localhost", "-p", - &format!("{}", self.postgres_port), + &format!("{}", postgres_port), DB_NAME, ]) .output() @@ -224,80 +253,211 @@ impl StorageController { Ok(database_url) } - pub async fn start(&self) -> anyhow::Result<()> { - // Start a vanilla Postgres process used by the storage controller for persistence. - let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone()) - .unwrap() - .join("storage_controller_db"); - let pg_bin_dir = self.get_pg_bin_dir().await?; - let pg_log_path = pg_data_path.join("postgres.log"); + pub async fn connect_to_database( + &self, + postgres_port: u16, + ) -> anyhow::Result<( + tokio_postgres::Client, + tokio_postgres::Connection, + )> { + tokio_postgres::Config::new() + .host("localhost") + .port(postgres_port) + // The user is the ambient operating system user name. + // That is an impurity which we want to fix in => TODO https://github.com/neondatabase/neon/issues/8400 + // + // Until we get there, use the ambient operating system user name. + // Recent tokio-postgres versions default to this if the user isn't specified. 
+ // But tokio-postgres fork doesn't have this upstream commit: + // https://github.com/sfackler/rust-postgres/commit/cb609be758f3fb5af537f04b584a2ee0cebd5e79 + // => we should rebase our fork => TODO https://github.com/neondatabase/neon/issues/8399 + .user(&whoami::username()) + .dbname(DB_NAME) + .connect(tokio_postgres::NoTls) + .await + .map_err(anyhow::Error::new) + } - if !tokio::fs::try_exists(&pg_data_path).await? { - // Initialize empty database - let initdb_path = pg_bin_dir.join("initdb"); - let mut child = Command::new(&initdb_path) - .args(["-D", pg_data_path.as_ref()]) - .spawn() - .expect("Failed to spawn initdb"); - let status = child.wait().await?; - if !status.success() { - anyhow::bail!("initdb failed with status {status}"); + pub async fn start(&self, start_args: NeonStorageControllerStartArgs) -> anyhow::Result<()> { + let instance_dir = self.storage_controller_instance_dir(start_args.instance_id); + if let Err(err) = tokio::fs::create_dir(&instance_dir).await { + if err.kind() != std::io::ErrorKind::AlreadyExists { + panic!("Failed to create instance dir {instance_dir:?}"); } + } + + let (listen, postgres_port) = { + if let Some(base_port) = start_args.base_port { + ( + format!("127.0.0.1:{base_port}"), + self.config + .database_url + .expect("--base-port requires NeonStorageControllerConf::database_url") + .port(), + ) + } else { + let listen_url = self.env.control_plane_api.clone().unwrap(); + + let listen = format!( + "{}:{}", + listen_url.host_str().unwrap(), + listen_url.port().unwrap() + ); + + (listen, listen_url.port().unwrap() + 1) + } + }; + + let socket_addr = listen + .parse() + .expect("listen address is a valid socket address"); + self.listen + .set(socket_addr) + .expect("StorageController::listen is only set here"); + + // Do we remove the pid file on stop? 
+ let pg_started = self.is_postgres_running().await?; + let pg_lib_dir = self.get_pg_lib_dir().await?; + + if !pg_started { + // Start a vanilla Postgres process used by the storage controller for persistence. + let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone()) + .unwrap() + .join("storage_controller_db"); + let pg_bin_dir = self.get_pg_bin_dir().await?; + let pg_log_path = pg_data_path.join("postgres.log"); + + if !tokio::fs::try_exists(&pg_data_path).await? { + // Initialize empty database + let initdb_path = pg_bin_dir.join("initdb"); + let mut child = Command::new(&initdb_path) + .envs(vec![ + ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ]) + .args(["-D", pg_data_path.as_ref()]) + .spawn() + .expect("Failed to spawn initdb"); + let status = child.wait().await?; + if !status.success() { + anyhow::bail!("initdb failed with status {status}"); + } + }; // Write a minimal config file: // - Specify the port, since this is chosen dynamically // - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing // the storage controller we don't want a slow local disk to interfere with that. + // + // NB: it's important that we rewrite this file on each start command so we propagate changes + // from `LocalEnv`'s config file (`.neon/config`). 
tokio::fs::write( &pg_data_path.join("postgresql.conf"), - format!("port = {}\nfsync=off\n", self.postgres_port), + format!("port = {}\nfsync=off\n", postgres_port), ) .await?; + + println!("Starting storage controller database..."); + let db_start_args = [ + "-w", + "-D", + pg_data_path.as_ref(), + "-l", + pg_log_path.as_ref(), + "start", + ]; + + background_process::start_process( + "storage_controller_db", + &self.env.base_data_dir, + pg_bin_dir.join("pg_ctl").as_std_path(), + db_start_args, + vec![ + ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ], + background_process::InitialPidFile::Create(self.postgres_pid_file()), + &start_args.start_timeout, + || self.pg_isready(&pg_bin_dir, postgres_port), + ) + .await?; + + self.setup_database(postgres_port).await?; + } + + let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port); + + // We support running a startup SQL script to fiddle with the database before we launch storcon. + // This is used by the test suite. 
+ let startup_script_path = self + .env + .base_data_dir + .join("storage_controller_db.startup.sql"); + let startup_script = match tokio::fs::read_to_string(&startup_script_path).await { + Ok(script) => { + tokio::fs::remove_file(startup_script_path).await?; + script + } + Err(e) => { + if e.kind() == std::io::ErrorKind::NotFound { + // always run some startup script so that this code path doesn't bit rot + "BEGIN; COMMIT;".to_string() + } else { + anyhow::bail!("Failed to read startup script: {e}") + } + } }; + let (mut client, conn) = self.connect_to_database(postgres_port).await?; + let conn = tokio::spawn(conn); + let tx = client.build_transaction(); + let tx = tx.start().await?; + tx.batch_execute(&startup_script).await?; + tx.commit().await?; + drop(client); + conn.await??; - println!("Starting storage controller database..."); - let db_start_args = [ - "-w", - "-D", - pg_data_path.as_ref(), - "-l", - pg_log_path.as_ref(), - "start", - ]; - - background_process::start_process( - "storage_controller_db", - &self.env.base_data_dir, - pg_bin_dir.join("pg_ctl").as_std_path(), - db_start_args, - [], - background_process::InitialPidFile::Create(self.postgres_pid_file()), - || self.pg_isready(&pg_bin_dir), - ) - .await?; - - // Run migrations on every startup, in case something changed. 
- let database_url = self.setup_database().await?; + let listen = self + .listen + .get() + .expect("cell is set earlier in this function"); + let address_for_peers = Uri::builder() + .scheme("http") + .authority(format!("{}:{}", listen.ip(), listen.port())) + .path_and_query("") + .build() + .unwrap(); let mut args = vec![ "-l", - &self.listen, - "-p", - self.path.as_ref(), + &listen.to_string(), "--dev", "--database-url", &database_url, - "--max-unavailable-interval", - &humantime::Duration::from(self.config.max_unavailable).to_string(), + "--max-offline-interval", + &humantime::Duration::from(self.config.max_offline).to_string(), + "--max-warming-up-interval", + &humantime::Duration::from(self.config.max_warming_up).to_string(), + "--address-for-peers", + &address_for_peers.to_string(), ] .into_iter() .map(|s| s.to_string()) .collect::>(); + + if self.config.start_as_candidate { + args.push("--start-as-candidate".to_string()); + } + if let Some(private_key) = &self.private_key { let claims = Claims::new(None, Scope::PageServerApi); let jwt_token = encode_from_key_file(&claims, private_key).expect("failed to generate jwt token"); args.push(format!("--jwt-token={jwt_token}")); + + let peer_claims = Claims::new(None, Scope::Admin); + let peer_jwt_token = encode_from_key_file(&peer_claims, private_key) + .expect("failed to generate jwt token"); + args.push(format!("--peer-jwt-token={peer_jwt_token}")); } if let Some(public_key) = &self.public_key { @@ -314,6 +474,10 @@ impl StorageController { args.push(format!("--split-threshold={split_threshold}")) } + if let Some(lag) = self.config.max_secondary_lag_bytes.as_ref() { + args.push(format!("--max-secondary-lag-bytes={lag}")) + } + args.push(format!( "--neon-local-repo-dir={}", self.env.base_data_dir.display() @@ -321,11 +485,15 @@ impl StorageController { background_process::start_process( COMMAND, - &self.env.base_data_dir, + &instance_dir, &self.env.storage_controller_bin(), args, - [], - 
background_process::InitialPidFile::Create(self.pid_file()), + vec![ + ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ], + background_process::InitialPidFile::Create(self.pid_file(start_args.instance_id)), + &start_args.start_timeout, || async { match self.ready().await { Ok(_) => Ok(true), @@ -338,8 +506,35 @@ impl StorageController { Ok(()) } - pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> { - background_process::stop_process(immediate, COMMAND, &self.pid_file())?; + pub async fn stop(&self, stop_args: NeonStorageControllerStopArgs) -> anyhow::Result<()> { + background_process::stop_process( + stop_args.immediate, + COMMAND, + &self.pid_file(stop_args.instance_id), + )?; + + let storcon_instances = self.env.storage_controller_instances().await?; + for (instance_id, instanced_dir_path) in storcon_instances { + if instance_id == stop_args.instance_id { + continue; + } + + let pid_file = instanced_dir_path.join("storage_controller.pid"); + let pid = tokio::fs::read_to_string(&pid_file) + .await + .map_err(|err| { + anyhow::anyhow!("Failed to read storcon pid file at {pid_file:?}: {err}") + })? + .parse::() + .expect("pid is valid i32"); + + let other_proc_alive = !background_process::process_has_stopped(Pid::from_raw(pid))?; + if other_proc_alive { + // There is another storage controller instance running, so we return + // and leave the database running. + return Ok(()); + } + } let pg_data_path = self.env.base_data_dir.join("storage_controller_db"); let pg_bin_dir = self.get_pg_bin_dir().await?; @@ -352,27 +547,51 @@ impl StorageController { .wait() .await?; if !stop_status.success() { - let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"]; - let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl")) - .args(pg_status_args) - .spawn()? 
- .wait() - .await?; - - // pg_ctl status returns this exit code if postgres is not running: in this case it is - // fine that stop failed. Otherwise it is an error that stop failed. - const PG_STATUS_NOT_RUNNING: i32 = 3; - if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() { - println!("Storage controller database is already stopped"); - return Ok(()); - } else { - anyhow::bail!("Failed to stop storage controller database: {stop_status}") + match self.is_postgres_running().await { + Ok(false) => { + println!("Storage controller database is already stopped"); + return Ok(()); + } + Ok(true) => { + anyhow::bail!("Failed to stop storage controller database"); + } + Err(err) => { + anyhow::bail!("Failed to stop storage controller database: {err}"); + } } } Ok(()) } + async fn is_postgres_running(&self) -> anyhow::Result { + let pg_data_path = self.env.base_data_dir.join("storage_controller_db"); + let pg_bin_dir = self.get_pg_bin_dir().await?; + + let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"]; + let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl")) + .args(pg_status_args) + .spawn()? + .wait() + .await?; + + // pg_ctl status returns this exit code if postgres is not running: in this case it is + // fine that stop failed. Otherwise it is an error that stop failed. 
+ const PG_STATUS_NOT_RUNNING: i32 = 3; + const PG_NO_DATA_DIR: i32 = 4; + const PG_STATUS_RUNNING: i32 = 0; + match status_exitcode.code() { + Some(PG_STATUS_NOT_RUNNING) => Ok(false), + Some(PG_NO_DATA_DIR) => Ok(false), + Some(PG_STATUS_RUNNING) => Ok(true), + Some(code) => Err(anyhow::anyhow!( + "pg_ctl status returned unexpected status code: {:?}", + code + )), + None => Err(anyhow::anyhow!("pg_ctl status returned no status code")), + } + } + fn get_claims_for_path(path: &str) -> anyhow::Result> { let category = match path.find('/') { Some(idx) => &path[..idx], @@ -398,15 +617,31 @@ impl StorageController { RQ: Serialize + Sized, RS: DeserializeOwned + Sized, { - // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out - // for general purpose API access. - let listen_url = self.env.control_plane_api.clone().unwrap(); - let url = Url::from_str(&format!( - "http://{}:{}/{path}", - listen_url.host_str().unwrap(), - listen_url.port().unwrap() - )) - .unwrap(); + // In the special case of the `storage_controller start` subcommand, we wish + // to use the API endpoint of the newly started storage controller in order + // to pass the readiness check. In this scenario [`Self::listen`] will be set + // (see [`Self::start`]). + // + // Otherwise, we infer the storage controller api endpoint from the configured + // control plane API. + let url = if let Some(socket_addr) = self.listen.get() { + Url::from_str(&format!( + "http://{}:{}/{path}", + socket_addr.ip().to_canonical(), + socket_addr.port() + )) + .unwrap() + } else { + // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out + // for general purpose API access. 
+ let listen_url = self.env.control_plane_api.clone().unwrap(); + Url::from_str(&format!( + "http://{}:{}/{path}", + listen_url.host_str().unwrap(), + listen_url.port().unwrap() + )) + .unwrap() + }; let mut builder = self.client.request(method, url); if let Some(body) = body { @@ -555,6 +790,15 @@ impl StorageController { .await } + pub async fn node_list(&self) -> anyhow::Result> { + self.dispatch::<(), Vec>( + Method::GET, + "control/v1/node".to_string(), + None, + ) + .await + } + #[instrument(skip(self))] pub async fn ready(&self) -> anyhow::Result<()> { self.dispatch::<(), ()>(Method::GET, "ready".to_string(), None) diff --git a/control_plane/storcon_cli/Cargo.toml b/control_plane/storcon_cli/Cargo.toml index f96f0084b2..be69208d0d 100644 --- a/control_plane/storcon_cli/Cargo.toml +++ b/control_plane/storcon_cli/Cargo.toml @@ -17,6 +17,7 @@ pageserver_client.workspace = true reqwest.workspace = true serde.workspace = true serde_json = { workspace = true, features = ["raw_value"] } +storage_controller_client.workspace = true thiserror.workspace = true tokio.workspace = true tracing.workspace = true diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 7b48b75c21..e27491c1c8 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -1,28 +1,28 @@ use futures::StreamExt; -use std::{collections::HashMap, str::FromStr, time::Duration}; +use std::{str::FromStr, time::Duration}; use clap::{Parser, Subcommand}; use pageserver_api::{ controller_api::{ - NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy, + NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest, }, models::{ EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, - ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest, - TenantShardSplitRequest, TenantShardSplitResponse, + 
ShardParameters, TenantConfig, TenantConfigRequest, TenantShardSplitRequest, + TenantShardSplitResponse, }, shard::{ShardStripeSize, TenantShardId}, }; -use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt}; +use pageserver_client::mgmt_api::{self}; use reqwest::{Method, StatusCode, Url}; -use serde::{de::DeserializeOwned, Serialize}; use utils::id::{NodeId, TenantId}; use pageserver_api::controller_api::{ NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, - TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse, + TenantShardMigrateRequest, TenantShardMigrateResponse, }; +use storage_controller_client::control_api::Client; #[derive(Subcommand, Debug)] enum Command { @@ -56,6 +56,10 @@ enum Command { #[arg(long)] scheduling: Option, }, + NodeDelete { + #[arg(long)] + node_id: NodeId, + }, /// Modify a tenant's policies in the storage controller TenantPolicy { #[arg(long)] @@ -110,12 +114,6 @@ enum Command { #[arg(long)] config: String, }, - /// Attempt to balance the locations for a tenant across pageservers. This is a client-side - /// alternative to the storage controller's scheduling optimization behavior. - TenantScatter { - #[arg(long)] - tenant_id: TenantId, - }, /// Print details about a particular tenant, including all its shards' states. 
TenantDescribe { #[arg(long)] @@ -251,64 +249,6 @@ impl FromStr for NodeAvailabilityArg { } } -struct Client { - base_url: Url, - jwt_token: Option, - client: reqwest::Client, -} - -impl Client { - fn new(base_url: Url, jwt_token: Option) -> Self { - Self { - base_url, - jwt_token, - client: reqwest::ClientBuilder::new() - .build() - .expect("Failed to construct http client"), - } - } - - /// Simple HTTP request wrapper for calling into storage controller - async fn dispatch( - &self, - method: Method, - path: String, - body: Option, - ) -> mgmt_api::Result - where - RQ: Serialize + Sized, - RS: DeserializeOwned + Sized, - { - // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out - // for general purpose API access. - let url = Url::from_str(&format!( - "http://{}:{}/{path}", - self.base_url.host_str().unwrap(), - self.base_url.port().unwrap() - )) - .unwrap(); - - let mut builder = self.client.request(method, url); - if let Some(body) = body { - builder = builder.json(&body) - } - if let Some(jwt_token) = &self.jwt_token { - builder = builder.header( - reqwest::header::AUTHORIZATION, - format!("Bearer {jwt_token}"), - ); - } - - let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?; - let response = response.error_from_body().await?; - - response - .json() - .await - .map_err(pageserver_client::mgmt_api::Error::ReceiveBody) - } -} - #[tokio::main] async fn main() -> anyhow::Result<()> { let cli = Cli::parse(); @@ -342,14 +282,18 @@ async fn main() -> anyhow::Result<()> { .await?; } Command::TenantCreate { tenant_id } => { - vps_client - .tenant_create(&TenantCreateRequest { - new_tenant_id: TenantShardId::unsharded(tenant_id), - generation: None, - shard_parameters: ShardParameters::default(), - placement_policy: Some(PlacementPolicy::Attached(1)), - config: TenantConfig::default(), - }) + storcon_client + .dispatch::<_, ()>( + Method::POST, + "v1/tenant".to_string(), + Some(TenantCreateRequest { + 
new_tenant_id: TenantShardId::unsharded(tenant_id), + generation: None, + shard_parameters: ShardParameters::default(), + placement_policy: Some(PlacementPolicy::Attached(1)), + config: TenantConfig::default(), + }), + ) .await?; } Command::TenantDelete { tenant_id } => { @@ -359,13 +303,16 @@ async fn main() -> anyhow::Result<()> { tracing::info!("Delete status: {}", status); } Command::Nodes {} => { - let resp = storcon_client + let mut resp = storcon_client .dispatch::<(), Vec>( Method::GET, "control/v1/node".to_string(), None, ) .await?; + + resp.sort_by(|a, b| a.listen_http_addr.cmp(&b.listen_http_addr)); + let mut table = comfy_table::Table::new(); table.set_header(["Id", "Hostname", "Scheduling", "Availability"]); for node in resp { @@ -397,13 +344,16 @@ async fn main() -> anyhow::Result<()> { .await?; } Command::Tenants {} => { - let resp = storcon_client + let mut resp = storcon_client .dispatch::<(), Vec>( Method::GET, "control/v1/tenant".to_string(), None, ) .await?; + + resp.sort_by(|a, b| a.tenant_id.cmp(&b.tenant_id)); + let mut table = comfy_table::Table::new(); table.set_header([ "TenantId", @@ -498,88 +448,6 @@ async fn main() -> anyhow::Result<()> { }) .await?; } - Command::TenantScatter { tenant_id } => { - // Find the shards - let locate_response = storcon_client - .dispatch::<(), TenantLocateResponse>( - Method::GET, - format!("control/v1/tenant/{tenant_id}/locate"), - None, - ) - .await?; - let shards = locate_response.shards; - - let mut node_to_shards: HashMap> = HashMap::new(); - let shard_count = shards.len(); - for s in shards { - let entry = node_to_shards.entry(s.node_id).or_default(); - entry.push(s.shard_id); - } - - // Load list of available nodes - let nodes_resp = storcon_client - .dispatch::<(), Vec>( - Method::GET, - "control/v1/node".to_string(), - None, - ) - .await?; - - for node in nodes_resp { - if matches!(node.availability, NodeAvailabilityWrapper::Active) { - node_to_shards.entry(node.id).or_default(); - } - } - - let 
max_shard_per_node = shard_count / node_to_shards.len(); - - loop { - let mut migrate_shard = None; - for shards in node_to_shards.values_mut() { - if shards.len() > max_shard_per_node { - // Pick the emptiest - migrate_shard = Some(shards.pop().unwrap()); - } - } - let Some(migrate_shard) = migrate_shard else { - break; - }; - - // Pick the emptiest node to migrate to - let mut destinations = node_to_shards - .iter() - .map(|(k, v)| (k, v.len())) - .collect::>(); - destinations.sort_by_key(|i| i.1); - let (destination_node, destination_count) = *destinations.first().unwrap(); - if destination_count + 1 > max_shard_per_node { - // Even the emptiest destination doesn't have space: we're done - break; - } - let destination_node = *destination_node; - - node_to_shards - .get_mut(&destination_node) - .unwrap() - .push(migrate_shard); - - println!("Migrate {} -> {} ...", migrate_shard, destination_node); - - storcon_client - .dispatch::( - Method::PUT, - format!("control/v1/tenant/{migrate_shard}/migrate"), - Some(TenantShardMigrateRequest { - tenant_shard_id: migrate_shard, - node_id: destination_node, - }), - ) - .await?; - println!("Migrate {} -> {} OK", migrate_shard, destination_node); - } - - // Spread the shards across the nodes - } Command::TenantDescribe { tenant_id } => { let describe_response = storcon_client .dispatch::<(), TenantDescribeResponse>( @@ -734,6 +602,11 @@ async fn main() -> anyhow::Result<()> { .dispatch::<(), ()>(Method::POST, format!("debug/v1/node/{node_id}/drop"), None) .await?; } + Command::NodeDelete { node_id } => { + storcon_client + .dispatch::<(), ()>(Method::DELETE, format!("control/v1/node/{node_id}"), None) + .await?; + } Command::TenantSetTimeBasedEviction { tenant_id, period, @@ -749,6 +622,7 @@ async fn main() -> anyhow::Result<()> { threshold: threshold.into(), }, )), + heatmap_period: Some("300s".to_string()), ..Default::default() }, }) diff --git a/deny.toml b/deny.toml index 469609c496..327ac58db7 100644 --- a/deny.toml +++ 
b/deny.toml @@ -4,6 +4,7 @@ # to your expectations and requirements. # Root options +[graph] targets = [ { triple = "x86_64-unknown-linux-gnu" }, { triple = "aarch64-unknown-linux-gnu" }, @@ -12,6 +13,7 @@ targets = [ ] all-features = false no-default-features = false +[output] feature-depth = 1 # This section is considered when running `cargo deny check advisories` @@ -19,17 +21,16 @@ feature-depth = 1 # https://embarkstudios.github.io/cargo-deny/checks/advisories/cfg.html [advisories] db-urls = ["https://github.com/rustsec/advisory-db"] -vulnerability = "deny" -unmaintained = "warn" yanked = "warn" -notice = "warn" -ignore = [] + +[[advisories.ignore]] +id = "RUSTSEC-2023-0071" +reason = "the marvin attack only affects private key decryption, not public key signature verification" # This section is considered when running `cargo deny check licenses` # More documentation for the licenses section can be found here: # https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html [licenses] -unlicensed = "deny" allow = [ "Apache-2.0", "Artistic-2.0", @@ -42,10 +43,6 @@ allow = [ "OpenSSL", "Unicode-DFS-2016", ] -deny = [] -copyleft = "warn" -allow-osi-fsf-free = "neither" -default = "deny" confidence-threshold = 0.8 exceptions = [ # Zlib license has some restrictions if we decide to change sth diff --git a/docker-compose/README.md b/docker-compose/README.md new file mode 100644 index 0000000000..bd47805a67 --- /dev/null +++ b/docker-compose/README.md @@ -0,0 +1,10 @@ + +# Example docker compose configuration + +The configuration in this directory is used for testing Neon docker images: it is +not intended for deploying a usable system. To run a development environment where +you can experiment with a minature Neon system, use `cargo neon` rather than container images. + +This configuration does not start the storage controller, because the controller +needs a way to reconfigure running computes, and no such thing exists in this setup. 
+ diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh index 22660a63ce..33455e458a 100755 --- a/docker-compose/compute_wrapper/shell/compute.sh +++ b/docker-compose/compute_wrapper/shell/compute.sh @@ -23,18 +23,17 @@ echo "Page server is ready." echo "Create a tenant and timeline" generate_id tenant_id PARAMS=( - -sb - -X POST + -X PUT -H "Content-Type: application/json" - -d "{\"new_tenant_id\": \"${tenant_id}\"}" - http://pageserver:9898/v1/tenant/ + -d "{\"mode\": \"AttachedSingle\", \"generation\": 1, \"tenant_conf\": {}}" + "http://pageserver:9898/v1/tenant/${tenant_id}/location_config" ) result=$(curl "${PARAMS[@]}") echo $result | jq . generate_id timeline_id PARAMS=( - -sb + -sbf -X POST -H "Content-Type: application/json" -d "{\"new_timeline_id\": \"${timeline_id}\", \"pg_version\": ${PG_VERSION}}" diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index 5503b6611a..6e15fdbe0d 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -31,25 +31,14 @@ services: restart: always image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest} environment: - - BROKER_ENDPOINT='http://storage_broker:50051' - AWS_ACCESS_KEY_ID=minio - AWS_SECRET_ACCESS_KEY=password #- RUST_BACKTRACE=1 ports: #- 6400:6400 # pg protocol handler - 9898:9898 # http endpoints - entrypoint: - - "/bin/sh" - - "-c" - command: - - "/usr/local/bin/pageserver -D /data/.neon/ - -c \"broker_endpoint=$$BROKER_ENDPOINT\" - -c \"listen_pg_addr='0.0.0.0:6400'\" - -c \"listen_http_addr='0.0.0.0:9898'\" - -c \"remote_storage={endpoint='http://minio:9000', - bucket_name='neon', - bucket_region='eu-north-1', - prefix_in_bucket='/pageserver/'}\"" + volumes: + - ./pageserver_config:/data/.neon/ depends_on: - storage_broker - minio_create_buckets diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index a00591afd0..10805a9952 100755 --- 
a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -78,7 +78,7 @@ for pg_version in 14 15 16; do docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/ rm -rf $TMPDIR # We are running tests now - if docker exec -e SKIP=rum-src,timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \ + if docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \ $TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt then cleanup diff --git a/docker-compose/pageserver_config/identity.toml b/docker-compose/pageserver_config/identity.toml new file mode 100644 index 0000000000..20121327c7 --- /dev/null +++ b/docker-compose/pageserver_config/identity.toml @@ -0,0 +1 @@ +id=1234 diff --git a/docker-compose/pageserver_config/pageserver.toml b/docker-compose/pageserver_config/pageserver.toml new file mode 100644 index 0000000000..76935453b6 --- /dev/null +++ b/docker-compose/pageserver_config/pageserver.toml @@ -0,0 +1,5 @@ +broker_endpoint='http://storage_broker:50051' +pg_distrib_dir='/usr/local/' +listen_pg_addr='0.0.0.0:6400' +listen_http_addr='0.0.0.0:9898' +remote_storage={ endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/pageserver' } diff --git a/docker-compose/run-tests.sh b/docker-compose/run-tests.sh index c05fc159aa..58b2581197 100644 --- a/docker-compose/run-tests.sh +++ b/docker-compose/run-tests.sh @@ -1,15 +1,15 @@ #!/bin/bash set -x -cd /ext-src +cd /ext-src || exit 2 FAILED= -LIST=$((echo ${SKIP} | sed 's/,/\n/g'; ls -d *-src) | sort | uniq -u) +LIST=$( (echo "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u) for d in ${LIST} do - [ -d ${d} ] || continue + [ -d "${d}" ] || continue psql -c "select 1" >/dev/null || break - make -C ${d} installcheck || FAILED="${d} ${FAILED}" + USE_PGXS=1 
make -C "${d}" installcheck || FAILED="${d} ${FAILED}" done [ -z "${FAILED}" ] && exit 0 -echo ${FAILED} +echo "${FAILED}" exit 1 \ No newline at end of file diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md index b275349168..5fd4080c28 100644 --- a/docs/SUMMARY.md +++ b/docs/SUMMARY.md @@ -1,13 +1,18 @@ # Summary +# Looking for `neon.tech` docs? + +This page links to a selection of technical content about the open source code in this repository. + +Please visit https://neon.tech/docs for documentation about using the Neon service, which is based on the code +in this repository. + +# Architecture + [Introduction]() - [Separation of Compute and Storage](./separation-compute-storage.md) -# Architecture - - [Compute]() - - [WAL proposer]() - - [WAL Backpressure]() - [Postgres changes](./core_changes.md) - [Pageserver](./pageserver.md) @@ -16,33 +21,15 @@ - [WAL Redo](./pageserver-walredo.md) - [Page cache](./pageserver-pagecache.md) - [Storage](./pageserver-storage.md) - - [Datadir mapping]() - - [Layer files]() - - [Branching]() - - [Garbage collection]() - - [Cloud Storage]() - [Processing a GetPage request](./pageserver-processing-getpage.md) - [Processing WAL](./pageserver-processing-wal.md) - - [Management API]() - - [Tenant Rebalancing]() - [WAL Service](walservice.md) - [Consensus protocol](safekeeper-protocol.md) - - [Management API]() - - [Rebalancing]() - -- [Control Plane]() - -- [Proxy]() - [Source view](./sourcetree.md) - [docker.md](./docker.md) — Docker images and building pipeline.
- [Error handling and logging](./error-handling.md) - - [Testing]() - - [Unit testing]() - - [Integration testing]() - - [Benchmarks]() - - [Glossary](./glossary.md) @@ -58,28 +45,6 @@ # RFCs -- [RFCs](./rfcs/README.md) - -- [002-storage](rfcs/002-storage.md) -- [003-laptop-cli](rfcs/003-laptop-cli.md) -- [004-durability](rfcs/004-durability.md) -- [005-zenith_local](rfcs/005-zenith_local.md) -- [006-laptop-cli-v2-CLI](rfcs/006-laptop-cli-v2-CLI.md) -- [006-laptop-cli-v2-repository-structure](rfcs/006-laptop-cli-v2-repository-structure.md) -- [007-serverless-on-laptop](rfcs/007-serverless-on-laptop.md) -- [008-push-pull](rfcs/008-push-pull.md) -- [009-snapshot-first-storage-cli](rfcs/009-snapshot-first-storage-cli.md) -- [009-snapshot-first-storage](rfcs/009-snapshot-first-storage.md) -- [009-snapshot-first-storage-pitr](rfcs/009-snapshot-first-storage-pitr.md) -- [010-storage_details](rfcs/010-storage_details.md) -- [011-retention-policy](rfcs/011-retention-policy.md) -- [012-background-tasks](rfcs/012-background-tasks.md) -- [013-term-history](rfcs/013-term-history.md) -- [014-safekeepers-gossip](rfcs/014-safekeepers-gossip.md) -- [014-storage-lsm](rfcs/014-storage-lsm.md) -- [015-storage-messaging](rfcs/015-storage-messaging.md) -- [016-connection-routing](rfcs/016-connection-routing.md) -- [017-timeline-data-management](rfcs/017-timeline-data-management.md) -- [018-storage-messaging-2](rfcs/018-storage-messaging-2.md) -- [019-tenant-timeline-lifecycles](rfcs/019-tenant-timeline-lifecycles.md) -- [cluster-size-limits](rfcs/cluster-size-limits.md) +Major changes are documented in RFCS: +- See [RFCs](./rfcs/README.md) for more information +- view the RFCs at https://github.com/neondatabase/neon/tree/main/docs/rfcs diff --git a/docs/pageserver-pagecache.md b/docs/pageserver-pagecache.md index d9b120bbb9..d022742dff 100644 --- a/docs/pageserver-pagecache.md +++ b/docs/pageserver-pagecache.md @@ -5,4 +5,3 @@ TODO: - shared across tenants - store pages from layer 
files - store pages from "in-memory layer" -- store materialized pages diff --git a/docs/rfcs/033-storage-controller-drain-and-fill.md b/docs/rfcs/033-storage-controller-drain-and-fill.md new file mode 100644 index 0000000000..733f7c0bd8 --- /dev/null +++ b/docs/rfcs/033-storage-controller-drain-and-fill.md @@ -0,0 +1,345 @@ +# Graceful Restarts of Storage Controller Managed Clusters + +## Summary +This RFC describes new storage controller APIs for draining and filling tenant shards from/on pageserver nodes. +It also covers how these new APIs should be used by an orchestrator (e.g. Ansible) in order to implement +graceful cluster restarts. + +## Motivation + +Pageserver restarts cause read availability downtime for tenants. + +For example pageserver-3 @ us-east-1 was unavailable for a randomly +picked tenant (which requested on-demand activation) for around 30 seconds +during the restart at 2024-04-03 16:37 UTC. + +Note that lots of shutdowns on loaded pageservers do not finish within the +[10 second systemd enforced timeout](https://github.com/neondatabase/infra/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers +and have to reingest data in order to serve requests after restarting, potentially making first request latencies worse. + +This problem is not yet very acutely felt in storage controller managed pageservers since +tenant density is much lower there. However, we are planning on eventually migrating all +pageservers to storage controller management, so it makes sense to solve the issue proactively. + +## Requirements + +- Pageserver re-deployments cause minimal downtime for tenants +- The storage controller exposes HTTP API hooks for draining and filling tenant shards +from a given pageserver. Said hooks can be used by an orchestrator process or a human operator.
+- The storage controller exposes some HTTP API to cancel draining and filling background operations. +- Failures to drain or fill the node should not be fatal. In such cases, cluster restarts should proceed +as usual (with downtime). +- Progress of draining/filling is visible through metrics + +## Non Goals + +- Integration with the control plane +- Graceful restarts for large non-HA tenants. + +## Impacted Components + +- storage controller +- deployment orchestrator (i.e. Ansible) +- pageserver (indirectly) + +## Terminology + +**Draining** is the process through which all tenant shards that can be migrated from a given pageserver +are distributed across the rest of the cluster. + +**Filling** is the symmetric opposite of draining. In this process tenant shards are migrated onto a given +pageserver until the cluster reaches a reasonable, quiescent distribution of tenant shards across pageservers. + +**Node scheduling policies** act as constraints to the scheduler. For instance, when a +node is set in the `Pause` policy, no further shards will be scheduled on it. + +**Node** is a pageserver. The terms are used interchangeably in this RFC. + +**Deployment orchestrator** is a generic term for whatever drives our deployments. +Currently, it's an Ansible playbook. + +## Background + +### Storage Controller Basics (skip if already familiar) + +Fundamentally, the storage controller is a reconciler which aims to move from the observed mapping between pageservers and tenant shards to an intended mapping. Pageserver nodes and tenant shards metadata is durably persisted in a database, but note that the mapping between the two entities is not durably persisted. Instead, this mapping (*observed state*) is constructed at startup by sending `GET location_config` requests to registered pageservers. + +An internal scheduler maps tenant shards to pageservers while respecting certain constraints. The result of scheduling is the *intent state*.
When the intent state changes, a *reconciliation* will inform pageservers about the new assignment via `PUT location_config` requests and will notify the compute via the configured hook. + +### Background Optimizations + +The storage controller performs scheduling optimizations in the background. It will +migrate attachments to warm secondaries and replace secondaries in order to balance +the cluster out. + +### Reconciliations Concurrency Limiting + +There's a hard limit on the number of reconciles that the storage controller +can have in flight at any given time. To get an idea of scales, the limit is +128 at the time of writing. + +## Implementation + +Note: this section focuses on the core functionality of the graceful restart process. +It doesn't necessarily describe the most efficient approach. Optimizations are described +separately in a later section. + +### Overall Flow + +This section describes how to implement graceful restarts from the perspective +of Ansible, the deployment orchestrator. Pageservers are already restarted sequentially. +The orchestrator shall implement the following epilogue and prologue steps for each +pageserver restart: + +#### Prologue + +The orchestrator shall first fetch the pageserver node id from the control plane or +the pageserver it aims to restart directly. Next, it issues an HTTP request +to the storage controller in order to start the drain of said pageserver node. +All error responses are retried with a short back-off. When a 202 (Accepted) +HTTP code is returned, the drain has started. Now the orchestrator polls the +node status endpoint exposed by the storage controller in order to await the +end of the drain process. When the `policy` field of the node status response +becomes `PauseForRestart`, the drain has completed and the orchestrator can +proceed with restarting the pageserver. + +The prologue is subject to an overall timeout. It will have a value in the ballpark +of minutes.
As storage controller managed pageservers become more loaded this timeout +will likely have to increase. + +#### Epilogue + +After restarting the pageserver, the orchestrator issues an HTTP request +to the storage controller to kick off the filling process. This API call +may be retried for all error codes with a short backoff. This also serves +as a synchronization primitive as the fill will be refused if the pageserver +has not yet re-attached to the storage controller. When a 202 (Accepted) HTTP +code is returned, the fill has started. Now the orchestrator polls the node +status endpoint exposed by the storage controller in order to await the end of +the filling process. When the `policy` field of the node status response becomes +`Active`, the fill has completed and the orchestrator may proceed to the next pageserver. + +Again, the epilogue is subject to an overall timeout. We can start off with +using the same timeout as for the prologue, but can also consider relying on +the storage controller's background optimizations with a shorter timeout. + +In the case that the deployment orchestrator times out, it attempts to cancel +the fill. This operation shall be retried with a short back-off. If it ultimately +fails it will require manual intervention to set the node's scheduling policy to +`NodeSchedulingPolicy::Active`. Not doing that is not immediately problematic, +but it constrains the scheduler as mentioned previously. + +### Node Scheduling Policy State Machine + +The state machine below encodes the behaviours discussed above and +the various failover situations described in a later section.
+ +Assuming no failures and/or timeouts the flow should be: +`Active -> Draining -> PauseForRestart -> Active -> Filling -> Active` + +``` + Operator requested drain + +-----------------------------------------+ + | | + +-------+-------+ +-------v-------+ + | | | | + | Pause | +-----------> Draining +----------+ + | | | | | | + +---------------+ | +-------+-------+ | + | | | + | | | + Drain requested| | | + | |Drain complete | Drain failed + | | | Cancelled/PS reattach/Storcon restart + | | | + +-------+-------+ | | + | | | | + +-------------+ Active <-----------+------------------+ + | | | | +Fill requested | +---^---^-------+ | + | | | | + | | | | + | | | | + | Fill completed| | | + | | |PS reattach | + | | |after restart | + +-------v-------+ | | +-------v-------+ + | | | | | | + | Filling +---------+ +-----------+PauseForRestart| + | | | | + +---------------+ +---------------+ +``` + +### Draining/Filling APIs + +The storage controller API to trigger the draining of a given node is: +`PUT /v1/control/node/:node_id/{drain,fill}`. + +The following HTTP non-success return codes are used. +All of them are safely retriable from the perspective of the storage controller. +- 404: Requested node was not found +- 503: Requested node is known to the storage controller, but unavailable +- 412: Drain precondition failed: there is no other node to drain to or the node's schedulling policy forbids draining +- 409: A {drain, fill} is already in progress. Only one such background operation +is allowed per node. + +When the drain is accepted and commenced a 202 HTTP code is returned. + +Drains and fills shall be cancellable by the deployment orchestrator or a +human operator via: `DELETE /v1/control/node/:node_id/{drain,fill}`. A 200 +response is returned when the cancelation is successful. Errors are retriable. 
+ +### Drain Process + +Before accepting a drain request the following validations are applied: +* Ensure that the node is known to the storage controller +* Ensure that the scheduling policy is `NodeSchedulingPolicy::Active` or `NodeSchedulingPolicy::Pause` +* Ensure that another drain or fill is not already running on the node +* Ensure that a drain is possible (i.e. check that there is at least one +schedulable node to drain to) + +After accepting the drain, the scheduling policy of the node is set to +`NodeSchedulingPolicy::Draining` and persisted in both memory and the database. +This disallows the optimizer from adding or removing shards from the node which +is desirable to avoid them racing. + +Next, a separate Tokio task is spawned to manage the draining. For each tenant +shard attached to the node being drained, demote the node to a secondary and +attempt to schedule the node away. Scheduling might fail due to unsatisfiable +constraints, but that is fine. Draining is a best effort process since it might +not always be possible to cut over all shards. + +Importantly, this task manages the concurrency of issued reconciles in order to +avoid drowning out the target pageservers and to allow other important reconciles +to proceed. + +Once the triggered reconciles have finished or timed out, set the node's scheduling +policy to `NodeSchedulingPolicy::PauseForRestart` to signal the end of the drain. + +A note on non-HA tenants: These tenants do not have secondaries, so by the description +above, they would not be migrated. It makes sense to skip them (especially the large ones) +since, depending on tenant size, this might be more disruptive than the restart since the +pageserver we've moved to will need to on-demand download the entire working set for the tenant. +We can consider expanding to small non-HA tenants in the future.
+ +### Fill Process + +Before accepting a fill request the following validations are applied: +* Ensure that the node is known to the storage controller +* Ensure that the scheduling policy is `NodeSchedulingPolicy::Active`. +This is the only acceptable policy for the fill starting state. When a node re-attaches, +it sets the scheduling policy to `NodeSchedulingPolicy::Active` if it was equal to +`NodeSchedulingPolicy::PauseForRestart` or `NodeSchedulingPolicy::Draining` (possible end states for a node drain). +* Ensure that another drain or fill is not already running on the node + +After accepting the fill, the scheduling policy of the node is set to +`NodeSchedulingPolicy::Filling` and persisted in both memory and the database. +This disallows the optimizer from adding or removing shards from the node which +is desirable to avoid them racing. + +Next, a separate Tokio task is spawned to manage the filling. For each tenant +shard where the filled node is a secondary, promote the secondary. This is done +until we run out of shards or the counts of attached shards become balanced across +the cluster. + +Like for draining, the concurrency of spawned reconciles is limited. + +### Failure Modes & Handling + +Failures are generally handled by transitioning back into the `Active` +(neutral) state. This simplifies the implementation greatly at the +cost of adding transitions to the state machine. For example, we +could detect the `Draining` state upon restart and proceed with a drain, +but how should the storage controller know that's what the orchestrator +needs still? + +#### Storage Controller Crash + +When the storage controller starts up, it resets the node scheduling policy +of all nodes in states `Draining`, `Filling` or `PauseForRestart` to +`Active`. The rationale is that when the storage controller restarts, +we have lost context of what the deployment orchestrator wants. It also +has the benefit of making things easier to reason about.
+ +#### Pageserver Crash During Drain + +The pageserver will attempt to re-attach during restart at which +point the node scheduling policy will be set back to `Active`, thus +reenabling the scheduler to use the node. + +#### Non-drained Pageserver Crash During Drain + +What should happen when a pageserver we are draining to crashes during the +process? Two reasonable options are: cancel the drain and focus on the failover +*or* do both, but prioritise failover. Since the number of concurrent reconciles +produced by drains/fills is limited, we get the latter behaviour for free. +My suggestion is we take this approach, but the cancellation option is trivial +to implement as well. + +#### Pageserver Crash During Fill + +The pageserver will attempt to re-attach during restart at which +point the node scheduling policy will be set back to `Active`, thus +reenabling the scheduler to use the node. + +#### Pageserver Goes Unavailable During Drain/Fill + +The drain and fill jobs handle this by stopping early. When the pageserver +is detected as online by storage controller heartbeats, reset its scheduling +policy to `Active`. If a restart happens instead, see the pageserver crash +failure mode. + +#### Orchestrator Drain Times Out + +Orchestrator will still proceed with the restart. +When the pageserver re-attaches, the scheduling policy is set back to +`Active`. + +#### Orchestrator Fill Times Out + +Orchestrator will attempt to cancel the fill operation. If that fails, +the fill will continue until it quiesces and the node will be left +in the `Filling` scheduling policy. This hinders the scheduler, but is +otherwise harmless. A human operator can handle this by setting the scheduling +policy to `Active`, or we can bake in a fill timeout into the storage controller. + +## Optimizations + +### Location Warmth + +When cutting over to a secondary, the storage controller will wait for it to +become "warm" (i.e. download enough of the tenant's data).
This means that some +reconciliations can take significantly longer than others and hold up precious +reconciliation units. As an optimization, the drain stage can cut over only +tenants that are already "warm". Similarly, the fill stage can prioritise the +"warmest" tenants in the fill. + +Given that the number of tenants managed by the storage controller will be fairly low +for the foreseeable future, the first implementation could simply query the tenants +for secondary status. This doesn't scale well with increasing tenant counts, so +eventually we will need new pageserver API endpoints to report the sets of +"warm" and "cold" nodes. + +## Alternatives Considered + +### Draining and Filling Purely as Scheduling Constraints + +At its core, the storage controller is a big background loop that detects changes +in the environment and reacts to them. One could express draining and filling +of nodes purely in terms of constraining the scheduler (as opposed to having +such background tasks). + +While theoretically nice, I think that's harder to implement and more importantly operate and reason about. +Consider cancellation of a drain/fill operation. We would have to update the scheduler state, create +an entirely new schedule (intent state) and start work on applying that. It gets trickier if we wish +to cancel the reconciliation tasks spawned by drain/fill nodes. How would we know which ones belong +to the conceptual drain/fill? One could add labels to reconciliations, but it gets messy in my opinion. + +It would also mean that reconciliations themselves have side effects that persist in the database +(persist something to the database when the drain is done), which I'm not conceptually fond of.
+ +## Proof of Concept + +This RFC is accompanied by a POC which implements nearly everything mentioned here +apart from the optimizations and some of the failure handling: +https://github.com/neondatabase/neon/pull/7682 diff --git a/docs/rfcs/034-ancestor-deletion.md b/docs/rfcs/034-ancestor-deletion.md new file mode 100644 index 0000000000..7341d930e2 --- /dev/null +++ b/docs/rfcs/034-ancestor-deletion.md @@ -0,0 +1,252 @@ +# Ancestor Timeline Deletion + +Created on: 2024-02-23 + +Author: John Spray + +# Summary + +When a tenant creates a new timeline that they will treat as their 'main' history, +it is awkward to permanently retain an 'old main' timeline as its ancestor. Currently +this is necessary because it is forbidden to delete a timeline which has descendents. + +A new pageserver API is proposed to 'adopt' data from a parent timeline into +one of its children, such that the link between ancestor and child can be severed, +leaving the parent in a state where it may then be deleted. + +# Motivation + +Retaining parent timelines currently has two costs: + +- Cognitive load on users, who have to remember which is the "real" main timeline. +- Storage capacity cost, as the parent timeline will retain layers up to the + child's timeline point, even if the child fully covers its keyspace with image + layers and will never actually read from the parent. + +# Solution + +A new pageserver API `PUT /v1/tenant/:tenant_id/timeline/:timeline_id/detach_ancestor` +will be added. The `timeline_id` in this URL is that of the _child_ timeline that we +wish to detach from its parent. + +On success, this API will leave the following state: + +- The detached child timeline will no longer have an ancestor, and will contain all + the data needed to service reads without recursing into an ancestor. +- Any other children of the parent whose timeline points were at a lower LSN than + the detached child timeline will be modified to have the child timeline as their + new parent. 
+- The parent timeline will still exist, but the child will no longer have it as an + ancestor. If this was the last timeline that depended on the parent, then the + parent will become deletable. + +This API's implementation will consist of a series of retryable steps, such that +on failures/timeout it can safely be called again to reach the target state. + +## Example + +### Before + +The user has "rolled back" their project to LSN X, resulting in a "new main" +timeline. The parent "old main" timeline still exists, and they would like +to clean it up. + +They have two other timelines A and B. A is from before the rollback point, +and B is from after the rollback point. + +``` +----"old main" timeline-------X--------------------------------------------> + | | | + |-> child A | | + |-> "new main" timeline | + -> child B + +``` + +### After calling detach ancestor API + +The "new main" timeline is no longer dependent on old main, and neither +is child A, because it had a branch point before X. + +The user may now choose to delete child B and "old main" to get to +a pristine state. Child B is likely to be unwanted since the user +chose to roll back to X, and it branches from after X. However, we +don't assume this in the API; it is up to the user to delete it. + +``` +|----"old main" timeline----------------------------------------------------> + | + | + | + -> child B + +|----"new main" timeline---------> + | + |-> child A + + +``` + +### After removing timelines + +We end up with a totally clean state that leaves no trace that a rollback +ever happened: there is only one root timeline. + +``` +| ----"new main" timeline-----------> + | + |-> child A + + +``` + +## Caveats + +Important things for API users to bear in mind: + +- this API does not delete the parent timeline: you must still do that explicitly. 
+- if there are other child timelines ahead of the branch point of the detached + child, the parent won't be deletable: you must either delete or detach those + children. +- do _not_ simply loop over all children and detach them all: this can have an + extremely high storage cost. The detach ancestor API is intended for use on a single + timeline to make it the new "main". +- The detach ancestor API should also not be + exposed directly to the user as a button/API, because they might decide + to click it for all the children and thereby generate many copies of the + parent's data -- the detach ancestor API should be used as part + of a high level "clean up after rollback" feature. + +## `detach_ancestor` API implementation + +Terms used in the following sections: + +- "the child": the timeline whose ID is specified in the detach ancestor API URL, also + called "new main" in the example. +- "the parent": the parent of "the child". Also called "old main" in the example. +- "the branch point": the ancestor_lsn of "the child" + +### Phase 1: write out adopted layers to S3 + +The child will "adopt" layers from the parent, such that its end state contains +all the parent's history as well as its own. + +For all layers in the parent's layer map whose high LSN is below the branch +point, issue S3 CopyObject requests to duplicate them into the child timeline's +prefix. Do not add them to the child's layer map yet. + +For delta layers in the parent's layer map which straddle the branch point, read them +and write out only content up to the branch point into new layer objects. + +This is a long running operation if the parent has many layers: it should be +implemented in a way that resumes rather than restarting from scratch, if the API +times out and is called again. + +As an optimization, if there are no other timelines that will be adopted into +the child, _and_ the child's image layers already fully cover the branch LSN, +then we may skip adopting layers.
+ +### Phase 2: update the child's index + +Having written out all needed layers in phase 1, atomically link them all +into the child's IndexPart and upload to S3. This may be done while the +child Timeline is still running. + +### Phase 3: modify timelines ancestry + +Modify the child's ancestor to None, and upload its IndexPart to persist the change. + +For all timelines which have the same parent as the child, and have a branch +point lower than our branch point, switch their ancestor_timeline to the child, +and upload their IndexPart to persist the change. + +## Alternatives considered + +### Generate full image layer on child, rather than adopting parent deltas + +This would work for the case of a single child, but would prevent re-targeting +other timelines that depended on the parent. If we detached many children this +way, the storage cost would become prohibitive (consider a 1TB database with +100 child timelines: it would cost 100TiB if they all generated their own image layers). + +### Don't rewrite anything: just fake it in the API + +We could add a layer of indirection that let a child "pretend" that it had no +ancestor, when in reality it still had the parent. The pageserver API could +accept deletion of ancestor timelines, and just update child metadata to make +them look like they have no ancestor. + +This would not achieve the desired reduction in storage cost, and may well be more +complex to maintain than simply implementing the API described in this RFC. + +### Avoid copying objects: enable child index to use parent layers directly + +We could teach IndexPart to store a TimelineId for each layer, such that a child +timeline could reference a parent's layers directly, rather than copying them +into the child's prefix. + +This would impose a cost for the normal case of indices that only target the +timeline's own layers, add complexity, and break the useful simplifying +invariant that timelines "own" their own path. 
If child timelines were +referencing layers from the parent, we would have to ensure that the parent +never runs GC/compaction again, which would make the API less flexible (the +proposal in this RFC enables deletion of the parent but doesn't require it.) + +## Performance + +### Adopting layers + +- CopyObject is a relatively cheap operation, but we may need to issue tens of thousands + of such requests: this can take up to tens of seconds and will compete for RemoteStorage + semaphore units with other activity on the pageserver. +- If we are running on storage backend that doesn't implement CopyObject, then + this part will be much more expensive as we would stream all layer content + through the pageserver. This is no different to issuing a lot + of reads to a timeline that does not have a warm local cache: it will move + a lot of gigabytes, but that shouldn't break anything. +- Generating truncated layers for delta that straddle the branch point will + require streaming read/write of all the layers in question. + +### Updating timeline ancestry + +The simplest way to update timeline ancestry will probably be to stop and start +all the Timeline objects: this is preferable to the complexity of making their +ancestry mutable at runtime. + +There will be a corresponding "stutter" in the availability of the timelines, +of the order 10-100ms, which is the time taken to upload their IndexPart, and +restart the Timeline. + +# Interaction with other features + +## Concurrent timeline creation + +If new historic timelines are created using the parent as an ancestor while the +detach ancestor API is running, they will not be re-parented to the child. This +doesn't break anything, but it leaves the parent in a state where it might not +be possible to delete it. 
+ +Since timeline creations are an explicit user action, this is not something we need to +worry about as the storage layer: a user who wants to delete their parent timeline will not create +new children, and if they do, they can choose to delete those children to +enable deleting the parent. + +For the least surprise to the user, before starting the detach ancestor branch +operation, the control plane should wait until all branches are created and not +allow any branches to be created before the branch point on the ancestor branch +while the operation is ongoing. + +## WAL based disaster recovery + +WAL based disaster recovery currently supports only restoring of the main +branch. Enabling WAL based disaster recovery in the future requires that we +keep a record which timeline generated the WAL and at which LSN was a parent +detached. Keep a list of timeline ids and the LSN in which they were detached in +the `index_part.json`. Limit the size of the list to 100 first entries, after +which the WAL disaster recovery will not be possible. + +## Sharded tenants + +For sharded tenants, calls to the detach ancestor API will pass through the storage +controller, which will handle them the same as timeline creations: invoke first +on shard zero, and then on all the other shards. diff --git a/docs/rfcs/035-safekeeper-dynamic-membership-change.md b/docs/rfcs/035-safekeeper-dynamic-membership-change.md new file mode 100644 index 0000000000..239ec58186 --- /dev/null +++ b/docs/rfcs/035-safekeeper-dynamic-membership-change.md @@ -0,0 +1,495 @@ +# Safekeeper dynamic membership change + +To quickly recover from safekeeper node failures and do rebalancing we need to +be able to change set of safekeepers the timeline resides on. The procedure must +be safe (not lose committed log) regardless of safekeepers and compute state. It +should be able to progress if any majority of old safekeeper set, any majority +of new safekeeper set and compute are up and connected. 
This is known as a +consensus membership change. It always involves two phases: 1) switch old +majority to old + new configuration, preventing commits without acknowledge from +the new set 2) bootstrap the new set by ensuring majority of the new set has all +data which ever could have been committed before the first phase completed; +after that switch is safe to finish. Without two phases, the switch would go directly to a new set +whose quorum might not intersect with a quorum of the old set (and typical case of +ABC -> ABD switch is an example of that, because quorums AC and BD don't +intersect). Furthermore, procedure is typically carried out by the consensus +leader, and so enumeration of configurations which establishes order between +them is done through consensus log. + +In our case consensus leader is compute (walproposer), and we don't want to wake +up all computes for the change. Nor do we want to fully reimplement the leader +logic second time outside compute. Because of that the proposed algorithm relies +for issuing configurations on the external fault tolerant (distributed) strongly +consistent storage with simple API: CAS (compare-and-swap) on the single key. +Properly configured postgres suits this. + +In the system consensus is implemented at the timeline level, so algorithm below +applies to the single timeline. + +## Algorithm + +### Definitions + +A configuration is + +``` +struct Configuration { + generation: Generation, // a number uniquely identifying configuration + sk_set: Vec<NodeId>, // current safekeeper set + new_sk_set: Optional<Vec<NodeId>>, +} +``` + +Configuration with `new_sk_set` present is used for the intermediate step during +the change and called joint configuration. Generations establish order of +configurations: we say `c1` is higher than `c2` if `c1.generation` > +`c2.generation`. + +### Persistently stored data changes + +Safekeeper starts storing its current configuration in the control file. Update +of it is atomic, so in-memory value always matches the persistent one.
+ +External CAS providing storage (let's call it configuration storage here) also +stores configuration for each timeline. It is initialized with generation 1 and +initial set of safekeepers during timeline creation. Executed CAS on it must +never be lost. + +### Compute <-> safekeeper protocol changes + +`ProposerGreeting` message carries walproposer's configuration if it is already +established (see below), else null. `AcceptorGreeting` message carries +safekeeper's current `Configuration`. All further messages (`VoteRequest`, +`VoteResponse`, `ProposerElected`, `AppendRequest`, `AppendResponse`) carry +generation number, of walproposer in case of wp->sk message or of safekeeper in +case of sk->wp message. + +### Safekeeper changes + +Basic rule: once safekeeper observes configuration higher than its own it +immediately switches to it. It must refuse all messages with lower generation +than its. It also refuses messages if it is not member of the current generation +(that is, of either `sk_set` or `new_sk_set`), though it is likely not unsafe to +process them (walproposer should ignore result anyway). + +If there is non null configuration in `ProposerGreeting` and it is higher than +current safekeeper one, safekeeper switches to it. + +Safekeeper sends its current configuration in its first message to walproposer +`AcceptorGreeting`. It refuses all other walproposer messages if the +configuration generation in them is less than its current one. Namely, it +refuses to vote, to truncate WAL in `handle_elected` and to accept WAL. In +response it sends its current configuration generation to let walproposer know. + +Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configuration` +accepting `Configuration`. Safekeeper switches to the given conf if it is higher than its +current one and ignores it otherwise.
In any case it replies with +``` +struct ConfigurationSwitchResponse { + conf: Configuration, + term: Term, + last_log_term: Term, + flush_lsn: Lsn, +} +``` + +### Compute (walproposer) changes + +Basic rule is that joint configuration requires votes from majorities in +both `sk_set` and `new_sk_set`. + +Compute receives list of safekeepers to connect to from the control plane as +currently and tries to communicate with all of them. However, the list does not +define consensus members. Instead, on start walproposer tracks highest +configuration it receives from `AcceptorGreeting`s. Once it assembles greetings +from majority of `sk_set` and majority of `new_sk_set` (if it is present), it +establishes this configuration as its own and moves to voting. + +It should stop talking to safekeepers not listed in the configuration at this +point, though it is not unsafe to continue doing so. + +To be elected it must receive votes from both majorities if `new_sk_set` is present. +Similarly, to commit WAL it must receive flush acknowledge from both majorities. + +If walproposer hears from safekeeper configuration higher than its own (i.e. +refusal to accept due to configuration change) it simply restarts. + +### Change algorithm + +The following algorithm can be executed anywhere having access to configuration +storage and safekeepers. It is safe to interrupt / restart it and run multiple +instances of it concurrently, though likely one of them won't make +progress then. It accepts `desired_set: Vec<NodeId>` as input. + +Algorithm will refuse to make the change if it encounters previous interrupted +change attempt, but in this case it will try to finish it. + +It will eventually converge if old majority, new majority and configuration +storage are reachable. + +1) Fetch current timeline configuration from the configuration storage. +2) If it is already joint one and `new_sk_set` is different from `desired_set` + refuse to change.
However, assign the joint conf to (in memory) var + `joint_conf` and proceed to step 4 to finish the ongoing change. +3) Else, create joint `joint_conf: Configuration`: increment current conf number + `n` and put `desired_set` to `new_sk_set`. Persist it in the configuration + storage by doing CAS on the current generation: change happens only if + current configuration number is still `n`. Apart from guaranteeing uniqueness + of configurations, CAS linearizes them, ensuring that new configuration is + created only following the previous one when we know that the transition is + safe. Failed CAS aborts the procedure. +4) Call `PUT` `configuration` on safekeepers from the current set, + delivering them `joint_conf`. Collecting responses from majority is required + to proceed. If any response returned generation higher than + `joint_conf.generation`, abort (another switch raced us). Otherwise, choose + max `<last_log_term, flush_lsn>` among responses and establish it as + (in memory) `sync_position`. Also choose max `term` and establish it as (in + memory) `sync_term`. We can't finish the switch until majority of the new set + catches up to this `sync_position` because data before it could be committed + without ack from the new set. Similarly, we'll bump term on new majority + to `sync_term` so that two computes with the same term are never elected. +4) Initialize timeline on safekeeper(s) from `new_sk_set` where it + doesn't exist yet by doing `pull_timeline` from the majority of the + current set. Doing that on majority of `new_sk_set` is enough to + proceed, but it is reasonable to ensure that all `new_sk_set` members + are initialized -- if some of them are down why are we migrating there? +5) Call `POST` `bump_term(sync_term)` on safekeepers from the new set. + Success on majority is enough. +6) Repeatedly call `PUT` `configuration` on safekeepers from the new set, + delivering them `joint_conf` and collecting their positions.
This will + switch them to the `joint_conf` which generally won't be needed + because `pull_timeline` already includes it and additionally would be + broadcast by compute. More importantly, we may proceed to the next step + only when `<last_log_term, flush_lsn>` on the majority of the new set reached + `sync_position`. Similarly, on the happy path no waiting is needed because + `pull_timeline` already includes it. However, we should double + check to be safe. For example, timeline could have been created earlier e.g. + manually or after try-to-migrate, abort, try-to-migrate-again sequence. +7) Create `new_conf: Configuration` incrementing `joint_conf` generation and having new + safekeeper set as `sk_set` and None `new_sk_set`. Write it to configuration + storage under one more CAS. +8) Call `PUT` `configuration` on safekeepers from the new set, + delivering them `new_conf`. It is enough to deliver it to the majority + of the new set; the rest can be updated by compute. + +I haven't put huge effort to make the description above very precise, because it +is natural language prone to interpretations anyway. Instead I'd like to make TLA+ +spec of it. + +Description above focuses on safety. To make the flow practical and live, here are a few more +considerations. +1) It makes sense to ping new set to ensure we are migrating to live node(s) before + step 3. +2) If e.g. accidentally wrong new sk set has been specified, before CAS in step `6` is completed + it is safe to rollback to the old conf with one more CAS. +3) On step 4 timeline might be already created on members of the new set for various reasons; + the simplest is the procedure restart. There are more complicated scenarios like mentioned + in step 5. Deleting and re-doing `pull_timeline` is generally unsafe without involving + generations, so seems simpler to treat existing timeline as success.
However, this also + has a disadvantage: you might imagine a surpassingly unlikely schedule where condition in + the step 5 is never reached until compute is (re)awakened to synchronize new member(s). + I don't think we'll observe this in practice, but can add waking up compute if needed. +4) In the end timeline should be locally deleted on the safekeeper(s) which are + in the old set but not in the new one, unless they are unreachable. To be + safe this also should be done under generation number (deletion proceeds only if + current configuration is <= the one in request and safekeeper is not member of it). +5) If current conf fetched on step 1 is already not joint and members equal to `desired_set`, + jump to step 7, using it as `new_conf`. + +## Implementation + +The procedure ought to be driven from somewhere. Obvious candidates are control +plane and storage_controller; and as each of them already has db we don't want +yet another storage. I propose to manage safekeepers in storage_controller +because 1) since it is in rust it simplifies simulation testing (more on this +below) 2) it already manages pageservers. + +This assumes that migration will be fully usable only after we migrate all +tenants/timelines to storage_controller. It is debatable whether we want also +to manage pageserver attachments for all of these, but likely we do. + +This requires us to define storcon <-> cplane interface. + +### storage_controller <-> control plane interface + +First of all, control plane should +[change](https://neondb.slack.com/archives/C03438W3FLZ/p1719226543199829) +storing safekeepers per timeline instead of per tenant because we can't migrate +tenants atomically. + +The important question is how updated configuration is delivered from +storage_controller to control plane to provide it to computes. As always, there +are two options, pull and push.
Let's do it the same push as with pageserver +`/notify-attach` because 1) it keeps storage_controller out of critical compute +start path 2) provides easier upgrade: there won't be such a thing as 'timeline +managed by control plane / storcon', cplane just takes the value out of its db +when needed 3) uniformity. It makes storage_controller responsible for retrying notifying +control plane until it succeeds. + +So, cplane `/notify-safekeepers` for the timeline accepts `Configuration` and +updates it in the db if the provided conf generation is higher (the cplane db +should also store generations for this). Similarly to [`/notify-attach`](https://www.notion.so/neondatabase/Storage-Controller-Control-Plane-interface-6de56dd310a043bfa5c2f5564fa98365), it +should update db which makes the call successful, and then try to schedule +`apply_config` if possible, it is ok if not. storage_controller +should rate limit calling the endpoint, but likely this won't be needed, as migration +throughput is limited by `pull_timeline`. + +Timeline (branch) creation in cplane should call storage_controller POST +`tenant/:tenant_id/timeline` like it currently does for sharded tenants. +Response should be augmented with `safekeeper_conf: Configuration`. The call +should be retried until succeeds. + +Timeline deletion and tenant deletion in cplane should call appropriate +storage_controller endpoints like it currently does for sharded tenants. The +calls should be retried until they succeed. + +### storage_controller implementation + +Current 'load everything on startup and keep in memory' easy design is fine. +Single timeline shouldn't take more than 100 bytes (it's 16 byte tenant_id, 16 +byte timeline_id, int generation, vec of ~3 safekeeper ids plus some flags), so +10^6 of timelines shouldn't take more than 100MB. 
+ +Similar to pageserver attachment Intents storage_controller would have in-memory +`MigrationRequest` (or its absence) for each timeline and pool of tasks trying +to make these requests a reality; this ensures one instance of storage_controller +won't do several migrations on the same timeline concurrently. In the first +version it is simpler to have more manual control and no retries, i.e. migration +failure removes the request. Later we can build retries and automatic +scheduling/migration. `MigrationRequest` is +``` +enum MigrationRequest { + To(Vec<NodeId>), + FinishPending, +} +``` + +`FinishPending` requests to run the procedure to ensure state is clean: current +configuration is not joint and majority of safekeepers are aware of it, but do +not attempt to migrate anywhere. If current configuration fetched on step 1 is +not joint it jumps to step 7. It should be run at startup for all timelines (but +similarly, in the first version it is ok to trigger it manually). + +#### Schema + +`safekeepers` table mirroring current `nodes` should be added, except that for +`scheduling_policy` field (seems like `status` is a better name for it): it is enough +to have at least in the beginning only 3 values: 1) `active` 2) `offline` 3) +`decomissioned`. + +`timelines` table: +``` +table! { + // timeline_id is primary key + timelines (tenant_id, timeline_id) { + timeline_id -> Varchar, + tenant_id -> Varchar, + generation -> Int4, + sk_set -> Array<Int4>, // list of safekeeper ids + new_sk_set -> Nullable<Array<Int4>>, // list of safekeeper ids, null if not joint conf + cplane_notified_generation -> Int4, + } +} +``` + +#### API + +Node management is similar to pageserver: +1) POST `/control/v1/safekeepers` upserts safekeeper. +2) GET `/control/v1/safekeepers` lists safekeepers. +3) GET `/control/v1/safekeepers/:node_id` gets safekeeper. +4) PUT `/control/v1/safekeepers/:node_id/status` changes status to e.g. + `offline` or `decomissioned`.
Initially it is simpler not to schedule any + migrations here. + +Safekeeper deploy scripts should register safekeeper at storage_controller as +they currently do with cplane, under the same id. + +Timeline creation/deletion: already existing POST `tenant/:tenant_id/timeline` +would 1) choose initial set of safekeepers; 2) write to the db initial +`Configuration` with `INSERT ON CONFLICT DO NOTHING` returning existing row in +case of conflict; 3) create timeline on the majority of safekeepers (already +created is ok). + +We don't want to block timeline creation when one safekeeper is down. Currently +this is solved by compute implicitly creating timeline on any safekeeper it is +connected to. This creates ugly timeline state on safekeeper when timeline is +created, but start LSN is not defined yet. It would be nice to remove this; to +do that, controller can in the background retry to create timeline on +safekeeper(s) which missed that during initial creation call. It can do that +through `pull_timeline` from majority so it doesn't need to remember +`parent_lsn` in its db. + +Timeline deletion removes the row from the db and forwards deletion to the +current configuration members. Without additional actions deletions might leak, +see below on this; initially let's ignore these, reporting to cplane success if +at least one safekeeper deleted the timeline (this will remove s3 data). + +Tenant deletion repeats timeline deletion for all timelines. + +Migration API: the first version is the simplest and the most imperative: +1) PUT `/control/v1/safekeepers/migrate` schedules `MigrationRequest`s to move +all timelines from one safekeeper to another. It accepts json +``` +{ + "src_sk": u32, + "dst_sk": u32, + "limit": Optional<u32>, +} +``` + +Returns list of scheduled requests.
+ +2) PUT `/control/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate` schedules `MigrationRequest` + to move single timeline to given set of safekeepers: +``` +{ + "desired_set": Vec<u32>, +} +``` + +Returns scheduled request. + +Similar call should be added for the tenant. + +It would be great to have some way of subscribing to the results (apart from +looking at logs/metrics). + +Migration is executed as described above. One subtlety is that (local) deletion on +source safekeeper might fail, which is not a problem if we are going to +decommission the node but leaves garbage otherwise. I'd propose in the first version +1) Don't attempt deletion at all if node status is `offline`. +2) If it failed, just issue warning. +And add PUT `/control/v1/safekeepers/:node_id/scrub` endpoint which would find and +remove garbage timelines for manual use. It will 1) list all timelines on the +safekeeper 2) compare each one against configuration storage: if timeline +doesn't exist at all (had been deleted), it can be deleted. Otherwise, it can +be deleted under generation number if node is not member of current generation. + +Automating this is nontrivial; we'd need to register all potential missing +deletions in the same transaction +which switches configurations. Similarly when timeline is fully deleted to +prevent cplane operation from blocking when some safekeeper is not available +deletion should be also registered. + +One more task pool should infinitely retry notifying control plane about changed +safekeeper sets. + +3) GET `/control/v1/tenant/:tenant_id/timeline/:timeline_id/` should return + current in memory state of the timeline and pending `MigrationRequest`, + if any. + +4) PUT `/control/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate_abort` tries to abort the + migration by switching configuration from the joint to the one with (previous) `sk_set` under CAS + (incrementing generation as always).
+ +#### Dealing with multiple instances of storage_controller + +Operations described above executed concurrently might create some errors but do +not prevent progress, so while we normally don't want to run multiple instances +of storage_controller it is fine to have it temporarily, e.g. during redeploy. + +Any interactions with db update in-memory controller state, e.g. if migration +request failed because different one is in progress, controller remembers that +and tries to finish it. + +## Testing + +`neon_local` should be switched to use storage_controller, playing role of +control plane. + +There should be following layers of tests: +1) Model checked TLA+ spec specifies the algorithm and verifies its basic safety. + +2) To cover real code and at the same time test many schedules we should have + simulation tests. For that, configuration storage, storage_controller <-> + safekeeper communication and pull_timeline need to be mocked and main switch + procedure wrapped as a node (thread) in simulation tests, using these + mocks. Test would inject migrations like it currently injects + safekeeper/walproposer restarts. Main assert is the same -- committed WAL must + not be lost. + +3) Since simulation testing injects at relatively high level points (not + syscalls), it omits some code, in particular `pull_timeline`. Thus it is + better to have basic tests covering whole system as well. Extended version of + `test_restarts_under_load` would do: start background load and do migration + under it, then restart endpoint and check that no reported commits + had been lost. I'd also add one more creating classic network split scenario, with + one compute talking to AC and another to BD while migration from nodes ABC to ABD + happens. + +4) Simple e2e test should ensure that full flow including cplane notification works.
+ +## Order of implementation and rollout + +Note that +- Control plane parts and integration with it are fully independent from everything else + (tests would use simulation and neon_local). +- There is a lot of infra work making storage_controller aware of timelines and safekeepers + and its impl/rollout should be separate from migration itself. +- Initially walproposer can just stop working while it observes joint configuration. + Such window would be typically very short anyway. + +To roll out smoothly, both walproposer and safekeeper should have flag +`configurations_enabled`; when set to false, they would work as currently, i.e. +walproposer is able to commit on whatever safekeeper set it is provided. Until +all timelines are managed by storcon we'd need to use current script to migrate +and update/drop entries in the storage_controller database if it has any. + +Safekeepers would need to be able to talk both current and new protocol version +with compute to reduce number of computes restarted in prod once v3 protocol is +deployed (though before completely switching we'd need to force this). + +Let's have the following rollout order: +- storage_controller becomes aware of safekeepers; +- storage_controller gets timeline creation for new timelines and deletion requests, but + doesn't manage all timelines yet. Migration can be tested on these new timelines. + To keep control plane and storage_controller databases in sync while control + plane still chooses the safekeepers initially (until all timelines are imported + it can choose better), `TimelineCreateRequest` can get optional safekeepers + field with safekeepers chosen by cplane. +- Then we can import all existing timelines from control plane to + storage_controller and gradually enable configurations region by region. + + +Very rough implementation order: +- Add concept of configurations to safekeepers (including control file), + implement v3 protocol. +- Implement walproposer changes, including protocol.
+- Implement storcon part. Use it in neon_local (and pytest). +- Make cplane store safekeepers per timeline instead of per tenant. +- Implement cplane/storcon integration. Route branch creation/deletion + through storcon. Then we can test migration of new branches. +- Finally import existing branches. Then we can drop cplane + safekeeper selection code. Gradually enable configurations at + computes and safekeepers. Before that, all computes must talk only + v3 protocol version. + +## Integration with evicted timelines + +Currently, `pull_timeline` doesn't work correctly with evicted timelines because +copy would point to original partial file. To fix let's just do s3 copy of the +file. It is a bit stupid as generally unnecessary work, but it makes sense to +implement proper migration before doing smarter timeline archival. [Issue](https://github.com/neondatabase/neon/issues/8542) + +## Possible optimizations + +Steps above suggest walproposer restart (with re-election) and thus reconnection +to safekeepers. Since by bumping term on new majority we ensure that leader +terms are unique even across generation switches it is possible to preserve +connections. However, it is more complicated, reconnection is very fast and it +is much more important to avoid compute restart than millisecond order of write +stall. + +Multiple joint consensus: algorithm above rejects attempt to change membership +while another attempt is in progress. It is possible to overlay them and AFAIK +Aurora does this but similarly I don't think this is needed. + +## Misc + +We should use Compute <-> safekeeper protocol change to include other (long +yearned) modifications: +- send data in network order to make arm work.
+- remove term_start_lsn from AppendRequest +- add horizon to TermHistory +- add to ProposerGreeting number of connection from this wp to sk diff --git a/docs/rfcs/035-timeline-archive.md b/docs/rfcs/035-timeline-archive.md new file mode 100644 index 0000000000..c834216962 --- /dev/null +++ b/docs/rfcs/035-timeline-archive.md @@ -0,0 +1,507 @@ +# Timeline Archival + +## Summary + +This RFC describes a mechanism for pageservers to eliminate local storage + compute work +for timelines which are not in use, in response to external API calls to "archive" a timeline. + +The archived state roughly corresponds to fully offloading a timeline to object storage, such +that its cost is purely the cost of that object storage. + +## Motivation + +Archived timelines serve multiple purposes: +- Act as a 'snapshot' for workloads that would like to retain restorable copies of their + database from longer ago than their PITR window. +- Enable users to create huge numbers of branches (e.g. one per github PR) without having + to diligently clean them up later to avoid overloading the pageserver (currently we support + up to ~500 branches per tenant). + +### Prior art + +Most storage and database systems have some form of snapshot, which can be implemented several ways: +1. full copies of data (e.g. an EBS snapshot to S3) +2. shallow snapshots which are CoW relative to the original version of the data, e.g. on a typical NFS appliance, or a filesystem like CephFS. +3. a series of snapshots which are CoW or de-duplicated relative to one another. + +Today's Neon branches are approximately like `2.`, although due to implementation details branches +often end up storing much more data than they really need, as parent branches assume that all data +at the branch point is needed. The layers pinned in the parent branch may have a much larger size +than the physical size of a compressed image layer representing the data at the branch point. 
+ +## Requirements + +- Enter & exit the archived state in response to external admin API calls +- API calls to modify the archived state are atomic and durable +- An archived timeline should eventually (once out of PITR window) use an efficient compressed + representation, and avoid retaining arbitrarily large data in its parent branch. +- Remote object GETs during tenant start may be O(N) with the number of _active_ branches, + but must not scale with the number of _archived_ branches. +- Background I/O for archived branches should only be done a limited number of times to evolve them + to a long-term-efficient state (e.g. rewriting to image layers). There should be no ongoing "housekeeping" + overhead for archived branches, including operations related to calculating sizes for billing. +- The pageserver should put no load on the safekeeper for archived branches. +- Performance of un-archiving a branch must make good use of S3/disk bandwidth to restore the branch + to a performant state in a short time (linear with the branch's logical size) + +## Non Goals + +- Archived branches are not a literal `fullbackup` postgres snapshot: they are still stored + in Neon's internal format. +- Compute cold starts after activating an archived branch will not have comparable performance to + cold starts on an active branch. +- Archived branches will not use any new/additional compression or de-duplication beyond what + is already implemented for image layers (zstd per page). +- The pageserver will not "auto start" archived branches in response to page_service API requests: they + are only activated explicitly via the HTTP API. +- We will not implement a total offload of archived timelines from safekeepers: their control file (small) will + remain on local disk, although existing eviction mechanisms will remove any segments from local disk. 
+- We will not expose any prometheus metrics for archived timelines, or make them visible in any + detailed HTTP APIs other than the specific API for listing archived timelines. +- A parent branch may not be archived unless all its children are. + +## Impacted Components + +pageserver, storage controller + +## Terminology + +**Archived**: a branch is _archived_ when an HTTP API request to archive it has succeeded: the caller +may assume that this branch is now very cheap to store, although this may not be physically so until the +branch proceeds to the offloaded state. + +**Active** branches are branches which are available for use by page_service clients, and have a relatively +high cost due to consuming local storage. + +**Offloaded** branches are a subset of _archived_ branches, which have had their local state removed such +that they now consume minimal runtime resources and have a cost similar to the cost of object storage. + +**Activate** (verb): transition from Archived to Active + +**Archive** (verb): transition from Active to Archived + +**Offload** (verb): transition from Archived to Offloaded + +**Offload manifest**: an object stored in S3 that describes timelines which pageservers do not load. + +**Warm up** (verb): operation done on an active branch, by downloading its active layers. Once a branch is +warmed up, good performance will be available to page_service clients. 
+ +## Implementation + +### High level flow + +We may think of a timeline which is archived and then activated as proceeding through a series of states: + +```mermaid +stateDiagram + [*] --> Active(warm) + Active(warm) --> Archived + Archived --> Offloaded + Archived --> Active(warm) + Offloaded --> Active(cold) + Active(cold) --> Active(warm) +``` + +Note that the transition from Archived to Active(warm) is expected to be fairly rare: the most common lifecycles +of branches will be: +- Very frequent: Short lived branches: Active -> Deleted +- Frequent: Long-lived branches: Active -> Archived -> Offloaded -> Deleted +- Rare: Branches used to restore old state: Active ->Archived -> Offloaded -> Active + +These states are _not_ all stored as a single physical state on the timeline, but rather represent the combination +of: +- the timeline's lifecycle state: active or archived, stored in the timeline's index +- its offload state: whether pageserver has chosen to drop local storage of the timeline and write it into the + manifest of offloaded timelines. +- cache state (whether it's warm or cold). + +### Storage format changes + +There are two storage format changes: +1. `index_part.json` gets a new attribute `state` that describes whether the timeline is to + be considered active or archived. +2. A new tenant-level _manifest_ object `tenant_manifest-v1.json` describes which timelines a tenant does not need to load + at startup (and is available for storing other small, rarely changing tenant-wide attributes in future) + +The manifest object will have a format like this: +``` +{ + "offload_timelines": [ + { + "timeline_id": ... + "last_record_lsn": ... + "last_record_lsn_time": ... + "pitr_interval": ... + "last_gc_lsn": ... # equal to last_record_lsn if this branch has no history (i.e. a snapshot) + "logical_size": ... # The size at last_record_lsn + "physical_size" ... + "parent": Option<{ + "timeline_id"... + "lsn"... 
# Branch point LSN on the parent + "requires_data": bool # True if this branch depends on layers in its parent, identify it here + + }> + } + ] +} +``` + +The information about a timeline in its offload state is intentionally minimal: just enough to decide: +- Whether it requires [archive optimization](#archive-branch-optimization) by rewriting as a set of image layers: we may infer this + by checking if now > last_record_lsn_time - pitr_interval, and pitr_lsn < last_record_lsn. +- Whether a parent branch should include this offloaded branch in its GC inputs to avoid removing + layers that the archived branch depends on +- Whether requests to delete this `timeline_id` should be executed (i.e. if a deletion request + is received for a timeline_id that isn't in the site of live `Timelines` or in the manifest, then + we don't need to go to S3 for the deletion. +- How much archived space to report in consumption metrics + +The contents of the manifest's offload list will also be stored as an attribute of `Tenant`, such that the total +set of timelines may be found by the union of `Tenant::timelines` (non-offloaded timelines) and `Tenant::offloaded` +(offloaded timelines). + +For split-brain protection, the manifest object will be written with a generation suffix, in the same way as +index_part objects are (see [generation numbers RFC](025-generation-numbers.md)). This will add some complexity, but +give us total safety against two pageservers with the same tenant attached fighting over the object. Existing code +for finding the latest generation and for cleaning up old generations (in the scrubber) will be generalized to cover +the manifest file. + +### API & Timeline state + +Timelines will store a lifecycle state (enum of Active or Archived) in their IndexPart. This will +be controlled by a new per-timeline `configure` endpoint. This is intentionally generic naming, which +may be used in future to control other per-timeline attributes (e.g. 
in future we may make PITR interval +a per-timeline configuration). + +`PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configure` +``` +{ + 'state': 'active|archive' +} +``` + +When archiving a timeline, this API will complete as soon as the timeline's state has been set in index_part, and that index has been uploaded. + +When activating a timeline, this API will complete as soon as the timeline's state has been set in index_part, +**and** the `Timeline` object has been instantiated and activated. This will require reading the timeline's +index, but not any data: it should be about as fast as a couple of small S3 requests. + +The API will be available with identical path via the storage controller: calling this on a sharded tenant +will simply map the API call to all the shards. + +Archived timelines may never have descendent timelines which are active. This will be enforced at the API level, +such that activating a timeline requires that all its ancestors are active, and archiving a timeline requires +that all its descendents are archived. It is the callers responsibility to walk the hierarchy of timelines +in the proper order if they would like to archive whole trees of branches. + +Because archive timelines will be excluded from the usual timeline listing APIs, a new API specifically +for archived timelines will be added: this is for use in support/debug: + +``` +GET /v1/tenants/{tenant_id}/archived_timelines + +{ + ...same per-timeline content as the tenant manifest... +} + +``` + +### Tenant attach changes + +Currently, during Tenant::spawn we list all the timelines in the S3 bucket, and then for each timeline +we load their index_part.json. To avoid the number of GETs scaling linearly with the number of archived +timelines, we must have a single object that tells us which timelines do not need to be loaded. 
The +number of ListObjects requests while listing timelines will still scale O(N), but this is less problematic +because each request covers 1000 timelines. + +This is **not** literally the same as the set of timelines who have state=archived. Rather, it is +the set of timelines which have been offloaded in the background after their state was set to archived. + +We may simply skip loading these timelines: there will be no special state of `Timeline`, they just won't +exist from the perspective of an active `Tenant` apart from in deletion: timeline deletion will need +to check for offloaded timelines as well as active timelines, to avoid wrongly returning 404 on trying +to delete an offloaded timeline. + +### Warm-up API + +`PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/download?wait_ms=1234` + +This API will be similar to the existing `download_remote_layers` API, but smarter: +- It will not download _all_ remote layers, just the visible set (i.e. layers needed for a read) +- It will download layers in the visible set until reaching `wait_ms`, then return a struct describing progress + of downloads, so that the caller can poll. + +The _visible set_ mentioned above will be calculated by the pageserver in the background, by taking the set +of readable LSNs (i.e. branch points and heads of branches), and walking the layer map to work out which layers +can possibly be read from these LSNs. This concept of layer visibility is more generally useful for cache +eviction and heatmaps, as well as in this specific case of warming up a timeline. + +The caller does not have to wait for the warm up API, or call it at all. But it is strongly advised +to call it, because otherwise populating local contents for a timeline can take a long time when waiting +for SQL queries to coincidentally hit all the layers, and during that time query latency remains quite +volatile. + +### Background work + +Archived branches are not subject to normal compaction. 
Instead, when the compaction loop encounters +an archived branch, it will consider rewriting the branch to just image layers if the branch has no history +([archive branch optimization](#archive-branch-optimization)), or offloading the timeline from local disk +if its state permits that. + +Additionally, the tenant compaction task will walk the state of already offloaded timelines to consider +optimizing their storage, e.g. if a timeline had some history when offloaded, but since then its PITR +has elapsed and it can now be rewritten to image layers. + +#### Archive branch offload + +Recall that when we archive a timeline via the HTTP API, this only sets a state: it doesn't do +any actual work. + +This work is done in the background compaction loop. It makes sense to tag this work on to the compaction +loop, because it is spiritually aligned: offloading data for archived branches improves storage efficiency. + +The condition for offload is simple: + - a `Timeline` object exists with state `Archived` + - the timeline does not have any non-offloaded children. + + Regarding the condition that children must be offloaded, this will always be eventually true, because + we enforce at the API level that children of archived timelines must themselves be archived, and all + archived timelines will eventually be offloaded. + +Offloading a timeline is simple: +- Read the timeline's attributes that we will store in its offloaded state (especially its logical size) +- Call `shutdown()` on the timeline and remove it from the `Tenant` (as if we were about to delete it) +- Erase all the timeline's content from local storage (`remove_dir_all` on its path) +- Write the tenant manifest to S3 to prevent this timeline being loaded on next start. + +#### Archive branch optimization (flattening) + +When we offloaded a branch, it might have had some history that prevented rewriting it to a single +point in time set of image layers. 
For example, a branch might have several days of writes and a 7 +day PITR: when we archive it, it still has those days of history. + +Once the PITR has expired, we have an opportunity to reduce the physical footprint of the branch by: +- Writing compressed image layers within the archived branch, as these are more efficient as a way of storing + a point in time compared with delta layers +- Updating the branch's offload metadata to indicate that this branch no longer depends on its ancestor + for data, i.e. the ancestor is free to GC layer files at+below the branch point + +Fully compacting an archived branch into image layers at a single LSN may be thought of as *flattening* the +branch, such that it is now a one-dimensional keyspace rather than a two-dimensional key/lsn space. It becomes +a true snapshot at that LSN. + +It is not always more efficient to flatten a branch than to keep some extra history on the parent: this +is described in more detail in [optimizations](#delaying-storage-optimization-if-retaining-parent-layers-is-cheaper) + +Archive branch optimization should be done _before_ background offloads during compaction, because there may +be timelines which are ready to be offloaded but also would benefit from the optimization step before +being offloaded. For example, a branch which has already fallen out of PITR window and has no history +of its own may be immediately re-written as a series of image layers before being offloaded. + +### Consumption metrics + +Archived timelines and offloaded timelines will be excluded from the synthetic size calculation, in anticipation +that billing structures based on consumption metrics are highly likely to apply different $/GB rates to archived +vs. ordinary content. + +Archived and offloaded timelines' logical size will be reported under the existing `timeline_logical_size` +variant of `MetricsKey`: receivers are then free to bill on this metric as they please.
+ +### Secondary locations + +Archived timelines (including offloaded timelines) will be excluded from heatmaps, and thereby +when a timeline is archived, after the next cycle of heatmap upload & secondary download, its contents +will be dropped from secondary locations. + +### Sharding + +Archiving or activating a timeline will be done symmetrically across all shards in a tenant, in +the same way that timeline creation and deletion is done. There are no special rules about ordering: +the storage controller may dispatch concurrent calls to all shards when archiving or activating a timeline. + +Since consumption metrics are only transmitted from shard zero, the state of archival on this shard +will be authoritative for consumption metrics. + +## Error cases + +### Errors in sharded tenants + +If one shard in a tenant fails an operation but others succeed, the tenant may end up in a mixed +state, where a timeline is archived on some shards but not on others. + +We will not bother implementing a rollback mechanism for this: errors in archiving/activating a timeline +are either transient (e.g. S3 unavailable, shutting down), or the fault of the caller (NotFound, BadRequest). +In the transient case callers are expected to retry until success, or to make appropriate API calls to clear +up their mistake. We rely on this good behavior of callers to eventually get timelines into a consistent +state across all shards. If callers do leave a timeline in an inconsistent state across shards, this doesn't +break anything, it's just "weird". + +This is similar to the status quo for timeline creation and deletion: callers are expected to retry +these operations until they succeed. + +### Archiving/activating + +Archiving/activating a timeline can fail in a limited number of ways: +1. I/O error storing/reading the timeline's updated index + - These errors are always retryable: a fundamental design assumption of the pageserver is that remote + storage errors are always transient. +2. 
NotFound if the timeline doesn't exist + - Callers of the API are expected to avoid calling deletion and archival APIs concurrently. + - The storage controller has runtime locking to prevent races such as deleting a timeline while + archiving it. +3. BadRequest if the rules around ancestors/descendents of archived timelines would be violated + - Callers are expected to do their own checks to avoid hitting this case. If they make + a mistake and encounter this error, they should give up. + +### Offloading + +Offloading can only fail if remote storage is unavailable, which would prevent us from writing the +tenant manifest. In such error cases, we give up in the expectation that offloading will be tried +again at the next iteration of the compaction loop. + +### Archive branch optimization + +Optimization is a special form of compaction, so can encounter all the same errors as regular compaction +can: it should return Result<(), CompactionError>, and as with compaction it will be retried on +the next iteration of the compaction loop. + +## Optimizations + +### Delaying storage optimization if retaining parent layers is cheaper + +Optimizing archived branches to image layers and thereby enabling parent branch GC to progress +is a safe default: archived branches cannot over-fill a pageserver's local disk, and once they +are offloaded to S3 they're totally safe, inert things. + +However, in some cases it can be advantageous to retain extra history on their parent branch rather +than flattening the archived branch. For example, if a 1TB parent branch is rather slow-changing (1GB +of data per day), and archive branches are being created nightly, then writing out full 1TB image layers +for each nightly branch is inefficient compared with just keeping more history on the main branch. 
+ +Getting this right requires consideration of: +- Compaction: if keeping more history on the main branch is going to prompt the main branch's compaction to + write out extra image layers, then it might make more sense to just write out the image layers on + the archived branch. +- Metadata bloat: keeping extra history on a parent branch doesn't just cost GB of storage, it makes + the layer map (and index_part) bigger. There are practical limits beyond which writing an indefinitely + large layer map can cause problems elsewhere. + +This optimization can probably be implemented quite cheaply with some basic heuristics like: +- don't bother doing optimization on an archive branch if the LSN distance between + its branch point and the end of the PITR window is <5% of the logical size of the archive branch. +- ...but don't keep more history on the main branch than double the PITR + +### Creating a timeline in archived state (a snapshot) + +Sometimes, one might want to create a branch with no history, which will not be written to +before it is archived. This is a snapshot, although we do not require a special snapshot API, +since a snapshot can be represented as a timeline with no history. + +This can be accomplished by simply creating a timeline and then immediately archiving it, but +that is somewhat wasteful: this timeline will spin up various tasks and open a connection to the storage +broker to try and ingest WAL, before being shut down in the subsequent archival call. To explicitly +support this common special case, we may add a parameter to the timeline creation API which +creates a timeline directly into the archived state. + +Such a timeline creation will do exactly two I/Os at creation time: +- write the index_part object to record the timeline's existence +- when the timeline is offloaded in the next iteration of the compaction loop (~20s later), + write the tenant manifest.
+ +Later, when the timeline falls off the end of the PITR interval, the usual offload logic will wake +up the 'snapshot' branch and write out image layers. + +## Future Work + +### Enabling `fullbackup` dumps from archive branches + +It would be useful to be able to export an archive branch to another system, or for use in a local +postgres database. + +This could be implemented as a general capability for all branches, in which case it would "just work" +for archive branches by activating them. However, downloading all the layers in a branch just to generate +a fullbackup is a bit inefficient: we could implement a special case for flattened archived branches +which streams image layers from S3 and outputs the fullbackup stream without writing the layers out to disk. + +Implementing `fullbackup` is a bit more complicated than this because of sharding, but solving that problem +is unrelated to the topic of archived branches (it probably involves having each shard write out a fullbackup +stream to S3 in an intermediate format and, then having one node stitch them together). + +### Tagging layers from archived branches + +When we know a layer is an image layer written for an archived branch that has fallen off the PITR window, +we may add tags to the S3 objects to enable writing lifecycle policies that transition such layers to even +cheaper storage. + +This could be done for all archived layers, or it could be driven by the archival API, to give the pageserver +external hints on which branches are likely to be reactivated, and which branches are good candidates for +tagging for low performance storage. + +Tagging+lifecycles is just one mechanism: one might also directly use S3 storage classes. Other clouds' object +stores have similar mechanisms. + +### Storing sequences of archive branches as deltas + +When archived branches are used as scheduled snapshots, we could store them even more efficiently +by encoding them as deltas relative to each other (i.e. 
for nightly snapshots, when we do the +storage optimization for Tuesday's snapshot, we would read Monday's snapshot and store only the modified +pages). This is the kind of encoding that many backup storage systems use. + +The utility of this depends a lot on the churn rate of the data, and the cost of doing the delta encoding +vs. just writing out a simple stream of the entire database. For smaller databases, writing out a full +copy is pretty trivial (e.g. writing a compressed copy of a 10GiB database to S3 can take under 10 seconds, +so the complexity tradeoff of diff-encoding it is dubious). + +One does not necessarily have to read back the previous snapshot in order to encode the next one: if the +pageserver knows about the schedule, it can intentionally retain extra history on the main branch so that +we can say: "A branch exists from Monday night. I have Monday night's data still active in the main branch, +so now I can read at the Monday LSN and the Tuesday LSN, calculate the delta, and store it as Tuesday's +delta snapshot". + +Clearly this all requires careful housekeeping to retain the relationship between branches that depend on +each other: perhaps this would be done by making the archive branches have child/parent relationships with +each other, or perhaps we would permit them to remain children of their original parent, but additionally +have a relationship with the snapshot they're encoded relative to. + +Activating a branch that is diff-encoded may require activating several earlier branches too, so figuring +out how frequently to write a full copy is important. This is essentially a zoomed-out version of what +we do with delta layers and image layers within a timeline, except each "layer" is a whole timeline.
+ + +## FAQ/Alternatives + +### Store all timelines in the tenant manifest + +Rather than special-casing offloaded timelines in the offload manifest, we could store a total +manifest of all timelines, eliminating the need for the pageserver to list timelines in S3 on +startup. + +That would be a more invasive change (require hooking in to timeline creation), and would +generate much more I/O to this manifest for tenants that had many branches _and_ frequent +create/delete cycles for short lived branches. Restricting the manifest to offloaded timelines +means that we only have to cope with the rate at which long-lived timelines are archived, rather +than the rate at which short-lived timelines are created & destroyed. + +### Automatically archiving/activating timelines without external API calls + +We could implement TTL driven offload of timelines, waking them up when a page request +arrives. + +This has downsides: +- Opacity: if we do TTL-driven offload inside the pageserver, then the end user doesn't + know which of their branches are in this state, and might get a surprise when they try + to use such a branch. +- Price fluctuation: if the archival of a branch is used in end user pricing, then users + prefer clarity & consistency. Ideally a branch's storage should cost the same from the moment it + is created, rather than having a usage-dependent storage price. +- Complexity: enabling the page service to call up into the Tenant to activate a timeline + would be awkward, compared with an external entry point. + +### Make offloaded a state of Timeline + +To reduce the operator-facing complexity of having some timelines APIs that only return +non-offloaded timelines, we could build the offloaded state into the Timeline type. + +`timeline.rs` is already one of the most egregiously long source files in the tree, so +this is rejected on the basis that we need to avoid making that complexity worse.
\ No newline at end of file diff --git a/docs/rfcs/036-physical-replication.md b/docs/rfcs/036-physical-replication.md new file mode 100644 index 0000000000..41aced0545 --- /dev/null +++ b/docs/rfcs/036-physical-replication.md @@ -0,0 +1,265 @@ +# Physical Replication + +This RFC is a bit special in that we have already implemented physical +replication a long time ago. However, we never properly wrote down all +the decisions and assumptions, and in the last months when more users +have started to use the feature, numerous issues have surfaced. + +This RFC documents the design decisions that have been made. + +## Summary + +PostgreSQL has a feature called streaming replication, where a replica +streams WAL from the primary and continuously applies it. It is also +known as "physical replication", to distinguish it from logical +replication. In PostgreSQL, a replica is initialized by taking a +physical backup of the primary. In Neon, the replica is initialized +from a slim "base backup" from the pageserver, just like a primary, +and the primary and the replicas connect to the same pageserver, +sharing the storage. + +There are two kinds of read-only replicas in Neon: +- replicas that follow the primary, and +- "static" replicas that are pinned at a particular LSN. + +A static replica is useful e.g. for performing time-travel queries and +running one-off slow queries without affecting the primary. A replica +that follows the primary can be used e.g. to scale out read-only +workloads. + +## Motivation + +Read-only replicas allow offloading read-only queries. It's useful for +isolation, if you want to make sure that read-only queries don't +affect the primary, and it's also an easy way to provide guaranteed +read-only access to an application, without having to mess with access +controls. + +## Non Goals (if relevant) + +This RFC is all about WAL-based *physical* replication. Logical +replication is a different feature. 
+ +Neon also has the capability to launch "static" read-only nodes which +do not follow the primary, but are pinned to a particular LSN. They +can be used for long-running one-off queries, or for Point-in-time +queries. They work similarly to read replicas that follow the primary, +but some things are simpler: there are no concerns about cache +invalidation when the data changes on the primary, or worrying about +transactions that are in-progress on the primary. + +## Impacted components (e.g. pageserver, safekeeper, console, etc) + +- Control plane launches the replica +- Replica Postgres instance connects to the safekeepers, to stream the WAL +- The primary does not know about the standby, except for the hot standby feedback +- The primary and replicas all connect to the same pageservers + + +# Context + +Some useful things to know about hot standby and replicas in +PostgreSQL. + +## PostgreSQL startup sequence + +"Running" and "start up" terms are a little imprecise. PostgreSQL +replica startup goes through several stages: + +1. First, the process is started up, and various initialization steps + are performed, like initializing shared memory. If you try to + connect to the server in this stage, you get an error: ERROR: the + database system is starting up. This stage happens very quickly. + +2. Then the server reads the checkpoint record from the WAL and starts + the WAL replay starting from the checkpoint. This works differently + in Neon: we start the WAL replay at the basebackup LSN, not from a + checkpoint! If you connect to the server in this state, you get an + error: ERROR: the database system is not yet accepting + connections. We proceed to the next stage, when the WAL replay sees + a running-xacts record. Or in Neon, the "CLOG scanning" mechanism + can allow us to move directly to next stage, with all the caveats + listed in this RFC. + +3. When the running-xacts information is established, the server + starts to accept connections normally.
+ +From PostgreSQL's point of view, the server is already running in +stage 2, even though it's not accepting connections yet. Our +`compute_ctl` does not consider it as running until stage 3. If the +transition from stage 2 to 3 doesn't happen fast enough, the control +plane will mark the start operation as failed. + + +## Decisions, Issues + +### Cache invalidation in replica + +When a read replica follows the primary in PostgreSQL, it needs to +stream all the WAL from the primary and apply all the records, to keep +the local copy of the data consistent with the primary. In Neon, the +replica can fetch the updated page versions from the pageserver, so +it's not necessary to apply all the WAL. However, it needs to ensure +that any pages that are currently in the Postgres buffer cache, or the +Local File Cache, are either updated, or thrown away so that the next +read of the page will fetch the latest version. + +We choose to apply the WAL records for pages that are already in the +buffer cache, and skip records for other pages. Somewhat arbitrarily, +we also apply records affecting catalog relations, fetching the old +page version from the pageserver if necessary first. See +`neon_redo_read_buffer_filter()` function. + +The replica wouldn't necessarily need to see all the WAL records, only +the records that apply to cached pages. For simplicity, we do stream +all the WAL to the replica, and the replica simply ignores WAL records +that require no action. + +Like in PostgreSQL, the read replica maintains a "replay LSN", which +is the LSN up to which the replica has received and replayed the +WAL. The replica can lag behind the primary, if it cannot quite keep +up with the primary, or if a long-running query conflicts with changes +that are about to be applied, or even intentionally if the user wishes +to see delayed data (see recovery_min_apply_delay). 
It's important +that the replica sees a consistent view of the whole cluster at the +replay LSN, when it's lagging behind. + +In Neon, the replica connects to a safekeeper to get the WAL +stream. That means that the safekeepers must be able to regurgitate +the original WAL as far back as the replay LSN of any running read +replica. (A static read-only node that does not follow the primary +does not require a WAL stream however). The primary does not need to +be running, and when it is, the replicas don't incur any extra +overhead to the primary (see hot standby feedback though). + +### In-progress transactions + +In PostgreSQL, when a hot standby server starts up, it cannot +immediately open up for queries (see [PostgreSQL startup +sequence]). It first needs to establish a complete list of in-progress +transactions, including subtransactions, that are running at the +primary, at the current replay LSN. Normally that happens quickly, +when the replica sees a "running-xacts" WAL record, because the +primary writes a running-xacts WAL record at every checkpoint, and in +PostgreSQL the replica always starts the WAL replay from a checkpoint +REDO point. (A shutdown checkpoint WAL record also implies that all +the non-prepared transactions have ended.) If there are a lot of +subtransactions in progress, however, the standby might need to wait +for old transactions to complete before it can open up for queries. + +In Neon that problem is worse: a replica can start at any LSN, so +there's no guarantee that it will see a running-xacts record any time +soon. In particular, if the primary is not running when the replica is +started, it might never see a running-xacts record. + +To make things worse, we initially missed this issue, and always +started accepting queries at replica startup, even if it didn't have +the transaction information. That could lead to incorrect query +results and data corruption later. 
However, as we fixed that, we +introduced a new problem compared to what we had before: previously +the replica would always start up, but after fixing that bug, it might +not. In a superficial way, the old behavior was better (but could lead +to serious issues later!). That made fixing that bug very hard, +because as we fixed it, we made things (superficially) worse for +others. + +See https://github.com/neondatabase/neon/pull/7288 which fixed the +bug, and follow-up PRs https://github.com/neondatabase/neon/pull/8323 +and https://github.com/neondatabase/neon/pull/8484 to try to claw back +the cases that started to cause trouble after fixing it. As of this +writing, there are still cases where a replica might not immediately +start up, causing the control plane operation to fail; the remaining +issues are tracked in https://github.com/neondatabase/neon/issues/6211. + +One long-term fix for this is to switch to using so-called CSN +snapshots in read replica. That would make it unnecessary to have the +full in-progress transaction list in the replica at startup time. See +https://commitfest.postgresql.org/48/4912/ for a work-in-progress +patch to upstream to implement that. + +Another thing we could do is to teach the control plane about that +distinction between "starting up" and "running but haven't received +running-xacts information yet", so that we could keep the replica +waiting longer in that stage, and also give any client connections the +same `ERROR: the database system is not yet accepting connections` +error that you get in standalone PostgreSQL in that state. + + +### Recovery conflicts and Hot standby feedback + +It's possible that a tuple version is vacuumed away in the primary, +even though it is still needed by a running transaction in the +replica. This is called a "recovery conflict", and PostgreSQL provides +various options for dealing with it. By default, the WAL replay will +wait up to 30 s for the conflicting query to finish.
After that, it +will kill the running query, so that the WAL replay can proceed. + +Another way to avoid the situation is to enable the +[`hot_standby_feedback`](https://www.postgresql.org/docs/current/runtime-config-replication.html#GUC-HOT-STANDBY-FEEDBACK) +option. When it is enabled, the primary will refrain from vacuuming +tuples that are still needed in the replica. That means potentially +bloating the primary, which violates the usual rule that read replicas +don't affect the operations on the primary, which is why it's off by +default. We leave it to users to decide if they want to turn it on, +same as PostgreSQL. + +Neon supports `hot_standby_feedback` by passing the feedback messages +from the replica to the safekeepers, and from safekeepers to the +primary. + +### Relationship of settings between primary and replica + +In order to enter hot standby mode, some configuration options need to +be set to the same or larger values in the standby, compared to the +primary. See [explanation in the PostgreSQL +docs](https://www.postgresql.org/docs/current/hot-standby.html#HOT-STANDBY-ADMIN) + +In Neon, we have this problem too. To prevent customers from hitting +it, the control plane automatically adjusts the settings of a replica, +so that they match or exceed the primary's settings (see +https://github.com/neondatabase/cloud/issues/14903). However, you +can still hit the issue if the primary is restarted with larger +settings, while the replica is running. + + +### Interaction with Pageserver GC + +The read replica can lag behind the primary. If there are recovery +conflicts or the replica cannot keep up for some reason, the lag can +in principle grow indefinitely. The replica will issue all GetPage +requests to the pageservers at the current replay LSN, and needs to +see the old page versions. + +If the retention period in the pageserver is set to be small, it may +have already garbage collected away the old page versions.
That will +cause read errors in the compute, and can mean that the replica cannot +make progress with the replication anymore. + +There is a mechanism for replica to pass information about its replay +LSN to the pageserver, so that the pageserver refrains from GC'ing +data that is still needed by the standby. It's called +'standby_horizon' in the pageserver code, see +https://github.com/neondatabase/neon/pull/7368. A separate "lease" +mechanism also is in the works, where the replica could hold a lease +on the old LSN, preventing the pageserver from advancing the GC +horizon past that point. The difference is that the standby_horizon +mechanism relies on a feedback message from replica to safekeeper, +while the lease API is exposed directly from the pageserver. A static +read-only node is not connected to safekeepers, so it cannot use the +standby_horizon mechanism. + + +### Synchronous replication + +We haven't put any effort into synchronous replication yet. + +PostgreSQL provides multiple levels of synchronicity. In the weaker +levels, a transaction is not acknowledged as committed to the client +in the primary until the WAL has been streamed to a replica or flushed +to disk there. Those modes don't make sense in Neon, because the +safekeepers handle durability. + +`synchronous_commit=remote_apply` mode would make sense. In that mode, +the commit is not acknowledged to the client until it has been +replayed in the replica. That ensures that after commit, you can see +the commit in the replica too (aka. read-your-write consistency). diff --git a/docs/settings.md b/docs/settings.md index 817f97d8ba..12a6a4c171 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -134,7 +134,7 @@ depends on that, so if you change it, bad things will happen. #### page_cache_size -Size of the page cache, to hold materialized page versions. Unit is +Size of the page cache. Unit is number of 8 kB blocks. The default is 8192, which means 64 MB.
#### max_file_descriptors diff --git a/docs/storage_controller.md b/docs/storage_controller.md index daf4d0c8b7..6d2ef929a4 100644 --- a/docs/storage_controller.md +++ b/docs/storage_controller.md @@ -44,7 +44,7 @@ If you need to modify the database schema, here’s how to create a migration: - Use `diesel migration generate ` to create a new migration - Populate the SQL files in the `migrations/` subdirectory - Use `DATABASE_URL=... diesel migration run` to apply the migration you just wrote: this will update the `[schema.rs](http://schema.rs)` file automatically. - - This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/attachment_service` + - This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/storage_controller` - Commit the migration files and the changes to schema.rs - If you need to iterate, you can rewind migrations with `diesel migration revert -a` and then `diesel migration run` again. - The migrations are build into the storage controller binary, and automatically run at startup after it is deployed, so once you’ve committed a migration no further steps are needed. diff --git a/docs/synthetic-size.md b/docs/synthetic-size.md index 3acb4e18cb..b6b90d90c2 100644 --- a/docs/synthetic-size.md +++ b/docs/synthetic-size.md @@ -21,9 +21,9 @@ implementation where we keep more data than we would need to, do not change the synthetic size or incur any costs to the user. The synthetic size is calculated for the whole project. It is not -straightforward to attribute size to individual branches. See "What is -the size of an individual branch?" for discussion on those -difficulties. +straightforward to attribute size to individual branches. 
See [What is +the size of an individual branch?](#what-is-the-size-of-an-individual-branch) +for a discussion of those difficulties. The synthetic size is designed to: @@ -40,8 +40,9 @@ The synthetic size is designed to: - logical size is the size of a branch *at a given point in time*. It's the total size of all tables in all databases, as you see with "\l+" in psql for example, plus the Postgres SLRUs and some - small amount of metadata. NOTE that currently, Neon does not include - the SLRUs and metadata in the logical size. See comment to `get_current_logical_size_non_incremental()`. + small amount of metadata. Note that currently, Neon does not include + the SLRUs and metadata in the logical size. Refer to the comment in + [`get_current_logical_size_non_incremental()`](/pageserver/src/pgdatadir_mapping.rs#L813-L814). - a "point in time" is defined as an LSN value. You can convert a timestamp to an LSN, but the storage internally works with LSNs. diff --git a/libs/compute_api/Cargo.toml b/libs/compute_api/Cargo.toml index b377bd2cce..8aaa481f8c 100644 --- a/libs/compute_api/Cargo.toml +++ b/libs/compute_api/Cargo.toml @@ -14,5 +14,3 @@ regex.workspace = true utils = { path = "../utils" } remote_storage = { version = "0.1", path = "../remote_storage/" } - -workspace_hack.workspace = true diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 1c4ee2089f..883c624f71 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -96,12 +96,6 @@ pub struct ComputeSpec { // Stripe size for pageserver sharding, in pages #[serde(default)] pub shard_stripe_size: Option, - - // When we are starting a new replica in hot standby mode, - // we need to know if the primary is running. - // This is used to determine if replica should wait for - // RUNNING_XACTS from primary or not. - pub primary_is_running: Option, } /// Feature flag to signal `compute_ctl` to enable certain experimental functionality. 
diff --git a/libs/consumption_metrics/Cargo.toml b/libs/consumption_metrics/Cargo.toml index 3f290821c2..a40b74b952 100644 --- a/libs/consumption_metrics/Cargo.toml +++ b/libs/consumption_metrics/Cargo.toml @@ -6,10 +6,8 @@ license = "Apache-2.0" [dependencies] anyhow.workspace = true -chrono.workspace = true +chrono = { workspace = true, features = ["serde"] } rand.workspace = true serde.workspace = true serde_with.workspace = true utils.workspace = true - -workspace_hack.workspace = true diff --git a/libs/desim/Cargo.toml b/libs/desim/Cargo.toml index 6f442d8243..0c4be90267 100644 --- a/libs/desim/Cargo.toml +++ b/libs/desim/Cargo.toml @@ -14,5 +14,3 @@ parking_lot.workspace = true hex.workspace = true scopeguard.workspace = true smallvec = { workspace = true, features = ["write"] } - -workspace_hack.workspace = true diff --git a/libs/metrics/Cargo.toml b/libs/metrics/Cargo.toml index 0bd804051c..f87e7b8e3a 100644 --- a/libs/metrics/Cargo.toml +++ b/libs/metrics/Cargo.toml @@ -12,8 +12,6 @@ chrono.workspace = true twox-hash.workspace = true measured.workspace = true -workspace_hack.workspace = true - [target.'cfg(target_os = "linux")'.dependencies] procfs.workspace = true measured-process.workspace = true diff --git a/libs/metrics/src/hll.rs b/libs/metrics/src/hll.rs index f53511ab5c..723916a742 100644 --- a/libs/metrics/src/hll.rs +++ b/libs/metrics/src/hll.rs @@ -13,11 +13,7 @@ use std::{ use measured::{ label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor}, - metric::{ - group::{Encoding, MetricValue}, - name::MetricNameEncoder, - Metric, MetricType, MetricVec, - }, + metric::{counter::CounterState, name::MetricNameEncoder, Metric, MetricType, MetricVec}, text::TextEncoder, LabelGroup, }; @@ -144,6 +140,7 @@ impl HyperLogLogState { }) } } + impl measured::metric::MetricEncoding> for HyperLogLogState { @@ -182,12 +179,13 @@ impl measured::metric::MetricEncoding = Lazy::new(|| { .expect("Failed to register maxrss_kb int gauge") }); -pub const 
DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[ - 0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5, -]; +/// Most common fsync latency is 50 µs - 100 µs, but it can be much higher, +/// especially during many concurrent disk operations. +pub const DISK_FSYNC_SECONDS_BUCKETS: &[f64] = + &[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 30.0]; pub struct BuildInfo { pub revision: &'static str, @@ -170,8 +171,11 @@ fn write_gauge( labels: impl LabelGroup, name: impl MetricNameEncoder, enc: &mut Enc, -) -> Result<(), Enc::Err> { - enc.write_metric_value(name, labels, MetricValue::Int(x)) +) -> Result<(), Enc::Err> +where + GaugeState: MetricEncoding, +{ + GaugeState::new(x).collect_into(&(), labels, name, enc) } #[derive(Default)] @@ -543,15 +547,6 @@ impl Encoding for Inc { fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> { self.0.write_help(name, help) } - - fn write_metric_value( - &mut self, - name: impl MetricNameEncoder, - labels: impl LabelGroup, - value: MetricValue, - ) -> Result<(), Self::Err> { - self.0.write_metric_value(name, labels, value) - } } impl MetricEncoding> for MeasuredCounterPairState @@ -578,15 +573,6 @@ impl Encoding for Dec { fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> { self.0.write_help(name, help) } - - fn write_metric_value( - &mut self, - name: impl MetricNameEncoder, - labels: impl LabelGroup, - value: MetricValue, - ) -> Result<(), Self::Err> { - self.0.write_metric_value(name, labels, value) - } } /// Write the dec counter to the encoder diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 3bba89c76d..cb28359ac3 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -21,11 +21,9 @@ hex.workspace = true humantime.workspace = true thiserror.workspace = true humantime-serde.workspace = true -chrono.workspace = true +chrono = { workspace = true, features = 
["serde"] } itertools.workspace = true -workspace_hack.workspace = true - [dev-dependencies] bincode.workspace = true rand.workspace = true diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index a0d10dc665..a50707a1b8 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -1,4 +1,6 @@ +use std::collections::HashSet; use std::str::FromStr; +use std::time::{Duration, Instant}; /// Request/response types for the storage controller /// API (`/control/v1` prefix). Implemented by the server @@ -11,6 +13,27 @@ use crate::{ shard::{ShardStripeSize, TenantShardId}, }; +#[derive(Serialize, Deserialize, Debug)] +#[serde(deny_unknown_fields)] +pub struct TenantCreateRequest { + pub new_tenant_id: TenantShardId, + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub generation: Option, + + // If omitted, create a single shard with TenantShardId::unsharded() + #[serde(default)] + #[serde(skip_serializing_if = "ShardParameters::is_unsharded")] + pub shard_parameters: ShardParameters, + + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub placement_policy: Option, + + #[serde(flatten)] + pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it +} + #[derive(Serialize, Deserialize)] pub struct TenantCreateResponseShard { pub shard_id: TenantShardId, @@ -66,7 +89,7 @@ pub struct TenantLocateResponse { pub shard_params: ShardParameters, } -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Debug)] pub struct TenantDescribeResponse { pub tenant_id: TenantId, pub shards: Vec, @@ -89,7 +112,7 @@ pub struct NodeDescribeResponse { pub listen_pg_port: u16, } -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Debug)] pub struct TenantDescribeResponseShard { pub tenant_shard_id: TenantShardId, @@ -129,11 +152,16 @@ impl UtilizationScore { } } -#[derive(Serialize, 
Deserialize, Clone, Copy, Debug)] +#[derive(Serialize, Clone, Copy, Debug)] #[serde(into = "NodeAvailabilityWrapper")] pub enum NodeAvailability { // Normal, happy state Active(UtilizationScore), + // Node is warming up, but we expect it to become available soon. Covers + // the time span between the re-attach response being composed on the storage controller + // and the first successful heartbeat after the processing of the re-attach response + // finishes on the pageserver. + WarmingUp(Instant), // Offline: Tenants shouldn't try to attach here, but they may assume that their // secondary locations on this node still exist. Newly added nodes are in this // state until we successfully contact them. @@ -143,7 +171,10 @@ pub enum NodeAvailability { impl PartialEq for NodeAvailability { fn eq(&self, other: &Self) -> bool { use NodeAvailability::*; - matches!((self, other), (Active(_), Active(_)) | (Offline, Offline)) + matches!( + (self, other), + (Active(_), Active(_)) | (Offline, Offline) | (WarmingUp(_), WarmingUp(_)) + ) } } @@ -155,6 +186,7 @@ impl Eq for NodeAvailability {} #[derive(Serialize, Deserialize, Clone, Copy, Debug)] pub enum NodeAvailabilityWrapper { Active, + WarmingUp, Offline, } @@ -164,6 +196,7 @@ impl From for NodeAvailability { // Assume the worst utilisation score to begin with. It will later be updated by // the heartbeats. 
NodeAvailabilityWrapper::Active => NodeAvailability::Active(UtilizationScore::worst()), + NodeAvailabilityWrapper::WarmingUp => NodeAvailability::WarmingUp(Instant::now()), NodeAvailabilityWrapper::Offline => NodeAvailability::Offline, } } @@ -173,6 +206,7 @@ impl From for NodeAvailabilityWrapper { fn from(val: NodeAvailability) -> Self { match val { NodeAvailability::Active(_) => NodeAvailabilityWrapper::Active, + NodeAvailability::WarmingUp(_) => NodeAvailabilityWrapper::WarmingUp, NodeAvailability::Offline => NodeAvailabilityWrapper::Offline, } } @@ -261,6 +295,39 @@ pub enum PlacementPolicy { #[derive(Serialize, Deserialize, Debug)] pub struct TenantShardMigrateResponse {} +/// Metadata health record posted from scrubber. +#[derive(Serialize, Deserialize, Debug)] +pub struct MetadataHealthRecord { + pub tenant_shard_id: TenantShardId, + pub healthy: bool, + pub last_scrubbed_at: chrono::DateTime, +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct MetadataHealthUpdateRequest { + pub healthy_tenant_shards: HashSet, + pub unhealthy_tenant_shards: HashSet, +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct MetadataHealthUpdateResponse {} + +#[derive(Serialize, Deserialize, Debug)] +pub struct MetadataHealthListUnhealthyResponse { + pub unhealthy_tenant_shards: Vec, +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct MetadataHealthListOutdatedRequest { + #[serde(with = "humantime_serde")] + pub not_scrubbed_for: Duration, +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct MetadataHealthListOutdatedResponse { + pub health_records: Vec, +} + #[cfg(test)] mod test { use super::*; @@ -280,4 +347,19 @@ mod test { assert_eq!(serde_json::from_str::(&encoded)?, v); Ok(()) } + + #[test] + fn test_reject_unknown_field() { + let id = TenantId::generate(); + let create_request = serde_json::json!({ + "new_tenant_id": id.to_string(), + "unknown_field": "unknown_value".to_string(), + }); + let err = 
serde_json::from_value::(create_request).unwrap_err(); + assert!( + err.to_string().contains("unknown field `unknown_field`"), + "expect unknown field `unknown_field` error, got: {}", + err + ); + } } diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 997c1cc43a..2fdd7de38f 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -22,6 +22,11 @@ pub struct Key { pub field6: u32, } +/// When working with large numbers of Keys in-memory, it is more efficient to handle them as i128 than as +/// a struct of fields. +#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd)] +pub struct CompactKey(i128); + /// The storage key size. pub const KEY_SIZE: usize = 18; @@ -29,7 +34,7 @@ pub const KEY_SIZE: usize = 18; /// See [`Key::to_i128`] for more information on the encoding. pub const METADATA_KEY_SIZE: usize = 16; -/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x40 is a metadata key. +/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x60 is a metadata key. 
pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x60; pub const METADATA_KEY_END_PREFIX: u8 = 0x7F; @@ -107,7 +112,10 @@ impl Key { /// As long as Neon does not support tablespace (because of lack of access to local file system), /// we can assume that only some predefined namespace OIDs are used which can fit in u16 pub fn to_i128(&self) -> i128 { - assert!(self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222); + assert!( + self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222, + "invalid key: {self}", + ); (((self.field1 & 0x7F) as i128) << 120) | (((self.field2 & 0xFFFF) as i128) << 104) | ((self.field3 as i128) << 72) @@ -127,6 +135,14 @@ impl Key { } } + pub fn to_compact(&self) -> CompactKey { + CompactKey(self.to_i128()) + } + + pub fn from_compact(k: CompactKey) -> Self { + Self::from_i128(k.0) + } + pub const fn next(&self) -> Key { self.add(1) } @@ -160,8 +176,9 @@ impl Key { key } - /// Convert a 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently. - /// Use [`Key::from_i128`] instead if you want to handle 16B keys (i.e., metadata keys). + /// Convert a 18B slice to a key. This function should not be used for 16B metadata keys because `field2` is handled differently. + /// Use [`Key::from_i128`] instead if you want to handle 16B keys (i.e., metadata keys). There are some restrictions on `field2`, + /// and therefore not all 18B slices are valid page server keys. pub fn from_slice(b: &[u8]) -> Self { Key { field1: b[0], @@ -173,7 +190,7 @@ impl Key { } } - /// Convert a key to a 18B slice. This function should not be used for metadata keys because field2 is handled differently. + /// Convert a key to a 18B slice. This function should not be used for getting a 16B metadata key because `field2` is handled differently. /// Use [`Key::to_i128`] instead if you want to get a 16B key (i.e., metadata keys). 
pub fn write_to_byte_slice(&self, buf: &mut [u8]) { buf[0] = self.field1; @@ -195,6 +212,13 @@ impl fmt::Display for Key { } } +impl fmt::Display for CompactKey { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let k = Key::from_compact(*self); + k.fmt(f) + } +} + impl Key { pub const MIN: Key = Key { field1: u8::MIN, diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index 9a61f2ad81..401887d362 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -17,6 +17,16 @@ pub struct KeySpace { pub ranges: Vec>, } +impl std::fmt::Display for KeySpace { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "[")?; + for range in &self.ranges { + write!(f, "{}..{},", range.start, range.end)?; + } + write!(f, "]") + } +} + /// A wrapper type for sparse keyspaces. #[derive(Clone, Debug, Default, PartialEq, Eq)] pub struct SparseKeySpace(pub KeySpace); diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 70db0b7344..ab4adfbebe 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -5,10 +5,10 @@ pub mod utilization; pub use utilization::PageserverUtilization; use std::{ - borrow::Cow, collections::HashMap, io::{BufRead, Read}, num::{NonZeroU64, NonZeroUsize}, + str::FromStr, sync::atomic::AtomicUsize, time::{Duration, SystemTime}, }; @@ -19,13 +19,11 @@ use serde::{Deserialize, Serialize}; use serde_with::serde_as; use utils::{ completion, - history_buffer::HistoryBufferWithDropCounter, id::{NodeId, TenantId, TimelineId}, lsn::Lsn, serde_system_time, }; -use crate::controller_api::PlacementPolicy; use crate::{ reltag::RelTag, shard::{ShardCount, ShardStripeSize, TenantShardId}, @@ -229,6 +227,11 @@ pub struct TimelineCreateRequest { pub pg_version: Option, } +#[derive(Serialize, Deserialize, Clone)] +pub struct LsnLeaseRequest { + pub lsn: Lsn, +} + #[derive(Serialize, Deserialize)] pub 
struct TenantShardSplitRequest { pub new_shard_count: u8, @@ -271,44 +274,6 @@ impl Default for ShardParameters { } } -#[derive(Serialize, Deserialize, Debug)] -#[serde(deny_unknown_fields)] -pub struct TenantCreateRequest { - pub new_tenant_id: TenantShardId, - #[serde(default)] - #[serde(skip_serializing_if = "Option::is_none")] - pub generation: Option, - - // If omitted, create a single shard with TenantShardId::unsharded() - #[serde(default)] - #[serde(skip_serializing_if = "ShardParameters::is_unsharded")] - pub shard_parameters: ShardParameters, - - // This parameter is only meaningful in requests sent to the storage controller - #[serde(default)] - #[serde(skip_serializing_if = "Option::is_none")] - pub placement_policy: Option, - - #[serde(flatten)] - pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it -} - -#[derive(Deserialize, Debug)] -#[serde(deny_unknown_fields)] -pub struct TenantLoadRequest { - #[serde(default)] - #[serde(skip_serializing_if = "Option::is_none")] - pub generation: Option, -} - -impl std::ops::Deref for TenantCreateRequest { - type Target = TenantConfig; - - fn deref(&self) -> &Self::Target { - &self.config - } -} - /// An alternative representation of `pageserver::tenant::TenantConf` with /// simpler types. #[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)] @@ -327,7 +292,6 @@ pub struct TenantConfig { pub walreceiver_connect_timeout: Option, pub lagging_wal_timeout: Option, pub max_lsn_wal_lag: Option, - pub trace_read_requests: Option, pub eviction_policy: Option, pub min_resident_size_override: Option, pub evictions_low_residence_duration_metric_threshold: Option, @@ -471,6 +435,41 @@ pub enum CompactionAlgorithm { Tiered, } +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub enum ImageCompressionAlgorithm { + // Disabled for writes, support decompressing during read path + Disabled, + /// Zstandard compression. 
Level 0 and None mean the same (default level). Levels can be negative as well. + /// For details, see the [manual](http://facebook.github.io/zstd/zstd_manual.html). + Zstd { + level: Option, + }, +} + +impl FromStr for ImageCompressionAlgorithm { + type Err = anyhow::Error; + fn from_str(s: &str) -> Result { + let mut components = s.split(['(', ')']); + let first = components + .next() + .ok_or_else(|| anyhow::anyhow!("empty string"))?; + match first { + "disabled" => Ok(ImageCompressionAlgorithm::Disabled), + "zstd" => { + let level = if let Some(v) = components.next() { + let v: i8 = v.parse()?; + Some(v) + } else { + None + }; + + Ok(ImageCompressionAlgorithm::Zstd { level }) + } + _ => anyhow::bail!("invalid specifier '{first}'"), + } + } +} + #[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)] pub struct CompactionAlgorithmSettings { pub kind: CompactionAlgorithm, @@ -563,10 +562,6 @@ pub struct LocationConfigListResponse { pub tenant_shards: Vec<(TenantShardId, Option)>, } -#[derive(Serialize, Deserialize)] -#[serde(transparent)] -pub struct TenantCreateResponse(pub TenantId); - #[derive(Serialize)] pub struct StatusResponse { pub id: NodeId, @@ -623,31 +618,6 @@ impl TenantConfigRequest { } } -#[derive(Debug, Deserialize)] -pub struct TenantAttachRequest { - #[serde(default)] - pub config: TenantAttachConfig, - #[serde(default)] - pub generation: Option, -} - -/// Newtype to enforce deny_unknown_fields on TenantConfig for -/// its usage inside `TenantAttachRequest`. -#[derive(Debug, Serialize, Deserialize, Default)] -#[serde(deny_unknown_fields)] -pub struct TenantAttachConfig { - #[serde(flatten)] - allowing_unknown_fields: TenantConfig, -} - -impl std::ops::Deref for TenantAttachConfig { - type Target = TenantConfig; - - fn deref(&self) -> &Self::Target { - &self.allowing_unknown_fields - } -} - /// See [`TenantState::attachment_status`] and the OpenAPI docs for context. 
#[derive(Serialize, Deserialize, Clone)] #[serde(tag = "slug", content = "data", rename_all = "snake_case")] @@ -666,8 +636,14 @@ pub struct TenantInfo { /// If a layer is present in both local FS and S3, it counts only once. pub current_physical_size: Option, // physical size is only included in `tenant_status` endpoint pub attachment_status: TenantAttachmentStatus, + pub generation: u32, + + /// Opaque explanation if gc is being blocked. + /// + /// Only looked up for the individual tenant detail, not the listing. This is purely for + /// debugging, not included in openapi. #[serde(skip_serializing_if = "Option::is_none")] - pub generation: Option, + pub gc_blocking: Option, } #[derive(Serialize, Deserialize, Clone)] @@ -680,6 +656,17 @@ pub struct TenantDetails { pub timelines: Vec, } +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Copy, Debug)] +pub enum TimelineArchivalState { + Archived, + Unarchived, +} + +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)] +pub struct TimelineArchivalConfigRequest { + pub state: TimelineArchivalState, +} + /// This represents the output of the "timeline_detail" and "timeline_list" API calls. #[derive(Debug, Serialize, Deserialize, Clone)] pub struct TimelineInfo { @@ -712,6 +699,16 @@ pub struct TimelineInfo { pub current_physical_size: Option, // is None when timeline is Unloaded pub current_logical_size_non_incremental: Option, + /// How many bytes of WAL are within this branch's pitr_interval. If the pitr_interval goes + /// beyond the branch's branch point, we only count up to the branch point. + pub pitr_history_size: u64, + + /// Whether this branch's branch point is within its ancestor's PITR interval (i.e. any + /// ancestor data used by this branch would have been retained anyway). If this is false, then + /// this branch may be imposing a cost on the ancestor by causing it to retain layers that it would + /// otherwise be able to GC. 
+ pub within_ancestor_pitr: bool, + pub timeline_dir_layer_file_size_sum: Option, pub wal_source_connstr: Option, @@ -734,58 +731,7 @@ pub struct LayerMapInfo { pub historic_layers: Vec, } -#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy, Serialize, Deserialize, enum_map::Enum)] -#[repr(usize)] -pub enum LayerAccessKind { - GetValueReconstructData, - Iter, - KeyIter, - Dump, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct LayerAccessStatFullDetails { - pub when_millis_since_epoch: u64, - pub task_kind: Cow<'static, str>, - pub access_kind: LayerAccessKind, -} - -/// An event that impacts the layer's residence status. -#[serde_as] -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct LayerResidenceEvent { - /// The time when the event occurred. - /// NB: this timestamp is captured while the residence status changes. - /// So, it might be behind/ahead of the actual residence change by a short amount of time. - /// - #[serde(rename = "timestamp_millis_since_epoch")] - #[serde_as(as = "serde_with::TimestampMilliSeconds")] - pub timestamp: SystemTime, - /// The new residence status of the layer. - pub status: LayerResidenceStatus, - /// The reason why we had to record this event. - pub reason: LayerResidenceEventReason, -} - -/// The reason for recording a given [`LayerResidenceEvent`]. -#[derive(Debug, Clone, Copy, Serialize, Deserialize)] -pub enum LayerResidenceEventReason { - /// The layer map is being populated, e.g. during timeline load or attach. - /// This includes [`RemoteLayer`] objects created in [`reconcile_with_remote`]. - /// We need to record such events because there is no persistent storage for the events. - /// - // https://github.com/rust-lang/rust/issues/74481 - /// [`RemoteLayer`]: ../../tenant/storage_layer/struct.RemoteLayer.html - /// [`reconcile_with_remote`]: ../../tenant/struct.Timeline.html#method.reconcile_with_remote - LayerLoad, - /// We just created the layer (e.g., freeze_and_flush or compaction). 
- /// Such layers are always [`LayerResidenceStatus::Resident`]. - LayerCreate, - /// We on-demand downloaded or evicted the given layer. - ResidenceChange, -} - -/// The residence status of the layer, after the given [`LayerResidenceEvent`]. +/// The residence status of a layer #[derive(Debug, Clone, Copy, Serialize, Deserialize)] pub enum LayerResidenceStatus { /// Residence status for a layer file that exists locally. @@ -795,23 +741,16 @@ pub enum LayerResidenceStatus { Evicted, } -impl LayerResidenceEvent { - pub fn new(status: LayerResidenceStatus, reason: LayerResidenceEventReason) -> Self { - Self { - status, - reason, - timestamp: SystemTime::now(), - } - } -} - +#[serde_as] #[derive(Debug, Clone, Serialize, Deserialize)] pub struct LayerAccessStats { - pub access_count_by_access_kind: HashMap, - pub task_kind_access_flag: Vec>, - pub first: Option, - pub accesses_history: HistoryBufferWithDropCounter, - pub residence_events_history: HistoryBufferWithDropCounter, + #[serde_as(as = "serde_with::TimestampMilliSeconds")] + pub access_time: SystemTime, + + #[serde_as(as = "serde_with::TimestampMilliSeconds")] + pub residence_time: SystemTime, + + pub visible: bool, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -1008,6 +947,8 @@ pub struct TopTenantShardsResponse { } pub mod virtual_file { + use std::path::PathBuf; + #[derive( Copy, Clone, @@ -1026,6 +967,53 @@ pub mod virtual_file { #[cfg(target_os = "linux")] TokioEpollUring, } + + /// Direct IO modes for a pageserver. + #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)] + #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] + pub enum DirectIoMode { + /// Direct IO disabled (uses usual buffered IO). + #[default] + Disabled, + /// Direct IO disabled (performs checks and perf simulations). + Evaluate { + /// Alignment check level + alignment_check: DirectIoAlignmentCheckLevel, + /// Latency padded for performance simulation. 
+ latency_padding: DirectIoLatencyPadding, + }, + /// Direct IO enabled. + Enabled { + /// Actions to perform on alignment error. + on_alignment_error: DirectIoOnAlignmentErrorAction, + }, + } + + #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)] + #[serde(rename_all = "kebab-case")] + pub enum DirectIoAlignmentCheckLevel { + #[default] + Error, + Log, + None, + } + + #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)] + #[serde(rename_all = "kebab-case")] + pub enum DirectIoOnAlignmentErrorAction { + Error, + #[default] + FallbackToBuffered, + } + + #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)] + #[serde(tag = "type", rename_all = "kebab-case")] + pub enum DirectIoLatencyPadding { + /// Pad virtual file operations with IO to a fake file. + FakeFileRW { path: PathBuf }, + #[default] + None, + } } // Wrapped in libpq CopyData @@ -1494,7 +1482,8 @@ mod tests { state: TenantState::Active, current_physical_size: Some(42), attachment_status: TenantAttachmentStatus::Attached, - generation: None, + generation: 1, + gc_blocking: None, }; let expected_active = json!({ "id": original_active.id.to_string(), @@ -1504,7 +1493,8 @@ mod tests { "current_physical_size": 42, "attachment_status": { "slug":"attached", - } + }, + "generation" : 1 }); let original_broken = TenantInfo { @@ -1515,7 +1505,8 @@ mod tests { }, current_physical_size: Some(42), attachment_status: TenantAttachmentStatus::Attached, - generation: None, + generation: 1, + gc_blocking: None, }; let expected_broken = json!({ "id": original_broken.id.to_string(), @@ -1529,7 +1520,8 @@ mod tests { "current_physical_size": 42, "attachment_status": { "slug":"attached", - } + }, + "generation" : 1 }); assert_eq!( @@ -1547,18 +1539,6 @@ mod tests { #[test] fn test_reject_unknown_field() { - let id = TenantId::generate(); - let create_request = json!({ - "new_tenant_id": id.to_string(), - "unknown_field": 
"unknown_value".to_string(), - }); - let err = serde_json::from_value::(create_request).unwrap_err(); - assert!( - err.to_string().contains("unknown field `unknown_field`"), - "expect unknown field `unknown_field` error, got: {}", - err - ); - let id = TenantId::generate(); let config_request = json!({ "tenant_id": id.to_string(), @@ -1570,18 +1550,6 @@ mod tests { "expect unknown field `unknown_field` error, got: {}", err ); - - let attach_request = json!({ - "config": { - "unknown_field": "unknown_value".to_string(), - }, - }); - let err = serde_json::from_value::(attach_request).unwrap_err(); - assert!( - err.to_string().contains("unknown field `unknown_field`"), - "expect unknown field `unknown_field` error, got: {}", - err - ); } #[test] @@ -1705,4 +1673,25 @@ mod tests { AuxFilePolicy::CrossValidation ); } + + #[test] + fn test_image_compression_algorithm_parsing() { + use ImageCompressionAlgorithm::*; + assert_eq!( + ImageCompressionAlgorithm::from_str("disabled").unwrap(), + Disabled + ); + assert_eq!( + ImageCompressionAlgorithm::from_str("zstd").unwrap(), + Zstd { level: None } + ); + assert_eq!( + ImageCompressionAlgorithm::from_str("zstd(18)").unwrap(), + Zstd { level: Some(18) } + ); + assert_eq!( + ImageCompressionAlgorithm::from_str("zstd(-3)").unwrap(), + Zstd { level: Some(-3) } + ); + } } diff --git a/libs/pageserver_api/src/models/detach_ancestor.rs b/libs/pageserver_api/src/models/detach_ancestor.rs index fc1f10e734..ad74d343ae 100644 --- a/libs/pageserver_api/src/models/detach_ancestor.rs +++ b/libs/pageserver_api/src/models/detach_ancestor.rs @@ -1,6 +1,8 @@ +use std::collections::HashSet; + use utils::id::TimelineId; -#[derive(Default, serde::Serialize)] +#[derive(Debug, Default, PartialEq, serde::Serialize, serde::Deserialize)] pub struct AncestorDetached { - pub reparented_timelines: Vec, + pub reparented_timelines: HashSet, } diff --git a/libs/pageserver_api/src/models/utilization.rs b/libs/pageserver_api/src/models/utilization.rs index 
e88cab5d6a..0fec221276 100644 --- a/libs/pageserver_api/src/models/utilization.rs +++ b/libs/pageserver_api/src/models/utilization.rs @@ -1,4 +1,5 @@ -use utils::serde_system_time::SystemTime; +use std::time::SystemTime; +use utils::{serde_percent::Percent, serde_system_time}; /// Pageserver current utilization and scoring for how good candidate the pageserver would be for /// the next tenant. @@ -9,19 +10,88 @@ use utils::serde_system_time::SystemTime; /// not handle full u64 values properly. #[derive(serde::Serialize, serde::Deserialize, Debug, Clone)] pub struct PageserverUtilization { - /// Used disk space + /// Used disk space (physical, ground truth from statfs()) #[serde(serialize_with = "ser_saturating_u63")] pub disk_usage_bytes: u64, /// Free disk space #[serde(serialize_with = "ser_saturating_u63")] pub free_space_bytes: u64, - /// Lower is better score for how good candidate for a next tenant would this pageserver be. - #[serde(serialize_with = "ser_saturating_u63")] + + /// Wanted disk space, based on the tenant shards currently present on this pageserver: this + /// is like disk_usage_bytes, but it is stable and does not change with the cache state of + /// tenants, whereas disk_usage_bytes may reach the disk eviction `max_usage_pct` and stay + /// there, or may be unrealistically low if the pageserver has attached tenants which haven't + /// downloaded layers yet. + #[serde(serialize_with = "ser_saturating_u63", default)] + pub disk_wanted_bytes: u64, + + // What proportion of total disk space will this pageserver use before it starts evicting data? + #[serde(default = "unity_percent")] + pub disk_usable_pct: Percent, + + // How many shards are currently on this node? + #[serde(default)] + pub shard_count: u32, + + // How many shards should this node be able to handle at most? 
+ #[serde(default)] + pub max_shard_count: u32, + + /// Cached result of [`Self::score`] pub utilization_score: u64, + /// When was this snapshot captured, pageserver local time. /// /// Use millis to give confidence that the value is regenerated often enough. - pub captured_at: SystemTime, + pub captured_at: serde_system_time::SystemTime, +} + +fn unity_percent() -> Percent { + Percent::new(0).unwrap() +} + +impl PageserverUtilization { + const UTILIZATION_FULL: u64 = 1000000; + + /// Calculate a utilization score. The result is to be interpreted as a fraction of + /// Self::UTILIZATION_FULL. + /// + /// Lower values are more affine to scheduling more work on this node. + /// - UTILIZATION_FULL represents an ideal node which is fully utilized but should not receive any more work. + /// - 0.0 represents an empty node. + /// - Negative values are forbidden + /// - Values over UTILIZATION_FULL indicate an overloaded node, which may show degraded performance due to + /// layer eviction. + pub fn score(&self) -> u64 { + let disk_usable_capacity = ((self.disk_usage_bytes + self.free_space_bytes) + * self.disk_usable_pct.get() as u64) + / 100; + let disk_utilization_score = + self.disk_wanted_bytes * Self::UTILIZATION_FULL / disk_usable_capacity; + + let shard_utilization_score = + self.shard_count as u64 * Self::UTILIZATION_FULL / self.max_shard_count as u64; + std::cmp::max(disk_utilization_score, shard_utilization_score) + } + + pub fn refresh_score(&mut self) { + self.utilization_score = self.score(); + } + + /// A utilization structure that has a full utilization score: use this as a placeholder when + /// you need a utilization but don't have real values yet. 
+ pub fn full() -> Self { + Self { + disk_usage_bytes: 1, + free_space_bytes: 0, + disk_wanted_bytes: 1, + disk_usable_pct: Percent::new(100).unwrap(), + shard_count: 1, + max_shard_count: 1, + utilization_score: Self::UTILIZATION_FULL, + captured_at: serde_system_time::SystemTime(SystemTime::now()), + } + } } /// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients. @@ -49,15 +119,19 @@ mod tests { let doc = PageserverUtilization { disk_usage_bytes: u64::MAX, free_space_bytes: 0, - utilization_score: u64::MAX, - captured_at: SystemTime( + disk_wanted_bytes: u64::MAX, + utilization_score: 13, + disk_usable_pct: Percent::new(90).unwrap(), + shard_count: 100, + max_shard_count: 200, + captured_at: serde_system_time::SystemTime( std::time::SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779), ), }; let s = serde_json::to_string(&doc).unwrap(); - let expected = r#"{"disk_usage_bytes":9223372036854775807,"free_space_bytes":0,"utilization_score":9223372036854775807,"captured_at":"2024-02-21T10:02:59.000Z"}"#; + let expected = "{\"disk_usage_bytes\":9223372036854775807,\"free_space_bytes\":0,\"disk_wanted_bytes\":9223372036854775807,\"disk_usable_pct\":90,\"shard_count\":100,\"max_shard_count\":200,\"utilization_score\":13,\"captured_at\":\"2024-02-21T10:02:59.000Z\"}"; assert_eq!(s, expected); } diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 8c5a4e6168..e83cf4c855 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -1,59 +1,42 @@ -use std::{ops::RangeInclusive, str::FromStr}; +//! See docs/rfcs/031-sharding-static.md for an overview of sharding. +//! +//! This module contains a variety of types used to represent the concept of sharding +//! a Neon tenant across multiple physical shards. Since there are quite a few of these, +//! we provide a summary here. +//! +//! Types used to describe shards: +//!
- [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value +//! which identifies a tenant which is not shard-aware. This means its storage paths do not include +//! a shard suffix. +//! - [`ShardNumber`] is simply the zero-based index of a shard within a tenant. +//! - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId` +//! without the tenant ID. This is useful for things that are implicitly scoped to a particular +//! tenant, such as layer files. +//! - [`ShardIdentity`] is the full description of a particular shard's parameters, in sufficient +//! detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read. +//! - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as +//! four hex digits. An unsharded tenant is `0000`. +//! - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant +//! +//! Types used to describe the parameters for data distribution in a sharded tenant: +//! - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across +//! multiple shards. Its value is given in 8kiB pages. +//! - [`ShardLayout`] describes the data distribution scheme, and at time of writing is +//! always zero: this is provided for future upgrades that might introduce different +//! data distribution schemes. +//! +//! Examples: +//! - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000 +//! - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001 +//! - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive), +//! and their slugs are 0004, 0104, 0204, and 0304.
use crate::{key::Key, models::ShardParameters}; -use hex::FromHex; use postgres_ffi::relfile_utils::INIT_FORKNUM; use serde::{Deserialize, Serialize}; -use utils::id::TenantId; -/// See docs/rfcs/031-sharding-static.md for an overview of sharding. -/// -/// This module contains a variety of types used to represent the concept of sharding -/// a Neon tenant across multiple physical shards. Since there are quite a few of these, -/// we provide an summary here. -/// -/// Types used to describe shards: -/// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value -/// which identifies a tenant which is not shard-aware. This means its storage paths do not include -/// a shard suffix. -/// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant. -/// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId` -/// without the tenant ID. This is useful for things that are implicitly scoped to a particular -/// tenant, such as layer files. -/// - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient -/// detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read. -/// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as -/// four hex digits. An unsharded tenant is `0000`. -/// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant -/// -/// Types used to describe the parameters for data distribution in a sharded tenant: -/// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across -/// multiple shards. Its value is given in 8kiB pages. -/// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is -/// always zero: this is provided for future upgrades that might introduce different -/// data distribution schemes. 
-/// -/// Examples: -/// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000 -/// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001 -/// - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive), -/// and their slugs are 0004, 0104, 0204, and 0304. - -#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] -pub struct ShardNumber(pub u8); - -#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] -pub struct ShardCount(u8); - -/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant, -/// when we need to know which shard we're dealing with, but do not need to know the full -/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know -/// the fully qualified TenantShardId. -#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] -pub struct ShardIndex { - pub shard_number: ShardNumber, - pub shard_count: ShardCount, -} +#[doc(inline)] +pub use ::utils::shard::*; /// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`], /// and to check whether that [`ShardNumber`] is the same as the current shard. @@ -65,362 +48,6 @@ pub struct ShardIdentity { layout: ShardLayout, } -/// Formatting helper, for generating the `shard_id` label in traces. -struct ShardSlug<'a>(&'a TenantShardId); - -/// TenantShardId globally identifies a particular shard in a particular tenant. -/// -/// These are written as `-`, for example: -/// # The second shard in a two-shard tenant -/// 072f1291a5310026820b2fe4b2968934-0102 -/// -/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without -/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables -/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`]. 
-/// -/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs, -/// is both forward and backward compatible with TenantId: a legacy TenantId can be -/// decoded as a TenantShardId, and when re-encoded it will be parseable -/// as a TenantId. -#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] -pub struct TenantShardId { - pub tenant_id: TenantId, - pub shard_number: ShardNumber, - pub shard_count: ShardCount, -} - -impl ShardCount { - pub const MAX: Self = Self(u8::MAX); - - /// The internal value of a ShardCount may be zero, which means "1 shard, but use - /// legacy format for TenantShardId that excludes the shard suffix", also known - /// as [`TenantShardId::unsharded`]. - /// - /// This method returns the actual number of shards, i.e. if our internal value is - /// zero, we return 1 (unsharded tenants have 1 shard). - pub fn count(&self) -> u8 { - if self.0 > 0 { - self.0 - } else { - 1 - } - } - - /// The literal internal value: this is **not** the number of shards in the - /// tenant, as we have a special zero value for legacy unsharded tenants. Use - /// [`Self::count`] if you want to know the cardinality of shards. - pub fn literal(&self) -> u8 { - self.0 - } - - /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but - /// uses the legacy format for `TenantShardId`. See also the documentation for - /// [`Self::count`]. - pub fn is_unsharded(&self) -> bool { - self.0 == 0 - } - - /// `v` may be zero, or the number of shards in the tenant. `v` is what - /// [`Self::literal`] would return. - pub const fn new(val: u8) -> Self { - Self(val) - } -} - -impl ShardNumber { - pub const MAX: Self = Self(u8::MAX); -} - -impl TenantShardId { - pub fn unsharded(tenant_id: TenantId) -> Self { - Self { - tenant_id, - shard_number: ShardNumber(0), - shard_count: ShardCount(0), - } - } - - /// The range of all TenantShardId that belong to a particular TenantId. 
This is useful when - /// you have a BTreeMap of TenantShardId, and are querying by TenantId. - pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive { - RangeInclusive::new( - Self { - tenant_id, - shard_number: ShardNumber(0), - shard_count: ShardCount(0), - }, - Self { - tenant_id, - shard_number: ShardNumber::MAX, - shard_count: ShardCount::MAX, - }, - ) - } - - pub fn shard_slug(&self) -> impl std::fmt::Display + '_ { - ShardSlug(self) - } - - /// Convenience for code that has special behavior on the 0th shard. - pub fn is_shard_zero(&self) -> bool { - self.shard_number == ShardNumber(0) - } - - /// The "unsharded" value is distinct from simply having a single shard: it represents - /// a tenant which is not shard-aware at all, and whose storage paths will not include - /// a shard suffix. - pub fn is_unsharded(&self) -> bool { - self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded() - } - - /// Convenience for dropping the tenant_id and just getting the ShardIndex: this - /// is useful when logging from code that is already in a span that includes tenant ID, to - /// keep messages reasonably terse. - pub fn to_index(&self) -> ShardIndex { - ShardIndex { - shard_number: self.shard_number, - shard_count: self.shard_count, - } - } - - /// Calculate the children of this TenantShardId when splitting the overall tenant into - /// the given number of shards. - pub fn split(&self, new_shard_count: ShardCount) -> Vec { - let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1); - let mut child_shards = Vec::new(); - for shard_number in 0..ShardNumber(new_shard_count.0).0 { - // Key mapping is based on a round robin mapping of key hash modulo shard count, - // so our child shards are the ones which the same keys would map to. 
- if shard_number % effective_old_shard_count == self.shard_number.0 { - child_shards.push(TenantShardId { - tenant_id: self.tenant_id, - shard_number: ShardNumber(shard_number), - shard_count: new_shard_count, - }) - } - } - - child_shards - } -} - -impl<'a> std::fmt::Display for ShardSlug<'a> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "{:02x}{:02x}", - self.0.shard_number.0, self.0.shard_count.0 - ) - } -} - -impl std::fmt::Display for TenantShardId { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - if self.shard_count != ShardCount(0) { - write!(f, "{}-{}", self.tenant_id, self.shard_slug()) - } else { - // Legacy case (shard_count == 0) -- format as just the tenant id. Note that this - // is distinct from the normal single shard case (shard count == 1). - self.tenant_id.fmt(f) - } - } -} - -impl std::fmt::Debug for TenantShardId { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - // Debug is the same as Display: the compact hex representation - write!(f, "{}", self) - } -} - -impl std::str::FromStr for TenantShardId { - type Err = hex::FromHexError; - - fn from_str(s: &str) -> Result { - // Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count - if s.len() == 32 { - // Legacy case: no shard specified - Ok(Self { - tenant_id: TenantId::from_str(s)?, - shard_number: ShardNumber(0), - shard_count: ShardCount(0), - }) - } else if s.len() == 37 { - let bytes = s.as_bytes(); - let tenant_id = TenantId::from_hex(&bytes[0..32])?; - let mut shard_parts: [u8; 2] = [0u8; 2]; - hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?; - Ok(Self { - tenant_id, - shard_number: ShardNumber(shard_parts[0]), - shard_count: ShardCount(shard_parts[1]), - }) - } else { - Err(hex::FromHexError::InvalidStringLength) - } - } -} - -impl From<[u8; 18]> for TenantShardId { - fn from(b: [u8; 18]) -> Self { - let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap(); - - 
Self { - tenant_id: TenantId::from(tenant_id_bytes), - shard_number: ShardNumber(b[16]), - shard_count: ShardCount(b[17]), - } - } -} - -impl ShardIndex { - pub fn new(number: ShardNumber, count: ShardCount) -> Self { - Self { - shard_number: number, - shard_count: count, - } - } - pub fn unsharded() -> Self { - Self { - shard_number: ShardNumber(0), - shard_count: ShardCount(0), - } - } - - /// The "unsharded" value is distinct from simply having a single shard: it represents - /// a tenant which is not shard-aware at all, and whose storage paths will not include - /// a shard suffix. - pub fn is_unsharded(&self) -> bool { - self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0) - } - - /// For use in constructing remote storage paths: concatenate this with a TenantId - /// to get a fully qualified TenantShardId. - /// - /// Backward compat: this function returns an empty string if Self::is_unsharded, such - /// that the legacy pre-sharding remote key format is preserved. 
- pub fn get_suffix(&self) -> String { - if self.is_unsharded() { - "".to_string() - } else { - format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0) - } - } -} - -impl std::fmt::Display for ShardIndex { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0) - } -} - -impl std::fmt::Debug for ShardIndex { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - // Debug is the same as Display: the compact hex representation - write!(f, "{}", self) - } -} - -impl std::str::FromStr for ShardIndex { - type Err = hex::FromHexError; - - fn from_str(s: &str) -> Result { - // Expect format: 1 byte shard number, 1 byte shard count - if s.len() == 4 { - let bytes = s.as_bytes(); - let mut shard_parts: [u8; 2] = [0u8; 2]; - hex::decode_to_slice(bytes, &mut shard_parts)?; - Ok(Self { - shard_number: ShardNumber(shard_parts[0]), - shard_count: ShardCount(shard_parts[1]), - }) - } else { - Err(hex::FromHexError::InvalidStringLength) - } - } -} - -impl From<[u8; 2]> for ShardIndex { - fn from(b: [u8; 2]) -> Self { - Self { - shard_number: ShardNumber(b[0]), - shard_count: ShardCount(b[1]), - } - } -} - -impl Serialize for TenantShardId { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - if serializer.is_human_readable() { - serializer.collect_str(self) - } else { - // Note: while human encoding of [`TenantShardId`] is backward and forward - // compatible, this binary encoding is not. 
- let mut packed: [u8; 18] = [0; 18]; - packed[0..16].clone_from_slice(&self.tenant_id.as_arr()); - packed[16] = self.shard_number.0; - packed[17] = self.shard_count.0; - - packed.serialize(serializer) - } - } -} - -impl<'de> Deserialize<'de> for TenantShardId { - fn deserialize(deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - struct IdVisitor { - is_human_readable_deserializer: bool, - } - - impl<'de> serde::de::Visitor<'de> for IdVisitor { - type Value = TenantShardId; - - fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { - if self.is_human_readable_deserializer { - formatter.write_str("value in form of hex string") - } else { - formatter.write_str("value in form of integer array([u8; 18])") - } - } - - fn visit_seq(self, seq: A) -> Result - where - A: serde::de::SeqAccess<'de>, - { - let s = serde::de::value::SeqAccessDeserializer::new(seq); - let id: [u8; 18] = Deserialize::deserialize(s)?; - Ok(TenantShardId::from(id)) - } - - fn visit_str(self, v: &str) -> Result - where - E: serde::de::Error, - { - TenantShardId::from_str(v).map_err(E::custom) - } - } - - if deserializer.is_human_readable() { - deserializer.deserialize_str(IdVisitor { - is_human_readable_deserializer: true, - }) - } else { - deserializer.deserialize_tuple( - 18, - IdVisitor { - is_human_readable_deserializer: false, - }, - ) - } - } -} - /// Stripe size in number of pages #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] pub struct ShardStripeSize(pub u32); @@ -585,77 +212,6 @@ impl ShardIdentity { } } -impl Serialize for ShardIndex { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - if serializer.is_human_readable() { - serializer.collect_str(self) - } else { - // Binary encoding is not used in index_part.json, but is included in anticipation of - // switching various structures (e.g. inter-process communication, remote metadata) to more - // compact binary encodings in future. 
- let mut packed: [u8; 2] = [0; 2]; - packed[0] = self.shard_number.0; - packed[1] = self.shard_count.0; - packed.serialize(serializer) - } - } -} - -impl<'de> Deserialize<'de> for ShardIndex { - fn deserialize(deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - struct IdVisitor { - is_human_readable_deserializer: bool, - } - - impl<'de> serde::de::Visitor<'de> for IdVisitor { - type Value = ShardIndex; - - fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { - if self.is_human_readable_deserializer { - formatter.write_str("value in form of hex string") - } else { - formatter.write_str("value in form of integer array([u8; 2])") - } - } - - fn visit_seq(self, seq: A) -> Result - where - A: serde::de::SeqAccess<'de>, - { - let s = serde::de::value::SeqAccessDeserializer::new(seq); - let id: [u8; 2] = Deserialize::deserialize(s)?; - Ok(ShardIndex::from(id)) - } - - fn visit_str(self, v: &str) -> Result - where - E: serde::de::Error, - { - ShardIndex::from_str(v).map_err(E::custom) - } - } - - if deserializer.is_human_readable() { - deserializer.deserialize_str(IdVisitor { - is_human_readable_deserializer: true, - }) - } else { - deserializer.deserialize_tuple( - 2, - IdVisitor { - is_human_readable_deserializer: false, - }, - ) - } - } -} - /// Whether this key is always held on shard 0 (e.g. shard 0 holds all SLRU keys /// in order to be able to serve basebackup requests without peer communication). 
fn key_is_shard0(key: &Key) -> bool { @@ -737,7 +293,9 @@ pub fn describe( #[cfg(test)] mod tests { - use utils::Hex; + use std::str::FromStr; + + use utils::{id::TenantId, Hex}; use super::*; diff --git a/libs/postgres_backend/Cargo.toml b/libs/postgres_backend/Cargo.toml index 8e249c09f7..f6854328fc 100644 --- a/libs/postgres_backend/Cargo.toml +++ b/libs/postgres_backend/Cargo.toml @@ -13,14 +13,14 @@ rustls.workspace = true serde.workspace = true thiserror.workspace = true tokio.workspace = true +tokio-util.workspace = true tokio-rustls.workspace = true tracing.workspace = true pq_proto.workspace = true -workspace_hack.workspace = true [dev-dependencies] once_cell.workspace = true rustls-pemfile.workspace = true tokio-postgres.workspace = true -tokio-postgres-rustls.workspace = true \ No newline at end of file +tokio-postgres-rustls.workspace = true diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 6c41b7f347..7c7c6535b3 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -16,6 +16,7 @@ use std::{fmt, io}; use std::{future::Future, str::FromStr}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_rustls::TlsAcceptor; +use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, trace, warn}; use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter}; @@ -400,21 +401,15 @@ impl PostgresBackend { } /// Wrapper for run_message_loop() that shuts down socket when we are done - pub async fn run( + pub async fn run( mut self, handler: &mut impl Handler, - shutdown_watcher: F, - ) -> Result<(), QueryError> - where - F: Fn() -> S + Clone, - S: Future, - { - let ret = self - .run_message_loop(handler, shutdown_watcher.clone()) - .await; + cancel: &CancellationToken, + ) -> Result<(), QueryError> { + let ret = self.run_message_loop(handler, cancel).await; tokio::select! 
{ - _ = shutdown_watcher() => { + _ = cancel.cancelled() => { // do nothing; we most likely got already stopped by shutdown and will log it next. } _ = self.framed.shutdown() => { @@ -444,21 +439,17 @@ impl PostgresBackend { } } - async fn run_message_loop( + async fn run_message_loop( &mut self, handler: &mut impl Handler, - shutdown_watcher: F, - ) -> Result<(), QueryError> - where - F: Fn() -> S, - S: Future, - { + cancel: &CancellationToken, + ) -> Result<(), QueryError> { trace!("postgres backend to {:?} started", self.peer_addr); tokio::select!( biased; - _ = shutdown_watcher() => { + _ = cancel.cancelled() => { // We were requested to shut down. tracing::info!("shutdown request received during handshake"); return Err(QueryError::Shutdown) @@ -473,7 +464,7 @@ impl PostgresBackend { let mut query_string = Bytes::new(); while let Some(msg) = tokio::select!( biased; - _ = shutdown_watcher() => { + _ = cancel.cancelled() => { // We were requested to shut down. tracing::info!("shutdown request received in run_message_loop"); return Err(QueryError::Shutdown) @@ -485,7 +476,7 @@ impl PostgresBackend { let result = self.process_message(handler, msg, &mut query_string).await; tokio::select!( biased; - _ = shutdown_watcher() => { + _ = cancel.cancelled() => { // We were requested to shut down. 
tracing::info!("shutdown request received during response flush"); @@ -672,11 +663,17 @@ impl PostgresBackend { assert!(self.state < ProtoState::Authentication); let have_tls = self.tls_config.is_some(); match msg { - FeStartupPacket::SslRequest => { + FeStartupPacket::SslRequest { direct } => { debug!("SSL requested"); - self.write_message(&BeMessage::EncryptionResponse(have_tls)) - .await?; + if !direct { + self.write_message(&BeMessage::EncryptionResponse(have_tls)) + .await?; + } else if !have_tls { + return Err(QueryError::Other(anyhow::anyhow!( + "direct SSL negotiation but no TLS support" + ))); + } if have_tls { self.start_tls().await?; diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs index 80df9db858..7ec85f0dbe 100644 --- a/libs/postgres_backend/tests/simple_select.rs +++ b/libs/postgres_backend/tests/simple_select.rs @@ -3,13 +3,14 @@ use once_cell::sync::Lazy; use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError}; use pq_proto::{BeMessage, RowDescriptor}; use std::io::Cursor; -use std::{future, sync::Arc}; +use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::{TcpListener, TcpStream}; use tokio_postgres::config::SslMode; use tokio_postgres::tls::MakeTlsConnect; use tokio_postgres::{Config, NoTls, SimpleQueryMessage}; use tokio_postgres_rustls::MakeRustlsConnect; +use tokio_util::sync::CancellationToken; // generate client, server test streams async fn make_tcp_pair() -> (TcpStream, TcpStream) { @@ -50,7 +51,7 @@ async fn simple_select() { tokio::spawn(async move { let mut handler = TestHandler {}; - pgbackend.run(&mut handler, future::pending::<()>).await + pgbackend.run(&mut handler, &CancellationToken::new()).await }); let conf = Config::new(); @@ -102,7 +103,7 @@ async fn simple_select_ssl() { tokio::spawn(async move { let mut handler = TestHandler {}; - pgbackend.run(&mut handler, future::pending::<()>).await + pgbackend.run(&mut handler, 
&CancellationToken::new()).await }); let client_cfg = rustls::ClientConfig::builder() diff --git a/libs/postgres_connection/Cargo.toml b/libs/postgres_connection/Cargo.toml index fbfea80ae2..19027d13ff 100644 --- a/libs/postgres_connection/Cargo.toml +++ b/libs/postgres_connection/Cargo.toml @@ -11,7 +11,5 @@ postgres.workspace = true tokio-postgres.workspace = true url.workspace = true -workspace_hack.workspace = true - [dev-dependencies] once_cell.workspace = true diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml index 86e72f6bdd..ee69878f69 100644 --- a/libs/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -19,8 +19,6 @@ thiserror.workspace = true serde.workspace = true utils.workspace = true -workspace_hack.workspace = true - [dev-dependencies] env_logger.workspace = true postgres.workspace = true diff --git a/libs/postgres_ffi/src/controlfile_utils.rs b/libs/postgres_ffi/src/controlfile_utils.rs index 0918d15001..eaa9450294 100644 --- a/libs/postgres_ffi/src/controlfile_utils.rs +++ b/libs/postgres_ffi/src/controlfile_utils.rs @@ -29,7 +29,7 @@ use anyhow::{bail, Result}; use bytes::{Bytes, BytesMut}; /// Equivalent to sizeof(ControlFileData) in C -const SIZEOF_CONTROLDATA: usize = std::mem::size_of::(); +const SIZEOF_CONTROLDATA: usize = size_of::(); impl ControlFileData { /// Compute the offset of the `crc` field within the `ControlFileData` struct. 
diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 729f57f829..0940ad207f 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -143,8 +143,8 @@ pub use v14::xlog_utils::XLogFileName; pub use v14::bindings::DBState_DB_SHUTDOWNED; -pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> anyhow::Result { - dispatch_pgversion!(version, Ok(pgv::bindings::bkpimg_is_compressed(bimg_info))) +pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> bool { + dispatch_pgversion!(version, pgv::bindings::bkpimg_is_compressed(bimg_info)) } pub fn generate_wal_segment( diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index 54b032d138..6ce855c78e 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -31,7 +31,7 @@ pub const SMGR_TRUNCATE_FSM: u32 = 0x0004; // // Assumes 8 byte alignment -const SIZEOF_PAGE_HEADER_DATA: usize = std::mem::size_of::(); +const SIZEOF_PAGE_HEADER_DATA: usize = size_of::(); pub const MAXALIGN_SIZE_OF_PAGE_HEADER_DATA: usize = (SIZEOF_PAGE_HEADER_DATA + 7) & !7; // @@ -191,7 +191,7 @@ pub const XLR_RMGR_INFO_MASK: u8 = 0xF0; pub const XLOG_TBLSPC_CREATE: u8 = 0x00; pub const XLOG_TBLSPC_DROP: u8 = 0x10; -pub const SIZEOF_XLOGRECORD: u32 = std::mem::size_of::() as u32; +pub const SIZEOF_XLOGRECORD: u32 = size_of::() as u32; // // from xlogrecord.h diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 0bbb91afc2..9fe7e8198b 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -42,9 +42,9 @@ pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001; pub const XLP_REM_LEN_OFFS: usize = 2 + 2 + 4 + 8; pub const XLOG_RECORD_CRC_OFFS: usize = 4 + 4 + 8 + 1 + 1 + 2; -pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = std::mem::size_of::(); -pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = std::mem::size_of::(); -pub const XLOG_SIZE_OF_XLOG_RECORD: 
usize = std::mem::size_of::(); +pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = size_of::(); +pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = size_of::(); +pub const XLOG_SIZE_OF_XLOG_RECORD: usize = size_of::(); #[allow(clippy::identity_op)] pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2; @@ -311,7 +311,7 @@ impl XLogLongPageHeaderData { } } -pub const SIZEOF_CHECKPOINT: usize = std::mem::size_of::(); +pub const SIZEOF_CHECKPOINT: usize = size_of::(); impl CheckPoint { pub fn encode(&self) -> Result { @@ -356,6 +356,28 @@ impl CheckPoint { } false } + + /// Advance next multi-XID/offset to those given in arguments. + /// + /// It's important that this handles wraparound correctly. This should match the + /// MultiXactAdvanceNextMXact() logic in PostgreSQL's xlog_redo() function. + /// + /// Returns 'true' if the Checkpoint was updated. + pub fn update_next_multixid(&mut self, multi_xid: u32, multi_offset: u32) -> bool { + let mut modified = false; + + if multi_xid.wrapping_sub(self.nextMulti) as i32 > 0 { + self.nextMulti = multi_xid; + modified = true; + } + + if multi_offset.wrapping_sub(self.nextMultiOffset) as i32 > 0 { + self.nextMultiOffset = multi_offset; + modified = true; + } + + modified + } } /// Generate new, empty WAL segment, with correct block headers at the first diff --git a/libs/postgres_ffi/wal_craft/Cargo.toml b/libs/postgres_ffi/wal_craft/Cargo.toml index 0edc642402..29dd01a936 100644 --- a/libs/postgres_ffi/wal_craft/Cargo.toml +++ b/libs/postgres_ffi/wal_craft/Cargo.toml @@ -14,8 +14,6 @@ postgres.workspace = true postgres_ffi.workspace = true camino-tempfile.workspace = true -workspace_hack.workspace = true - [dev-dependencies] regex.workspace = true utils.workspace = true diff --git a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs index 496458b2e4..79d45de67a 100644 --- a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs +++ 
b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs @@ -178,7 +178,7 @@ pub fn test_find_end_of_wal_last_crossing_segment() { /// currently 1024. #[test] pub fn test_update_next_xid() { - let checkpoint_buf = [0u8; std::mem::size_of::()]; + let checkpoint_buf = [0u8; size_of::()]; let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap(); checkpoint.nextXid = FullTransactionId { value: 10 }; @@ -202,6 +202,53 @@ pub fn test_update_next_xid() { assert_eq!(checkpoint.nextXid.value, 2048); } +#[test] +pub fn test_update_next_multixid() { + let checkpoint_buf = [0u8; size_of::()]; + let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap(); + + // simple case + checkpoint.nextMulti = 20; + checkpoint.nextMultiOffset = 20; + checkpoint.update_next_multixid(1000, 2000); + assert_eq!(checkpoint.nextMulti, 1000); + assert_eq!(checkpoint.nextMultiOffset, 2000); + + // No change + checkpoint.update_next_multixid(500, 900); + assert_eq!(checkpoint.nextMulti, 1000); + assert_eq!(checkpoint.nextMultiOffset, 2000); + + // Close to wraparound, but not wrapped around yet + checkpoint.nextMulti = 0xffff0000; + checkpoint.nextMultiOffset = 0xfffe0000; + checkpoint.update_next_multixid(0xffff00ff, 0xfffe00ff); + assert_eq!(checkpoint.nextMulti, 0xffff00ff); + assert_eq!(checkpoint.nextMultiOffset, 0xfffe00ff); + + // Wraparound + checkpoint.update_next_multixid(1, 900); + assert_eq!(checkpoint.nextMulti, 1); + assert_eq!(checkpoint.nextMultiOffset, 900); + + // Wraparound nextMulti to 0. + // + // It's a bit surprising that nextMulti can be 0, because that's a special value + // (InvalidMultiXactId). However, that's how Postgres does it at multi-xid wraparound: + // nextMulti wraps around to 0, but then when the next multi-xid is assigned, it skips + // the 0 and the next multi-xid actually assigned is 1. 
+ checkpoint.nextMulti = 0xffff0000; + checkpoint.nextMultiOffset = 0xfffe0000; + checkpoint.update_next_multixid(0, 0xfffe00ff); + assert_eq!(checkpoint.nextMulti, 0); + assert_eq!(checkpoint.nextMultiOffset, 0xfffe00ff); + + // Wraparound nextMultiOffset to 0 + checkpoint.update_next_multixid(0, 0); + assert_eq!(checkpoint.nextMulti, 0); + assert_eq!(checkpoint.nextMultiOffset, 0); +} + #[test] pub fn test_encode_logical_message() { let expected = [ diff --git a/libs/pq_proto/Cargo.toml b/libs/pq_proto/Cargo.toml index 8afabe670e..66bbe03ebc 100644 --- a/libs/pq_proto/Cargo.toml +++ b/libs/pq_proto/Cargo.toml @@ -11,9 +11,7 @@ itertools.workspace = true pin-project-lite.workspace = true postgres-protocol.workspace = true rand.workspace = true -tokio.workspace = true +tokio = { workspace = true, features = ["io-util"] } tracing.workspace = true thiserror.workspace = true serde.workspace = true - -workspace_hack.workspace = true diff --git a/libs/pq_proto/src/framed.rs b/libs/pq_proto/src/framed.rs index 6e97b8c2a0..ccbb90e384 100644 --- a/libs/pq_proto/src/framed.rs +++ b/libs/pq_proto/src/framed.rs @@ -44,9 +44,9 @@ impl ConnectionError { /// Wraps async io `stream`, providing messages to write/flush + read Postgres /// messages. 
pub struct Framed { - stream: S, - read_buf: BytesMut, - write_buf: BytesMut, + pub stream: S, + pub read_buf: BytesMut, + pub write_buf: BytesMut, } impl Framed { diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index cee3742017..a01191bd5d 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -39,14 +39,39 @@ pub enum FeMessage { PasswordMessage(Bytes), } +#[derive(Clone, Copy, PartialEq, PartialOrd)] +pub struct ProtocolVersion(u32); + +impl ProtocolVersion { + pub const fn new(major: u16, minor: u16) -> Self { + Self((major as u32) << 16 | minor as u32) + } + pub const fn minor(self) -> u16 { + self.0 as u16 + } + pub const fn major(self) -> u16 { + (self.0 >> 16) as u16 + } +} + +impl fmt::Debug for ProtocolVersion { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_list() + .entry(&self.major()) + .entry(&self.minor()) + .finish() + } +} + #[derive(Debug)] pub enum FeStartupPacket { CancelRequest(CancelKeyData), - SslRequest, + SslRequest { + direct: bool, + }, GssEncRequest, StartupMessage { - major_version: u32, - minor_version: u32, + version: ProtocolVersion, params: StartupMessageParams, }, } @@ -301,11 +326,23 @@ impl FeStartupPacket { /// different from [`FeMessage::parse`] because startup messages don't have /// message type byte; otherwise, its comments apply. 
pub fn parse(buf: &mut BytesMut) -> Result, ProtocolError> { + /// const MAX_STARTUP_PACKET_LENGTH: usize = 10000; - const RESERVED_INVALID_MAJOR_VERSION: u32 = 1234; - const CANCEL_REQUEST_CODE: u32 = 5678; - const NEGOTIATE_SSL_CODE: u32 = 5679; - const NEGOTIATE_GSS_CODE: u32 = 5680; + const RESERVED_INVALID_MAJOR_VERSION: u16 = 1234; + /// + const CANCEL_REQUEST_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5678); + /// + const NEGOTIATE_SSL_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5679); + /// + const NEGOTIATE_GSS_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5680); + + // + // First byte indicates standard SSL handshake message + // (It can't be a Postgres startup length because in network byte order + // that would be a startup packet hundreds of megabytes long) + if buf.first() == Some(&0x16) { + return Ok(Some(FeStartupPacket::SslRequest { direct: true })); + } // need at least 4 bytes with packet len if buf.len() < 4 { @@ -338,12 +375,10 @@ impl FeStartupPacket { let mut msg = buf.split_to(len).freeze(); msg.advance(4); // consume len - let request_code = msg.get_u32(); - let req_hi = request_code >> 16; - let req_lo = request_code & ((1 << 16) - 1); + let request_code = ProtocolVersion(msg.get_u32()); // StartupMessage, CancelRequest, SSLRequest etc are differentiated by request code. 
- let message = match (req_hi, req_lo) { - (RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => { + let message = match request_code { + CANCEL_REQUEST_CODE => { if msg.remaining() != 8 { return Err(ProtocolError::BadMessage( "CancelRequest message is malformed, backend PID / secret key missing" @@ -355,21 +390,22 @@ impl FeStartupPacket { cancel_key: msg.get_i32(), }) } - (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => { + NEGOTIATE_SSL_CODE => { // Requested upgrade to SSL (aka TLS) - FeStartupPacket::SslRequest + FeStartupPacket::SslRequest { direct: false } } - (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_GSS_CODE) => { + NEGOTIATE_GSS_CODE => { // Requested upgrade to GSSAPI FeStartupPacket::GssEncRequest } - (RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => { + version if version.major() == RESERVED_INVALID_MAJOR_VERSION => { return Err(ProtocolError::Protocol(format!( - "Unrecognized request code {unrecognized_code}" + "Unrecognized request code {}", + version.minor() ))); } // TODO bail if protocol major_version is not 3? - (major_version, minor_version) => { + version => { // StartupMessage let s = str::from_utf8(&msg).map_err(|_e| { @@ -382,8 +418,7 @@ impl FeStartupPacket { })?; FeStartupPacket::StartupMessage { - major_version, - minor_version, + version, params: StartupMessageParams { params: msg.slice_ref(s.as_bytes()), }, @@ -522,6 +557,10 @@ pub enum BeMessage<'a> { RowDescription(&'a [RowDescriptor<'a>]), XLogData(XLogDataBody<'a>), NoticeResponse(&'a str), + NegotiateProtocolVersion { + version: ProtocolVersion, + options: &'a [&'a str], + }, KeepAlive(WalSndKeepAlive), } @@ -945,6 +984,18 @@ impl<'a> BeMessage<'a> { buf.put_u8(u8::from(req.request_reply)); }); } + + BeMessage::NegotiateProtocolVersion { version, options } => { + buf.put_u8(b'v'); + write_body(buf, |buf| { + buf.put_u32(version.0); + buf.put_u32(options.len() as u32); + for option in options.iter() { + write_cstr(option, buf)?; + } + Ok(()) + })? 
+ } } Ok(()) } diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 78da01c9a0..02adee058f 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -7,6 +7,7 @@ license.workspace = true [dependencies] anyhow.workspace = true async-trait.workspace = true +async-stream.workspace = true once_cell.workspace = true aws-smithy-async.workspace = true aws-smithy-types.workspace = true @@ -14,8 +15,9 @@ aws-config.workspace = true aws-sdk-s3.workspace = true aws-credential-types.workspace = true bytes.workspace = true -camino.workspace = true +camino = { workspace = true, features = ["serde1"] } humantime.workspace = true +humantime-serde.workspace = true hyper = { workspace = true, features = ["stream"] } futures.workspace = true rand.workspace = true @@ -30,7 +32,7 @@ scopeguard.workspace = true metrics.workspace = true utils.workspace = true pin-project-lite.workspace = true -workspace_hack.workspace = true + azure_core.workspace = true azure_identity.workspace = true azure_storage.workspace = true @@ -44,3 +46,4 @@ sync_wrapper = { workspace = true, features = ["futures"] } camino-tempfile.workspace = true test-context.workspace = true rand.workspace = true +tokio = { workspace = true, features = ["test-util"] } diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index dbd64fb5a6..cb7479f6cd 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -15,7 +15,7 @@ use std::time::SystemTime; use super::REMOTE_STORAGE_PREFIX_SEPARATOR; use anyhow::Result; use azure_core::request_options::{MaxResults, Metadata, Range}; -use azure_core::RetryOptions; +use azure_core::{Continuable, RetryOptions}; use azure_identity::DefaultAzureCredential; use azure_storage::StorageCredentials; use azure_storage_blobs::blob::CopyStatus; @@ -33,13 +33,15 @@ use tracing::debug; use utils::backoff; use crate::metrics::{start_measuring_requests, AttemptOutcome, 
RequestKind}; +use crate::ListingObject; use crate::{ - error::Cancelled, AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing, + config::AzureConfig, error::Cancelled, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel, }; pub struct AzureBlobStorage { client: ContainerClient, + container_name: String, prefix_in_container: Option, max_keys_per_list_response: Option, concurrency_limiter: ConcurrencyLimiter, @@ -85,6 +87,7 @@ impl AzureBlobStorage { Ok(AzureBlobStorage { client, + container_name: azure_config.container_name.to_owned(), prefix_in_container: azure_config.prefix_in_container.to_owned(), max_keys_per_list_response, concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()), @@ -238,6 +241,10 @@ impl AzureBlobStorage { _ = cancel.cancelled() => Err(Cancelled), } } + + pub fn container_name(&self) -> &str { + &self.container_name + } } fn to_azure_metadata(metadata: StorageMetadata) -> Metadata { @@ -261,30 +268,30 @@ fn to_download_error(error: azure_core::Error) -> DownloadError { } impl RemoteStorage for AzureBlobStorage { - async fn list( + fn list_streaming( &self, prefix: Option<&RemotePath>, mode: ListingMode, max_keys: Option, cancel: &CancellationToken, - ) -> anyhow::Result { - let _permit = self.permit(RequestKind::List, cancel).await?; + ) -> impl Stream> { + // get the passed prefix or if it is not set use prefix_in_bucket value + let list_prefix = prefix + .map(|p| self.relative_path_to_name(p)) + .or_else(|| self.prefix_in_container.clone()) + .map(|mut p| { + // required to end with a separator + // otherwise request will return only the entry of a prefix + if matches!(mode, ListingMode::WithDelimiter) + && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) + { + p.push(REMOTE_STORAGE_PREFIX_SEPARATOR); + } + p + }); - let op = async { - // get the passed prefix or if it is not set use prefix_in_bucket value - let 
list_prefix = prefix - .map(|p| self.relative_path_to_name(p)) - .or_else(|| self.prefix_in_container.clone()) - .map(|mut p| { - // required to end with a separator - // otherwise request will return only the entry of a prefix - if matches!(mode, ListingMode::WithDelimiter) - && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) - { - p.push(REMOTE_STORAGE_PREFIX_SEPARATOR); - } - p - }); + async_stream::stream! { + let _permit = self.permit(RequestKind::List, cancel).await?; let mut builder = self.client.list_blobs(); @@ -300,21 +307,43 @@ impl RemoteStorage for AzureBlobStorage { builder = builder.max_results(MaxResults::new(limit)); } - let response = builder.into_stream(); - let response = response.into_stream().map_err(to_download_error); - let response = tokio_stream::StreamExt::timeout(response, self.timeout); - let response = response.map(|res| match res { - Ok(res) => res, - Err(_elapsed) => Err(DownloadError::Timeout), - }); + let mut next_marker = None; - let mut response = std::pin::pin!(response); + 'outer: loop { + let mut builder = builder.clone(); + if let Some(marker) = next_marker.clone() { + builder = builder.marker(marker); + } + let response = builder.into_stream(); + let response = response.into_stream().map_err(to_download_error); + let response = tokio_stream::StreamExt::timeout(response, self.timeout); + let response = response.map(|res| match res { + Ok(res) => res, + Err(_elapsed) => Err(DownloadError::Timeout), + }); - let mut res = Listing::default(); + let mut response = std::pin::pin!(response); - let mut max_keys = max_keys.map(|mk| mk.get()); - while let Some(entry) = response.next().await { - let entry = entry?; + let mut max_keys = max_keys.map(|mk| mk.get()); + let next_item = tokio::select! { + op = response.next() => Ok(op), + _ = cancel.cancelled() => Err(DownloadError::Cancelled), + }?; + let Some(entry) = next_item else { + // The list is complete, so yield it. 
+ break; + }; + + let mut res = Listing::default(); + let entry = match entry { + Ok(entry) => entry, + Err(e) => { + // The error is potentially retryable, so we must rewind the loop after yielding. + yield Err(e); + continue; + } + }; + next_marker = entry.continuation(); let prefix_iter = entry .blobs .prefixes() @@ -324,7 +353,12 @@ impl RemoteStorage for AzureBlobStorage { let blob_iter = entry .blobs .blobs() - .map(|k| self.name_to_relative_path(&k.name)); + .map(|k| ListingObject{ + key: self.name_to_relative_path(&k.name), + last_modified: k.properties.last_modified.into(), + size: k.properties.content_length, + } + ); for key in blob_iter { res.keys.push(key); @@ -333,20 +367,62 @@ impl RemoteStorage for AzureBlobStorage { assert!(mk > 0); mk -= 1; if mk == 0 { - return Ok(res); // limit reached + yield Ok(res); // limit reached + break 'outer; } max_keys = Some(mk); } } - } + yield Ok(res); - Ok(res) + // We are done here + if next_marker.is_none() { + break; + } + } + } + } + + async fn head_object( + &self, + key: &RemotePath, + cancel: &CancellationToken, + ) -> Result { + let kind = RequestKind::Head; + let _permit = self.permit(kind, cancel).await?; + + let started_at = start_measuring_requests(kind); + + let blob_client = self.client.blob_client(self.relative_path_to_name(key)); + let properties_future = blob_client.get_properties().into_future(); + + let properties_future = tokio::time::timeout(self.timeout, properties_future); + + let res = tokio::select! { + res = properties_future => res, + _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()), }; - tokio::select! { - res = op => res, - _ = cancel.cancelled() => Err(DownloadError::Cancelled), + if let Ok(inner) = &res { + // do not incl. 
timeouts as errors in metrics but cancellations + let started_at = ScopeGuard::into_inner(started_at); + crate::metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, inner, started_at); } + + let data = match res { + Ok(Ok(data)) => Ok(data), + Ok(Err(sdk)) => Err(to_download_error(sdk)), + Err(_timeout) => Err(DownloadError::Timeout), + }?; + + let properties = data.blob.properties; + Ok(ListingObject { + key: key.to_owned(), + last_modified: SystemTime::from(properties.last_modified), + size: properties.content_length, + }) } async fn upload( diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs new file mode 100644 index 0000000000..fa3f2cba58 --- /dev/null +++ b/libs/remote_storage/src/config.rs @@ -0,0 +1,264 @@ +use std::{fmt::Debug, num::NonZeroUsize, str::FromStr, time::Duration}; + +use aws_sdk_s3::types::StorageClass; +use camino::Utf8PathBuf; + +use serde::{Deserialize, Serialize}; + +use crate::{ + DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT, + DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, +}; + +/// External backup storage configuration, enough for creating a client for that storage. +#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)] +pub struct RemoteStorageConfig { + /// The storage connection configuration. + #[serde(flatten)] + pub storage: RemoteStorageKind, + /// A common timeout enforced for all requests after concurrency limiter permit has been + /// acquired. + #[serde( + with = "humantime_serde", + default = "default_timeout", + skip_serializing_if = "is_default_timeout" + )] + pub timeout: Duration, +} + +fn default_timeout() -> Duration { + RemoteStorageConfig::DEFAULT_TIMEOUT +} + +fn is_default_timeout(d: &Duration) -> bool { + *d == RemoteStorageConfig::DEFAULT_TIMEOUT +} + +/// A kind of a remote storage to connect to, with its connection configuration. 
+#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)] +#[serde(untagged)] +pub enum RemoteStorageKind { + /// Storage based on local file system. + /// Specify a root folder to place all stored files into. + LocalFs { local_path: Utf8PathBuf }, + /// AWS S3 based storage, storing all files in the S3 bucket + /// specified by the config + AwsS3(S3Config), + /// Azure Blob based storage, storing all files in the container + /// specified by the config + AzureContainer(AzureConfig), +} + +/// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write). +#[derive(Clone, PartialEq, Eq, Deserialize, Serialize)] +pub struct S3Config { + /// Name of the bucket to connect to. + pub bucket_name: String, + /// The region where the bucket is located at. + pub bucket_region: String, + /// A "subfolder" in the bucket, to use the same bucket separately by multiple remote storage users at once. + pub prefix_in_bucket: Option, + /// A base URL to send S3 requests to. + /// By default, the endpoint is derived from a region name, assuming it's + /// an AWS S3 region name, erroring on wrong region name. + /// Endpoint provides a way to support other S3 flavors and their regions. + /// + /// Example: `http://127.0.0.1:5000` + pub endpoint: Option, + /// AWS S3 has various limits on its API calls, we need not to exceed those. + /// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details. 
+ #[serde(default = "default_remote_storage_s3_concurrency_limit")] + pub concurrency_limit: NonZeroUsize, + #[serde(default = "default_max_keys_per_list_response")] + pub max_keys_per_list_response: Option, + #[serde( + deserialize_with = "deserialize_storage_class", + serialize_with = "serialize_storage_class", + default + )] + pub upload_storage_class: Option, +} + +fn default_remote_storage_s3_concurrency_limit() -> NonZeroUsize { + DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT + .try_into() + .unwrap() +} + +fn default_max_keys_per_list_response() -> Option { + DEFAULT_MAX_KEYS_PER_LIST_RESPONSE +} + +impl Debug for S3Config { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("S3Config") + .field("bucket_name", &self.bucket_name) + .field("bucket_region", &self.bucket_region) + .field("prefix_in_bucket", &self.prefix_in_bucket) + .field("concurrency_limit", &self.concurrency_limit) + .field( + "max_keys_per_list_response", + &self.max_keys_per_list_response, + ) + .finish() + } +} + +/// Azure bucket coordinates and access credentials to manage the bucket contents (read and write). +#[derive(Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct AzureConfig { + /// Name of the container to connect to. + pub container_name: String, + /// Name of the storage account the container is inside of + pub storage_account: Option, + /// The region where the bucket is located at. + pub container_region: String, + /// A "subfolder" in the container, to use the same container separately by multiple remote storage users at once. + pub prefix_in_container: Option, + /// Azure has various limits on its API calls, we need not to exceed those. + /// See [`DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT`] for more details. 
+ #[serde(default = "default_remote_storage_azure_concurrency_limit")] + pub concurrency_limit: NonZeroUsize, + #[serde(default = "default_max_keys_per_list_response")] + pub max_keys_per_list_response: Option, +} + +fn default_remote_storage_azure_concurrency_limit() -> NonZeroUsize { + NonZeroUsize::new(DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT).unwrap() +} + +impl Debug for AzureConfig { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("AzureConfig") + .field("bucket_name", &self.container_name) + .field("storage_account", &self.storage_account) + .field("bucket_region", &self.container_region) + .field("prefix_in_container", &self.prefix_in_container) + .field("concurrency_limit", &self.concurrency_limit) + .field( + "max_keys_per_list_response", + &self.max_keys_per_list_response, + ) + .finish() + } +} + +fn deserialize_storage_class<'de, D: serde::Deserializer<'de>>( + deserializer: D, +) -> Result, D::Error> { + Option::::deserialize(deserializer).and_then(|s| { + if let Some(s) = s { + use serde::de::Error; + let storage_class = StorageClass::from_str(&s).expect("infallible"); + #[allow(deprecated)] + if matches!(storage_class, StorageClass::Unknown(_)) { + return Err(D::Error::custom(format!( + "Specified storage class unknown to SDK: '{s}'. Allowed values: {:?}", + StorageClass::values() + ))); + } + Ok(Some(storage_class)) + } else { + Ok(None) + } + }) +} + +fn serialize_storage_class( + val: &Option, + serializer: S, +) -> Result { + let val = val.as_ref().map(StorageClass::as_str); + Option::<&str>::serialize(&val, serializer) +} + +impl RemoteStorageConfig { + pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120); + + pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result { + Ok(utils::toml_edit_ext::deserialize_item(toml)?) 
+ } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn parse(input: &str) -> anyhow::Result { + let toml = input.parse::().unwrap(); + RemoteStorageConfig::from_toml(toml.as_item()) + } + + #[test] + fn parse_localfs_config_with_timeout() { + let input = "local_path = '.' +timeout = '5s'"; + + let config = parse(input).unwrap(); + + assert_eq!( + config, + RemoteStorageConfig { + storage: RemoteStorageKind::LocalFs { + local_path: Utf8PathBuf::from(".") + }, + timeout: Duration::from_secs(5) + } + ); + } + + #[test] + fn test_s3_parsing() { + let toml = "\ + bucket_name = 'foo-bar' + bucket_region = 'eu-central-1' + upload_storage_class = 'INTELLIGENT_TIERING' + timeout = '7s' + "; + + let config = parse(toml).unwrap(); + + assert_eq!( + config, + RemoteStorageConfig { + storage: RemoteStorageKind::AwsS3(S3Config { + bucket_name: "foo-bar".into(), + bucket_region: "eu-central-1".into(), + prefix_in_bucket: None, + endpoint: None, + concurrency_limit: default_remote_storage_s3_concurrency_limit(), + max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, + upload_storage_class: Some(StorageClass::IntelligentTiering), + }), + timeout: Duration::from_secs(7) + } + ); + } + + #[test] + fn test_azure_parsing() { + let toml = "\ + container_name = 'foo-bar' + container_region = 'westeurope' + upload_storage_class = 'INTELLIGENT_TIERING' + timeout = '7s' + "; + + let config = parse(toml).unwrap(); + + assert_eq!( + config, + RemoteStorageConfig { + storage: RemoteStorageKind::AzureContainer(AzureConfig { + container_name: "foo-bar".into(), + storage_account: None, + container_region: "westeurope".into(), + prefix_in_container: None, + concurrency_limit: default_remote_storage_azure_concurrency_limit(), + max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, + }), + timeout: Duration::from_secs(7) + } + ); + } +} diff --git a/libs/remote_storage/src/error.rs b/libs/remote_storage/src/error.rs index 66422853e1..5fd0eaabc7 100644 --- 
a/libs/remote_storage/src/error.rs +++ b/libs/remote_storage/src/error.rs @@ -42,6 +42,10 @@ impl DownloadError { Timeout | Other(_) => false, } } + + pub fn is_cancelled(&self) -> bool { + matches!(self, DownloadError::Cancelled) + } } impl From for DownloadError { diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 72748e156c..cc1d3e0ae4 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -10,6 +10,7 @@ #![deny(clippy::undocumented_unsafe_blocks)] mod azure_blob; +mod config; mod error; mod local_fs; mod metrics; @@ -18,25 +19,17 @@ mod simulate_failures; mod support; use std::{ - collections::HashMap, - fmt::Debug, - num::{NonZeroU32, NonZeroUsize}, - pin::Pin, - str::FromStr, - sync::Arc, - time::{Duration, SystemTime}, + collections::HashMap, fmt::Debug, num::NonZeroU32, pin::Pin, sync::Arc, time::SystemTime, }; -use anyhow::{bail, Context}; -use aws_sdk_s3::types::StorageClass; +use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use bytes::Bytes; -use futures::stream::Stream; +use futures::{stream::Stream, StreamExt}; use serde::{Deserialize, Serialize}; use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; -use toml_edit::Item; use tracing::info; pub use self::{ @@ -45,6 +38,8 @@ pub use self::{ }; use s3_bucket::RequestKind; +pub use crate::config::{AzureConfig, RemoteStorageConfig, RemoteStorageKind, S3Config}; + /// Azure SDK's ETag type is a simple String wrapper: we use this internally instead of repeating it here. pub use azure_core::Etag; @@ -149,15 +144,23 @@ impl RemotePath { /// /// The WithDelimiter mode will populate `prefixes` and `keys` in the result. The /// NoDelimiter mode will only populate `keys`. 
+#[derive(Copy, Clone)] pub enum ListingMode { WithDelimiter, NoDelimiter, } +#[derive(PartialEq, Eq, Debug, Clone)] +pub struct ListingObject { + pub key: RemotePath, + pub last_modified: SystemTime, + pub size: u64, +} + #[derive(Default)] pub struct Listing { pub prefixes: Vec, - pub keys: Vec, + pub keys: Vec, } /// Storage (potentially remote) API to manage its state. @@ -165,13 +168,18 @@ pub struct Listing { /// providing basic CRUD operations for storage files. #[allow(async_fn_in_trait)] pub trait RemoteStorage: Send + Sync + 'static { - /// List objects in remote storage, with semantics matching AWS S3's ListObjectsV2. - /// (see ``) + /// List objects in remote storage, with semantics matching AWS S3's [`ListObjectsV2`]. + /// + /// The stream is guaranteed to return at least one element, even in the case of errors + /// (in that case it's an `Err()`), or an empty `Listing`. + /// + /// The stream is not ending if it returns an error, as long as [`is_permanent`] returns false on the error. + /// The `next` function can be retried, and maybe in a future retry, there will be success. /// /// Note that the prefix is relative to any `prefix_in_bucket` configured for the client, not /// from the absolute root of the bucket. /// - /// `mode` configures whether to use a delimiter. Without a delimiter all keys + /// `mode` configures whether to use a delimiter. Without a delimiter, all keys /// within the prefix are listed in the `keys` of the result. With a delimiter, any "directories" at the top level of /// the prefix are returned in the `prefixes` of the result, and keys in the top level of the prefix are /// returned in `keys` (). @@ -180,13 +188,39 @@ pub trait RemoteStorage: Send + Sync + 'static { /// will iteratively call listobjects until it runs out of keys. Note that this is not safe to use on /// unlimted size buckets, as the full list of objects is allocated into a monolithic data structure. 
/// + /// [`ListObjectsV2`]: + /// [`is_permanent`]: DownloadError::is_permanent + fn list_streaming( + &self, + prefix: Option<&RemotePath>, + mode: ListingMode, + max_keys: Option, + cancel: &CancellationToken, + ) -> impl Stream> + Send; + async fn list( &self, prefix: Option<&RemotePath>, - _mode: ListingMode, + mode: ListingMode, max_keys: Option, cancel: &CancellationToken, - ) -> Result; + ) -> Result { + let mut stream = std::pin::pin!(self.list_streaming(prefix, mode, max_keys, cancel)); + let mut combined = stream.next().await.expect("At least one item required")?; + while let Some(list) = stream.next().await { + let list = list?; + combined.keys.extend(list.keys.into_iter()); + combined.prefixes.extend_from_slice(&list.prefixes); + } + Ok(combined) + } + + /// Obtain metadata information about an object. + async fn head_object( + &self, + key: &RemotePath, + cancel: &CancellationToken, + ) -> Result; /// Streams the local file contents into remote into the remote storage entry. /// @@ -293,8 +327,8 @@ impl Debug for Download { /// Every storage, currently supported. /// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics. -#[derive(Clone)] // Require Clone for `Other` due to https://github.com/rust-lang/rust/issues/26925 +#[derive(Clone)] pub enum GenericRemoteStorage> { LocalFs(LocalFs), AwsS3(Arc), @@ -303,13 +337,14 @@ pub enum GenericRemoteStorage> { } impl GenericRemoteStorage> { + // See [`RemoteStorage::list`]. pub async fn list( &self, prefix: Option<&RemotePath>, mode: ListingMode, max_keys: Option, cancel: &CancellationToken, - ) -> anyhow::Result { + ) -> Result { match self { Self::LocalFs(s) => s.list(prefix, mode, max_keys, cancel).await, Self::AwsS3(s) => s.list(prefix, mode, max_keys, cancel).await, @@ -318,6 +353,37 @@ impl GenericRemoteStorage> { } } + // See [`RemoteStorage::list_streaming`]. 
+ pub fn list_streaming<'a>( + &'a self, + prefix: Option<&'a RemotePath>, + mode: ListingMode, + max_keys: Option, + cancel: &'a CancellationToken, + ) -> impl Stream> + 'a + Send { + match self { + Self::LocalFs(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)) + as Pin> + Send>>, + Self::AwsS3(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)), + Self::AzureBlob(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)), + Self::Unreliable(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)), + } + } + + // See [`RemoteStorage::head_object`]. + pub async fn head_object( + &self, + key: &RemotePath, + cancel: &CancellationToken, + ) -> Result { + match self { + Self::LocalFs(s) => s.head_object(key, cancel).await, + Self::AwsS3(s) => s.head_object(key, cancel).await, + Self::AzureBlob(s) => s.head_object(key, cancel).await, + Self::Unreliable(s) => s.head_object(key, cancel).await, + } + } + /// See [`RemoteStorage::upload`] pub async fn upload( &self, @@ -448,10 +514,10 @@ impl GenericRemoteStorage> { } impl GenericRemoteStorage { - pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result { + pub async fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result { let timeout = storage_config.timeout; Ok(match &storage_config.storage { - RemoteStorageKind::LocalFs(path) => { + RemoteStorageKind::LocalFs { local_path: path } => { info!("Using fs root '{path}' as a remote storage"); Self::LocalFs(LocalFs::new(path.clone(), timeout)?) 
} @@ -463,7 +529,7 @@ impl GenericRemoteStorage { std::env::var("AWS_ACCESS_KEY_ID").unwrap_or_else(|_| "".into()); info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}", s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint); - Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout)?)) + Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout).await?)) } RemoteStorageKind::AzureContainer(azure_config) => { let storage_account = azure_config @@ -509,6 +575,16 @@ impl GenericRemoteStorage { None => self.download(from, cancel).await, } } + + /// The name of the bucket/container/etc. + pub fn bucket_name(&self) -> Option<&str> { + match self { + Self::LocalFs(_s) => None, + Self::AwsS3(s) => Some(s.bucket_name()), + Self::AzureBlob(s) => Some(s.container_name()), + Self::Unreliable(_s) => None, + } + } } /// Extra set of key-value pairs that contain arbitrary metadata about the storage entry. @@ -526,262 +602,6 @@ impl From<[(&str, &str); N]> for StorageMetadata { } } -/// External backup storage configuration, enough for creating a client for that storage. -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct RemoteStorageConfig { - /// The storage connection configuration. - pub storage: RemoteStorageKind, - /// A common timeout enforced for all requests after concurrency limiter permit has been - /// acquired. - pub timeout: Duration, -} - -/// A kind of a remote storage to connect to, with its connection configuration. -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum RemoteStorageKind { - /// Storage based on local file system. - /// Specify a root folder to place all stored files into. 
- LocalFs(Utf8PathBuf), - /// AWS S3 based storage, storing all files in the S3 bucket - /// specified by the config - AwsS3(S3Config), - /// Azure Blob based storage, storing all files in the container - /// specified by the config - AzureContainer(AzureConfig), -} - -/// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write). -#[derive(Clone, PartialEq, Eq)] -pub struct S3Config { - /// Name of the bucket to connect to. - pub bucket_name: String, - /// The region where the bucket is located at. - pub bucket_region: String, - /// A "subfolder" in the bucket, to use the same bucket separately by multiple remote storage users at once. - pub prefix_in_bucket: Option, - /// A base URL to send S3 requests to. - /// By default, the endpoint is derived from a region name, assuming it's - /// an AWS S3 region name, erroring on wrong region name. - /// Endpoint provides a way to support other S3 flavors and their regions. - /// - /// Example: `http://127.0.0.1:5000` - pub endpoint: Option, - /// AWS S3 has various limits on its API calls, we need not to exceed those. - /// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details. - pub concurrency_limit: NonZeroUsize, - pub max_keys_per_list_response: Option, - pub upload_storage_class: Option, -} - -impl Debug for S3Config { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("S3Config") - .field("bucket_name", &self.bucket_name) - .field("bucket_region", &self.bucket_region) - .field("prefix_in_bucket", &self.prefix_in_bucket) - .field("concurrency_limit", &self.concurrency_limit) - .field( - "max_keys_per_list_response", - &self.max_keys_per_list_response, - ) - .finish() - } -} - -/// Azure bucket coordinates and access credentials to manage the bucket contents (read and write). -#[derive(Clone, PartialEq, Eq)] -pub struct AzureConfig { - /// Name of the container to connect to. 
- pub container_name: String, - /// Name of the storage account the container is inside of - pub storage_account: Option, - /// The region where the bucket is located at. - pub container_region: String, - /// A "subfolder" in the container, to use the same container separately by multiple remote storage users at once. - pub prefix_in_container: Option, - /// Azure has various limits on its API calls, we need not to exceed those. - /// See [`DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT`] for more details. - pub concurrency_limit: NonZeroUsize, - pub max_keys_per_list_response: Option, -} - -impl Debug for AzureConfig { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("AzureConfig") - .field("bucket_name", &self.container_name) - .field("storage_account", &self.storage_account) - .field("bucket_region", &self.container_region) - .field("prefix_in_container", &self.prefix_in_container) - .field("concurrency_limit", &self.concurrency_limit) - .field( - "max_keys_per_list_response", - &self.max_keys_per_list_response, - ) - .finish() - } -} - -impl RemoteStorageConfig { - pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120); - - pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result> { - let local_path = toml.get("local_path"); - let bucket_name = toml.get("bucket_name"); - let bucket_region = toml.get("bucket_region"); - let container_name = toml.get("container_name"); - let container_region = toml.get("container_region"); - - let use_azure = container_name.is_some() && container_region.is_some(); - - let default_concurrency_limit = if use_azure { - DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT - } else { - DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT - }; - let concurrency_limit = NonZeroUsize::new( - parse_optional_integer("concurrency_limit", toml)?.unwrap_or(default_concurrency_limit), - ) - .context("Failed to parse 'concurrency_limit' as a positive integer")?; - - let max_keys_per_list_response = - 
parse_optional_integer::("max_keys_per_list_response", toml) - .context("Failed to parse 'max_keys_per_list_response' as a positive integer")? - .or(DEFAULT_MAX_KEYS_PER_LIST_RESPONSE); - - let endpoint = toml - .get("endpoint") - .map(|endpoint| parse_toml_string("endpoint", endpoint)) - .transpose()?; - - let timeout = toml - .get("timeout") - .map(|timeout| { - timeout - .as_str() - .ok_or_else(|| anyhow::Error::msg("timeout was not a string")) - }) - .transpose() - .and_then(|timeout| { - timeout - .map(humantime::parse_duration) - .transpose() - .map_err(anyhow::Error::new) - }) - .context("parse timeout")? - .unwrap_or(Self::DEFAULT_TIMEOUT); - - if timeout < Duration::from_secs(1) { - bail!("timeout was specified as {timeout:?} which is too low"); - } - - let storage = match ( - local_path, - bucket_name, - bucket_region, - container_name, - container_region, - ) { - // no 'local_path' nor 'bucket_name' options are provided, consider this remote storage disabled - (None, None, None, None, None) => return Ok(None), - (_, Some(_), None, ..) => { - bail!("'bucket_region' option is mandatory if 'bucket_name' is given ") - } - (_, None, Some(_), ..) => { - bail!("'bucket_name' option is mandatory if 'bucket_region' is given ") - } - (None, Some(bucket_name), Some(bucket_region), ..) 
=> { - RemoteStorageKind::AwsS3(S3Config { - bucket_name: parse_toml_string("bucket_name", bucket_name)?, - bucket_region: parse_toml_string("bucket_region", bucket_region)?, - prefix_in_bucket: toml - .get("prefix_in_bucket") - .map(|prefix_in_bucket| { - parse_toml_string("prefix_in_bucket", prefix_in_bucket) - }) - .transpose()?, - endpoint, - concurrency_limit, - max_keys_per_list_response, - upload_storage_class: toml - .get("upload_storage_class") - .map(|prefix_in_bucket| -> anyhow::Result<_> { - let s = parse_toml_string("upload_storage_class", prefix_in_bucket)?; - let storage_class = StorageClass::from_str(&s).expect("infallible"); - #[allow(deprecated)] - if matches!(storage_class, StorageClass::Unknown(_)) { - bail!("Specified storage class unknown to SDK: '{s}'. Allowed values: {:?}", StorageClass::values()); - } - Ok(storage_class) - }) - .transpose()?, - }) - } - (_, _, _, Some(_), None) => { - bail!("'container_name' option is mandatory if 'container_region' is given ") - } - (_, _, _, None, Some(_)) => { - bail!("'container_name' option is mandatory if 'container_region' is given ") - } - (None, None, None, Some(container_name), Some(container_region)) => { - RemoteStorageKind::AzureContainer(AzureConfig { - container_name: parse_toml_string("container_name", container_name)?, - storage_account: toml - .get("storage_account") - .map(|storage_account| { - parse_toml_string("storage_account", storage_account) - }) - .transpose()?, - container_region: parse_toml_string("container_region", container_region)?, - prefix_in_container: toml - .get("prefix_in_container") - .map(|prefix_in_container| { - parse_toml_string("prefix_in_container", prefix_in_container) - }) - .transpose()?, - concurrency_limit, - max_keys_per_list_response, - }) - } - (Some(local_path), None, None, None, None) => RemoteStorageKind::LocalFs( - Utf8PathBuf::from(parse_toml_string("local_path", local_path)?), - ), - (Some(_), Some(_), ..) 
=> { - bail!("'local_path' and 'bucket_name' are mutually exclusive") - } - (Some(_), _, _, Some(_), Some(_)) => { - bail!("local_path and 'container_name' are mutually exclusive") - } - }; - - Ok(Some(RemoteStorageConfig { storage, timeout })) - } -} - -// Helper functions to parse a toml Item -fn parse_optional_integer(name: &str, item: &toml_edit::Item) -> anyhow::Result> -where - I: TryFrom, - E: std::error::Error + Send + Sync + 'static, -{ - let toml_integer = match item.get(name) { - Some(item) => item - .as_integer() - .with_context(|| format!("configure option {name} is not an integer"))?, - None => return Ok(None), - }; - - I::try_from(toml_integer) - .map(Some) - .with_context(|| format!("configure option {name} is too large")) -} - -fn parse_toml_string(name: &str, item: &Item) -> anyhow::Result { - let s = item - .as_str() - .with_context(|| format!("configure option {name} is not a string"))?; - Ok(s.to_string()) -} - struct ConcurrencyLimiter { // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded. // Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold. @@ -799,6 +619,7 @@ impl ConcurrencyLimiter { RequestKind::Delete => &self.write, RequestKind::Copy => &self.write, RequestKind::TimeTravel => &self.write, + RequestKind::Head => &self.read, } } @@ -849,24 +670,4 @@ mod tests { let err = RemotePath::new(Utf8Path::new("/")).expect_err("Should fail on absolute paths"); assert_eq!(err.to_string(), "Path \"/\" is not relative"); } - - #[test] - fn parse_localfs_config_with_timeout() { - let input = "local_path = '.' 
-timeout = '5s'"; - - let toml = input.parse::().unwrap(); - - let config = RemoteStorageConfig::from_toml(toml.as_item()) - .unwrap() - .expect("it exists"); - - assert_eq!( - config, - RemoteStorageConfig { - storage: RemoteStorageKind::LocalFs(Utf8PathBuf::from(".")), - timeout: Duration::from_secs(5) - } - ); - } } diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 1f7bcfc982..c3ef18cab1 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -23,8 +23,8 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken}; use utils::crashsafe::path_with_suffix_extension; use crate::{ - Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel, - REMOTE_STORAGE_PREFIX_SEPARATOR, + Download, DownloadError, Listing, ListingMode, ListingObject, RemotePath, TimeTravelError, + TimeoutOrCancel, REMOTE_STORAGE_PREFIX_SEPARATOR, }; use super::{RemoteStorage, StorageMetadata}; @@ -331,6 +331,17 @@ impl LocalFs { } impl RemoteStorage for LocalFs { + fn list_streaming( + &self, + prefix: Option<&RemotePath>, + mode: ListingMode, + max_keys: Option, + cancel: &CancellationToken, + ) -> impl Stream> { + let listing = self.list(prefix, mode, max_keys, cancel); + futures::stream::once(listing) + } + async fn list( &self, prefix: Option<&RemotePath>, @@ -346,19 +357,29 @@ impl RemoteStorage for LocalFs { .list_recursive(prefix) .await .map_err(DownloadError::Other)?; - let keys = keys + let objects = keys .into_iter() - .filter(|k| { + .filter_map(|k| { let path = k.with_base(&self.storage_root); - !path.is_dir() + if path.is_dir() { + None + } else { + Some(ListingObject { + key: k.clone(), + // LocalFs is just for testing, so just specify a dummy time + last_modified: SystemTime::now(), + size: 0, + }) + } }) .collect(); if let ListingMode::NoDelimiter = mode { - result.keys = keys; + result.keys = objects; } else { let mut prefixes = HashSet::new(); - for key in keys 
{ + for object in objects { + let key = object.key; // If the part after the prefix includes a "/", take only the first part and put it in `prefixes`. let relative_key = if let Some(prefix) = prefix { let mut prefix = prefix.clone(); @@ -387,9 +408,12 @@ impl RemoteStorage for LocalFs { .to_owned(); prefixes.insert(first_part); } else { - result - .keys - .push(RemotePath::from_string(&relative_key).unwrap()); + result.keys.push(ListingObject { + key: RemotePath::from_string(&relative_key).unwrap(), + // LocalFs is just for testing + last_modified: SystemTime::now(), + size: 0, + }); } } result.prefixes = prefixes @@ -421,6 +445,20 @@ impl RemoteStorage for LocalFs { } } + async fn head_object( + &self, + key: &RemotePath, + _cancel: &CancellationToken, + ) -> Result { + let target_file_path = key.with_base(&self.storage_root); + let metadata = file_metadata(&target_file_path).await?; + Ok(ListingObject { + key: key.clone(), + last_modified: metadata.modified()?, + size: metadata.len(), + }) + } + async fn upload( &self, data: impl Stream> + Send + Sync, @@ -939,7 +977,11 @@ mod fs_tests { .await?; assert!(listing.prefixes.is_empty()); assert_eq!( - listing.keys.into_iter().collect::>(), + listing + .keys + .into_iter() + .map(|o| o.key) + .collect::>(), HashSet::from([uncle.clone(), child.clone(), child_sibling.clone()]) ); @@ -964,7 +1006,7 @@ mod fs_tests { ) .await?; assert_eq!( - listing.keys, + listing.keys.into_iter().map(|o| o.key).collect::>(), [RemotePath::from_string("uncle").unwrap()].to_vec() ); assert_eq!( @@ -981,7 +1023,7 @@ mod fs_tests { &cancel, ) .await?; - assert_eq!(listing.keys, [].to_vec()); + assert_eq!(listing.keys, vec![]); assert_eq!( listing.prefixes, [RemotePath::from_string("grandparent").unwrap()].to_vec() @@ -996,7 +1038,7 @@ mod fs_tests { &cancel, ) .await?; - assert_eq!(listing.keys, [].to_vec()); + assert_eq!(listing.keys, vec![]); assert_eq!( listing.prefixes, [RemotePath::from_string("grandparent").unwrap()].to_vec() @@ 
-1029,7 +1071,7 @@ mod fs_tests { &cancel, ) .await?; - assert_eq!(listing.keys, [].to_vec()); + assert_eq!(listing.keys, vec![]); let mut found_prefixes = listing.prefixes.clone(); found_prefixes.sort(); diff --git a/libs/remote_storage/src/metrics.rs b/libs/remote_storage/src/metrics.rs index bbb51590f3..f1aa4c433b 100644 --- a/libs/remote_storage/src/metrics.rs +++ b/libs/remote_storage/src/metrics.rs @@ -13,6 +13,7 @@ pub(crate) enum RequestKind { List = 3, Copy = 4, TimeTravel = 5, + Head = 6, } use scopeguard::ScopeGuard; @@ -27,6 +28,7 @@ impl RequestKind { List => "list_objects", Copy => "copy_object", TimeTravel => "time_travel_recover", + Head => "head_object", } } const fn as_index(&self) -> usize { @@ -34,7 +36,8 @@ impl RequestKind { } } -pub(crate) struct RequestTyped([C; 6]); +const REQUEST_KIND_COUNT: usize = 7; +pub(crate) struct RequestTyped([C; REQUEST_KIND_COUNT]); impl RequestTyped { pub(crate) fn get(&self, kind: RequestKind) -> &C { @@ -43,8 +46,8 @@ impl RequestTyped { fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self { use RequestKind::*; - let mut it = [Get, Put, Delete, List, Copy, TimeTravel].into_iter(); - let arr = std::array::from_fn::(|index| { + let mut it = [Get, Put, Delete, List, Copy, TimeTravel, Head].into_iter(); + let arr = std::array::from_fn::(|index| { let next = it.next().unwrap(); assert_eq!(index, next.as_index()); f(next) diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 76cf3eac80..11f6598cbf 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -16,20 +16,14 @@ use std::{ use anyhow::{anyhow, Context as _}; use aws_config::{ - environment::credentials::EnvironmentVariableCredentialsProvider, - imds::credentials::ImdsCredentialsProvider, - meta::credentials::CredentialsProviderChain, - profile::ProfileFileCredentialsProvider, - provider_config::ProviderConfig, + default_provider::credentials::DefaultCredentialsChain, 
retry::{RetryConfigBuilder, RetryMode}, - web_identity_token::WebIdentityTokenCredentialsProvider, BehaviorVersion, }; -use aws_credential_types::provider::SharedCredentialsProvider; use aws_sdk_s3::{ config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep}, error::SdkError, - operation::get_object::GetObjectError, + operation::{get_object::GetObjectError, head_object::HeadObjectError}, types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass}, Client, }; @@ -46,11 +40,12 @@ use utils::backoff; use super::StorageMetadata; use crate::{ + config::S3Config, error::Cancelled, metrics::{start_counting_cancelled_wait, start_measuring_requests}, support::PermitCarrying, - ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, - S3Config, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, + ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, ListingObject, RemotePath, + RemoteStorage, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR, }; @@ -76,40 +71,27 @@ struct GetObjectRequest { } impl S3Bucket { /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided. 
- pub fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result { + pub async fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result { tracing::debug!( "Creating s3 remote storage for S3 bucket {}", remote_storage_config.bucket_name ); - let region = Some(Region::new(remote_storage_config.bucket_region.clone())); + let region = Region::new(remote_storage_config.bucket_region.clone()); + let region_opt = Some(region.clone()); - let provider_conf = ProviderConfig::without_region().with_region(region.clone()); - - let credentials_provider = { - // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" - CredentialsProviderChain::first_try( - "env", - EnvironmentVariableCredentialsProvider::new(), - ) - // uses "AWS_PROFILE" / `aws sso login --profile ` - .or_else( - "profile-sso", - ProfileFileCredentialsProvider::builder() - .configure(&provider_conf) - .build(), - ) - // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME" - // needed to access remote extensions bucket - .or_else( - "token", - WebIdentityTokenCredentialsProvider::builder() - .configure(&provider_conf) - .build(), - ) - // uses imds v2 - .or_else("imds", ImdsCredentialsProvider::builder().build()) - }; + // https://docs.aws.amazon.com/sdkref/latest/guide/standardized-credentials.html + // https://docs.rs/aws-config/latest/aws_config/default_provider/credentials/struct.DefaultCredentialsChain.html + // Incomplete list of auth methods used by this: + // * "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" + // * "AWS_PROFILE" / `aws sso login --profile ` + // * "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME" + // * http (ECS/EKS) container credentials + // * imds v2 + let credentials_provider = DefaultCredentialsChain::builder() + .region(region) + .build() + .await; // AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off let sleep_impl: Arc = Arc::new(TokioSleep::new()); @@ -118,9 +100,9 
@@ impl S3Bucket { #[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */ BehaviorVersion::v2023_11_09(), ) - .region(region) + .region(region_opt) .identity_cache(IdentityCache::lazy().build()) - .credentials_provider(SharedCredentialsProvider::new(credentials_provider)) + .credentials_provider(credentials_provider) .sleep_impl(SharedAsyncSleep::from(sleep_impl)); let sdk_config: aws_config::SdkConfig = std::thread::scope(|s| { @@ -405,6 +387,10 @@ impl S3Bucket { } Ok(()) } + + pub fn bucket_name(&self) -> &str { + &self.bucket_name + } } pin_project_lite::pin_project! { @@ -482,17 +468,16 @@ impl>> Stream for TimedDownload { } impl RemoteStorage for S3Bucket { - async fn list( + fn list_streaming( &self, prefix: Option<&RemotePath>, mode: ListingMode, max_keys: Option, cancel: &CancellationToken, - ) -> Result { + ) -> impl Stream> { let kind = RequestKind::List; // s3 sdk wants i32 let mut max_keys = max_keys.map(|mk| mk.get() as i32); - let mut result = Listing::default(); // get the passed prefix or if it is not set use prefix_in_bucket value let list_prefix = prefix @@ -504,89 +489,191 @@ impl RemoteStorage for S3Bucket { }) }); + async_stream::stream! { + let _permit = self.permit(kind, cancel).await?; + + let mut continuation_token = None; + 'outer: loop { + let started_at = start_measuring_requests(kind); + + // min of two Options, returning Some if one is value and another is + // None (None is smaller than anything, so plain min doesn't work). 
+ let request_max_keys = self + .max_keys_per_list_response + .into_iter() + .chain(max_keys.into_iter()) + .min(); + let mut request = self + .client + .list_objects_v2() + .bucket(self.bucket_name.clone()) + .set_prefix(list_prefix.clone()) + .set_continuation_token(continuation_token.clone()) + .set_max_keys(request_max_keys); + + if let ListingMode::WithDelimiter = mode { + request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()); + } + + let request = request.send(); + + let response = tokio::select! { + res = request => Ok(res), + _ = tokio::time::sleep(self.timeout) => Err(DownloadError::Timeout), + _ = cancel.cancelled() => Err(DownloadError::Cancelled), + }?; + + let response = response + .context("Failed to list S3 prefixes") + .map_err(DownloadError::Other); + + let started_at = ScopeGuard::into_inner(started_at); + + crate::metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, &response, started_at); + + let response = match response { + Ok(response) => response, + Err(e) => { + // The error is potentially retryable, so we must rewind the loop after yielding. 
+ yield Err(e); + continue 'outer; + }, + }; + + let keys = response.contents(); + let prefixes = response.common_prefixes.as_deref().unwrap_or_default(); + + tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len()); + let mut result = Listing::default(); + + for object in keys { + let key = object.key().expect("response does not contain a key"); + let key = self.s3_object_to_relative_path(key); + + let last_modified = match object.last_modified.map(SystemTime::try_from) { + Some(Ok(t)) => t, + Some(Err(_)) => { + tracing::warn!("Remote storage last_modified {:?} for {} is out of bounds", + object.last_modified, key + ); + SystemTime::now() + }, + None => { + SystemTime::now() + } + }; + + let size = object.size.unwrap_or(0) as u64; + + result.keys.push(ListingObject{ + key, + last_modified, + size, + }); + if let Some(mut mk) = max_keys { + assert!(mk > 0); + mk -= 1; + if mk == 0 { + // limit reached + yield Ok(result); + break 'outer; + } + max_keys = Some(mk); + } + } + + // S3 gives us prefixes like "foo/", we return them like "foo" + result.prefixes.extend(prefixes.iter().filter_map(|o| { + Some( + self.s3_object_to_relative_path( + o.prefix()? 
+ .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR), + ), + ) + })); + + yield Ok(result); + + continuation_token = match response.next_continuation_token { + Some(new_token) => Some(new_token), + None => break, + }; + } + } + } + + async fn head_object( + &self, + key: &RemotePath, + cancel: &CancellationToken, + ) -> Result { + let kind = RequestKind::Head; let _permit = self.permit(kind, cancel).await?; - let mut continuation_token = None; + let started_at = start_measuring_requests(kind); - loop { - let started_at = start_measuring_requests(kind); + let head_future = self + .client + .head_object() + .bucket(self.bucket_name()) + .key(self.relative_path_to_s3_object(key)) + .send(); - // min of two Options, returning Some if one is value and another is - // None (None is smaller than anything, so plain min doesn't work). - let request_max_keys = self - .max_keys_per_list_response - .into_iter() - .chain(max_keys.into_iter()) - .min(); - let mut request = self - .client - .list_objects_v2() - .bucket(self.bucket_name.clone()) - .set_prefix(list_prefix.clone()) - .set_continuation_token(continuation_token) - .set_max_keys(request_max_keys); + let head_future = tokio::time::timeout(self.timeout, head_future); - if let ListingMode::WithDelimiter = mode { - request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()); + let res = tokio::select! { + res = head_future => res, + _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()), + }; + + let res = res.map_err(|_e| DownloadError::Timeout)?; + + // do not incl. 
timeouts as errors in metrics but cancellations + let started_at = ScopeGuard::into_inner(started_at); + crate::metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, &res, started_at); + + let data = match res { + Ok(object_output) => object_output, + Err(SdkError::ServiceError(e)) if matches!(e.err(), HeadObjectError::NotFound(_)) => { + // Count this in the AttemptOutcome::Ok bucket, because 404 is not + // an error: we expect to sometimes fetch an object and find it missing, + // e.g. when probing for timeline indices. + crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed( + kind, + AttemptOutcome::Ok, + started_at, + ); + return Err(DownloadError::NotFound); } + Err(e) => { + crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed( + kind, + AttemptOutcome::Err, + started_at, + ); - let request = request.send(); - - let response = tokio::select! { - res = request => res, - _ = tokio::time::sleep(self.timeout) => return Err(DownloadError::Timeout), - _ = cancel.cancelled() => return Err(DownloadError::Cancelled), - }; - - let response = response - .context("Failed to list S3 prefixes") - .map_err(DownloadError::Other); - - let started_at = ScopeGuard::into_inner(started_at); - - crate::metrics::BUCKET_METRICS - .req_seconds - .observe_elapsed(kind, &response, started_at); - - let response = response?; - - let keys = response.contents(); - let empty = Vec::new(); - let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty); - - tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len()); - - for object in keys { - let object_path = object.key().expect("response does not contain a key"); - let remote_path = self.s3_object_to_relative_path(object_path); - result.keys.push(remote_path); - if let Some(mut mk) = max_keys { - assert!(mk > 0); - mk -= 1; - if mk == 0 { - return Ok(result); // limit reached - } - max_keys = Some(mk); - } + return Err(DownloadError::Other( + anyhow::Error::new(e).context("s3 head object"), + )); } 
+ }; - // S3 gives us prefixes like "foo/", we return them like "foo" - result.prefixes.extend(prefixes.iter().filter_map(|o| { - Some( - self.s3_object_to_relative_path( - o.prefix()? - .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR), - ), - ) - })); - - continuation_token = match response.next_continuation_token { - Some(new_token) => Some(new_token), - None => break, - }; - } - - Ok(result) + let (Some(last_modified), Some(size)) = (data.last_modified, data.content_length) else { + return Err(DownloadError::Other(anyhow!( + "head_object doesn't contain last_modified or content_length" + )))?; + }; + Ok(ListingObject { + key: key.to_owned(), + last_modified: SystemTime::try_from(last_modified).map_err(|e| { + DownloadError::Other(anyhow!("can't convert time '{last_modified}': {e}")) + })?, + size: size as u64, + }) } async fn upload( @@ -1041,8 +1128,8 @@ mod tests { use crate::{RemotePath, S3Bucket, S3Config}; - #[test] - fn relative_path() { + #[tokio::test] + async fn relative_path() { let all_paths = ["", "some/path", "some/path/"]; let all_paths: Vec = all_paths .iter() @@ -1085,8 +1172,9 @@ mod tests { max_keys_per_list_response: Some(5), upload_storage_class: None, }; - let storage = - S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init"); + let storage = S3Bucket::new(&config, std::time::Duration::ZERO) + .await + .expect("remote storage init"); for (test_path_idx, test_path) in all_paths.iter().enumerate() { let result = storage.relative_path_to_s3_object(test_path); let expected = expected_outputs[prefix_idx][test_path_idx]; diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index c467a2d196..c7eb634af3 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -3,6 +3,7 @@ //! testing purposes. 
use bytes::Bytes; use futures::stream::Stream; +use futures::StreamExt; use std::collections::HashMap; use std::num::NonZeroU32; use std::sync::Mutex; @@ -29,6 +30,7 @@ pub struct UnreliableWrapper { #[derive(Debug, Hash, Eq, PartialEq)] enum RemoteOp { ListPrefixes(Option), + HeadObject(RemotePath), Upload(RemotePath), Download(RemotePath), Delete(RemotePath), @@ -107,6 +109,23 @@ impl UnreliableWrapper { type VoidStorage = crate::LocalFs; impl RemoteStorage for UnreliableWrapper { + fn list_streaming( + &self, + prefix: Option<&RemotePath>, + mode: ListingMode, + max_keys: Option, + cancel: &CancellationToken, + ) -> impl Stream> + Send { + async_stream::stream! { + self.attempt(RemoteOp::ListPrefixes(prefix.cloned())) + .map_err(DownloadError::Other)?; + let mut stream = self.inner + .list_streaming(prefix, mode, max_keys, cancel); + while let Some(item) = stream.next().await { + yield item; + } + } + } async fn list( &self, prefix: Option<&RemotePath>, @@ -119,6 +138,16 @@ impl RemoteStorage for UnreliableWrapper { self.inner.list(prefix, mode, max_keys, cancel).await } + async fn head_object( + &self, + key: &RemotePath, + cancel: &CancellationToken, + ) -> Result { + self.attempt(RemoteOp::HeadObject(key.clone())) + .map_err(DownloadError::Other)?; + self.inner.head_object(key, cancel).await + } + async fn upload( &self, data: impl Stream> + Send + Sync + 'static, diff --git a/libs/remote_storage/tests/common/mod.rs b/libs/remote_storage/tests/common/mod.rs index da9dc08d8d..daab05d91a 100644 --- a/libs/remote_storage/tests/common/mod.rs +++ b/libs/remote_storage/tests/common/mod.rs @@ -152,7 +152,7 @@ pub(crate) async fn upload_remote_data( let mut upload_tasks = JoinSet::new(); let cancel = CancellationToken::new(); - for i in 1..upload_tasks_count + 1 { + for i in 1..=upload_tasks_count { let task_client = Arc::clone(client); let cancel = cancel.clone(); diff --git a/libs/remote_storage/tests/common/tests.rs b/libs/remote_storage/tests/common/tests.rs 
index 673151c8ef..86c55872c1 100644 --- a/libs/remote_storage/tests/common/tests.rs +++ b/libs/remote_storage/tests/common/tests.rs @@ -1,5 +1,6 @@ use anyhow::Context; use camino::Utf8Path; +use futures::StreamExt; use remote_storage::ListingMode; use remote_storage::RemotePath; use std::sync::Arc; @@ -29,10 +30,10 @@ use super::{ /// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only /// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}` /// -/// With the real S3 enabled and `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys. -/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3, -/// since current default AWS S3 pagination limit is 1000. -/// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax) +/// In the `MaybeEnabledStorageWithTestBlobs::setup`, we set the `max_keys_in_list_response` param to limit the keys in a single response. +/// This way, we are able to test the pagination, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3, +/// as the current default AWS S3 pagination limit is 1000. +/// (see ). /// /// Lastly, the test attempts to clean up and remove all uploaded S3 files. /// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished. @@ -87,6 +88,41 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a "remote storage nested prefixes list mismatches with the uploads. 
Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}", ); + // list_streaming + + let prefix_with_slash = base_prefix.add_trailing_slash(); + let mut nested_remote_prefixes_st = test_client.list_streaming( + Some(&prefix_with_slash), + ListingMode::WithDelimiter, + None, + &cancel, + ); + let mut nested_remote_prefixes_combined = HashSet::new(); + let mut segments = 0; + let mut segment_max_size = 0; + while let Some(st) = nested_remote_prefixes_st.next().await { + let st = st?; + segment_max_size = segment_max_size.max(st.prefixes.len()); + nested_remote_prefixes_combined.extend(st.prefixes.into_iter()); + segments += 1; + } + assert!(segments > 1, "less than 2 segments: {segments}"); + assert!( + segment_max_size * 2 <= nested_remote_prefixes_combined.len(), + "double of segment_max_size={segment_max_size} larger number of remote prefixes of {}", + nested_remote_prefixes_combined.len() + ); + let remote_only_prefixes = nested_remote_prefixes_combined + .difference(&expected_remote_prefixes) + .collect::>(); + let missing_uploaded_prefixes = expected_remote_prefixes + .difference(&nested_remote_prefixes_combined) + .collect::>(); + assert_eq!( + remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0, + "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}", + ); + Ok(()) } @@ -120,6 +156,7 @@ async fn list_no_delimiter_works( .context("client list root files failure")? .keys .into_iter() + .map(|o| o.key) .collect::>(); assert_eq!( root_files, @@ -146,6 +183,7 @@ async fn list_no_delimiter_works( .context("client list nested files failure")? 
.keys .into_iter() + .map(|o| o.key) .collect::>(); let trim_remote_blobs: HashSet<_> = ctx .remote_blobs diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 23628dfebe..3a20649490 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -31,6 +31,7 @@ struct EnabledAzure { impl EnabledAzure { async fn setup(max_keys_in_list_response: Option) -> Self { let client = create_azure_client(max_keys_in_list_response) + .await .context("Azure client creation") .expect("Azure client creation failed"); @@ -187,7 +188,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs { } } -fn create_azure_client( +async fn create_azure_client( max_keys_per_list_response: Option, ) -> anyhow::Result> { use rand::Rng; @@ -221,6 +222,8 @@ fn create_azure_client( timeout: Duration::from_secs(120), }; Ok(Arc::new( - GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?, + GenericRemoteStorage::from_config(&remote_storage_config) + .await + .context("remote storage init")?, )) } diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index a273abe867..b893beeebd 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -81,6 +81,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: .context("list root files failure")? 
.keys .into_iter() + .map(|o| o.key) .collect::>(), ) } @@ -197,6 +198,7 @@ struct EnabledS3 { impl EnabledS3 { async fn setup(max_keys_in_list_response: Option) -> Self { let client = create_s3_client(max_keys_in_list_response) + .await .context("S3 client creation") .expect("S3 client creation failed"); @@ -352,7 +354,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs { } } -fn create_s3_client( +async fn create_s3_client( max_keys_per_list_response: Option, ) -> anyhow::Result> { use rand::Rng; @@ -385,7 +387,9 @@ fn create_s3_client( timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }; Ok(Arc::new( - GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?, + GenericRemoteStorage::from_config(&remote_storage_config) + .await + .context("remote storage init")?, )) } diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml index 327d98ee77..e1f4bcca46 100644 --- a/libs/safekeeper_api/Cargo.toml +++ b/libs/safekeeper_api/Cargo.toml @@ -9,5 +9,3 @@ serde.workspace = true serde_with.workspace = true const_format.workspace = true utils.workspace = true - -workspace_hack.workspace = true diff --git a/libs/tenant_size_model/Cargo.toml b/libs/tenant_size_model/Cargo.toml index 15e78932a8..8aa3c54f62 100644 --- a/libs/tenant_size_model/Cargo.toml +++ b/libs/tenant_size_model/Cargo.toml @@ -9,5 +9,3 @@ license.workspace = true anyhow.workspace = true serde.workspace = true serde_json.workspace = true - -workspace_hack.workspace = true diff --git a/libs/tenant_size_model/src/calculation.rs b/libs/tenant_size_model/src/calculation.rs index f05997ee65..be00562219 100644 --- a/libs/tenant_size_model/src/calculation.rs +++ b/libs/tenant_size_model/src/calculation.rs @@ -34,10 +34,10 @@ struct SegmentSize { } struct SizeAlternatives { - // cheapest alternative if parent is available. + /// cheapest alternative if parent is available. 
incremental: SegmentSize, - // cheapest alternative if parent node is not available + /// cheapest alternative if parent node is not available non_incremental: Option, } diff --git a/libs/tenant_size_model/src/svg.rs b/libs/tenant_size_model/src/svg.rs index f26d3aa79d..0de2890bb4 100644 --- a/libs/tenant_size_model/src/svg.rs +++ b/libs/tenant_size_model/src/svg.rs @@ -3,10 +3,17 @@ use std::fmt::Write; const SVG_WIDTH: f32 = 500.0; +/// Different branch kind for SVG drawing. +#[derive(PartialEq)] +pub enum SvgBranchKind { + Timeline, + Lease, +} + struct SvgDraw<'a> { storage: &'a StorageModel, branches: &'a [String], - seg_to_branch: &'a [usize], + seg_to_branch: &'a [(usize, SvgBranchKind)], sizes: &'a [SegmentSizeResult], // layout @@ -42,13 +49,18 @@ fn draw_legend(result: &mut String) -> anyhow::Result<()> { "" )?; writeln!(result, "WAL not retained")?; + writeln!( + result, + "" + )?; + writeln!(result, "LSN lease")?; Ok(()) } pub fn draw_svg( storage: &StorageModel, branches: &[String], - seg_to_branch: &[usize], + seg_to_branch: &[(usize, SvgBranchKind)], sizes: &SizeResult, ) -> anyhow::Result { let mut draw = SvgDraw { @@ -100,7 +112,7 @@ impl<'a> SvgDraw<'a> { // Layout the timelines on Y dimension. 
// TODO - let mut y = 100.0; + let mut y = 120.0; let mut branch_y_coordinates = Vec::new(); for _branch in self.branches { branch_y_coordinates.push(y); @@ -109,7 +121,7 @@ impl<'a> SvgDraw<'a> { // Calculate coordinates for each point let seg_coordinates = std::iter::zip(segments, self.seg_to_branch) - .map(|(seg, branch_id)| { + .map(|(seg, (branch_id, _))| { let x = (seg.lsn - min_lsn) as f32 / xscale; let y = branch_y_coordinates[*branch_id]; (x, y) @@ -175,6 +187,22 @@ impl<'a> SvgDraw<'a> { // draw a snapshot point if it's needed let (coord_x, coord_y) = self.seg_coordinates[seg_id]; + + let (_, kind) = &self.seg_to_branch[seg_id]; + if kind == &SvgBranchKind::Lease { + let (x1, y1) = (coord_x, coord_y - 10.0); + let (x2, y2) = (coord_x, coord_y + 10.0); + + let style = "stroke-width=\"3\" stroke=\"blue\""; + + writeln!( + result, + "", + )?; + writeln!(result, " leased lsn at {}", seg.lsn)?; + writeln!(result, "")?; + } + if self.sizes[seg_id].method == SegmentMethod::SnapshotHere { writeln!( result, diff --git a/libs/tracing-utils/Cargo.toml b/libs/tracing-utils/Cargo.toml index 512a748124..5ea8db6b42 100644 --- a/libs/tracing-utils/Cargo.toml +++ b/libs/tracing-utils/Cargo.toml @@ -14,5 +14,3 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tracing.workspace = true tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true - -workspace_hack.workspace = true diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index a6a081c5c1..6e593eeac1 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -20,7 +20,6 @@ bincode.workspace = true bytes.workspace = true camino.workspace = true chrono.workspace = true -heapless.workspace = true hex = { workspace = true, features = ["serde"] } humantime.workspace = true hyper = { workspace = true, features = ["full"] } @@ -40,6 +39,7 @@ thiserror.workspace = true tokio.workspace = true tokio-tar.workspace = true tokio-util.workspace = true +toml_edit = { workspace = 
true, features = ["serde"] } tracing.workspace = true tracing-error.workspace = true tracing-subscriber = { workspace = true, features = ["json", "registry"] } @@ -54,7 +54,6 @@ walkdir.workspace = true pq_proto.workspace = true postgres_connection.workspace = true metrics.workspace = true -workspace_hack.workspace = true const_format.workspace = true @@ -71,6 +70,7 @@ criterion.workspace = true hex-literal.workspace = true camino-tempfile.workspace = true serde_assert.workspace = true +tokio = { workspace = true, features = ["test-util"] } [[bench]] name = "benchmarks" diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index 03e65f74fe..7b735875b7 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -18,21 +18,25 @@ const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA; #[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq)] #[serde(rename_all = "lowercase")] pub enum Scope { - // Provides access to all data for a specific tenant (specified in `struct Claims` below) + /// Provides access to all data for a specific tenant (specified in `struct Claims` below) // TODO: join these two? Tenant, - // Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs. - // Should only be used e.g. for status check/tenant creation/list. + /// Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs. + /// Should only be used e.g. for status check/tenant creation/list. PageServerApi, - // Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs. - // Should only be used e.g. for status check. - // Currently also used for connection from any pageserver to any safekeeper. + /// Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs. + /// Should only be used e.g. for status check. + /// Currently also used for connection from any pageserver to any safekeeper. 
SafekeeperData, - // The scope used by pageservers in upcalls to storage controller and cloud control plane + /// The scope used by pageservers in upcalls to storage controller and cloud control plane #[serde(rename = "generations_api")] GenerationsApi, - // Allows access to control plane managment API and some storage controller endpoints. + /// Allows access to control plane managment API and some storage controller endpoints. Admin, + + /// Allows access to storage controller APIs used by the scrubber, to interrogate the state + /// of a tenant & post scrub results. + Scrubber, } /// JWT payload. See docs/authentication.md for the format diff --git a/libs/utils/src/circuit_breaker.rs b/libs/utils/src/circuit_breaker.rs new file mode 100644 index 0000000000..720ea39d4f --- /dev/null +++ b/libs/utils/src/circuit_breaker.rs @@ -0,0 +1,114 @@ +use std::{ + fmt::Display, + time::{Duration, Instant}, +}; + +use metrics::IntCounter; + +/// Circuit breakers are for operations that are expensive and fallible: if they fail repeatedly, +/// we will stop attempting them for some period of time, to avoid denial-of-service from retries, and +/// to mitigate the log spam from repeated failures. +pub struct CircuitBreaker { + /// An identifier that enables us to log useful errors when a circuit is broken + name: String, + + /// Consecutive failures since last success + fail_count: usize, + + /// How many consecutive failures before we break the circuit + fail_threshold: usize, + + /// If circuit is broken, when was it broken? + broken_at: Option, + + /// If set, we will auto-reset the circuit this long after it was broken. If None, broken + /// circuits stay broken forever, or until success() is called. + reset_period: Option, + + /// If this is true, no actual circuit-breaking happens. This is for overriding a circuit breaker + /// to permit something to keep running even if it would otherwise have tripped it. 
+ short_circuit: bool, +} + +impl CircuitBreaker { + pub fn new(name: String, fail_threshold: usize, reset_period: Option) -> Self { + Self { + name, + fail_count: 0, + fail_threshold, + broken_at: None, + reset_period, + short_circuit: false, + } + } + + /// Construct an unbreakable circuit breaker, for use in unit tests etc. + pub fn short_circuit() -> Self { + Self { + name: String::new(), + fail_threshold: 0, + fail_count: 0, + broken_at: None, + reset_period: None, + short_circuit: true, + } + } + + pub fn fail(&mut self, metric: &IntCounter, error: E) + where + E: Display, + { + if self.short_circuit { + return; + } + + self.fail_count += 1; + if self.broken_at.is_none() && self.fail_count >= self.fail_threshold { + self.break_circuit(metric, error); + } + } + + /// Call this after successfully executing an operation + pub fn success(&mut self, metric: &IntCounter) { + self.fail_count = 0; + if let Some(broken_at) = &self.broken_at { + tracing::info!(breaker=%self.name, "Circuit breaker failure ended (was broken for {})", + humantime::format_duration(broken_at.elapsed())); + self.broken_at = None; + metric.inc(); + } + } + + /// Call this before attempting an operation, and skip the operation if we are currently broken. + pub fn is_broken(&mut self) -> bool { + if self.short_circuit { + return false; + } + + if let Some(broken_at) = self.broken_at { + match self.reset_period { + Some(reset_period) if broken_at.elapsed() > reset_period => { + self.reset_circuit(); + false + } + _ => true, + } + } else { + false + } + } + + fn break_circuit(&mut self, metric: &IntCounter, error: E) + where + E: Display, + { + self.broken_at = Some(Instant::now()); + tracing::error!(breaker=%self.name, "Circuit breaker broken! 
Last error: {error}"); + metric.inc(); + } + + fn reset_circuit(&mut self) { + self.broken_at = None; + self.fail_count = 0; + } +} diff --git a/libs/utils/src/completion.rs b/libs/utils/src/completion.rs index 2fef8d35df..f65c080ad4 100644 --- a/libs/utils/src/completion.rs +++ b/libs/utils/src/completion.rs @@ -5,13 +5,40 @@ use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker}; /// Can be cloned, moved and kept around in futures as "guard objects". #[derive(Clone)] pub struct Completion { - _token: TaskTrackerToken, + token: TaskTrackerToken, +} + +impl std::fmt::Debug for Completion { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Completion") + .field("siblings", &self.token.task_tracker().len()) + .finish() + } +} + +impl Completion { + /// Returns true if this completion is associated with the given barrier. + pub fn blocks(&self, barrier: &Barrier) -> bool { + TaskTracker::ptr_eq(self.token.task_tracker(), &barrier.0) + } + + pub fn barrier(&self) -> Barrier { + Barrier(self.token.task_tracker().clone()) + } } /// Barrier will wait until all clones of [`Completion`] have been dropped. #[derive(Clone)] pub struct Barrier(TaskTracker); +impl std::fmt::Debug for Barrier { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Barrier") + .field("remaining", &self.0.len()) + .finish() + } +} + impl Default for Barrier { fn default() -> Self { let (_, rx) = channel(); @@ -51,5 +78,5 @@ pub fn channel() -> (Completion, Barrier) { tracker.close(); let token = tracker.token(); - (Completion { _token: token }, Barrier(tracker)) + (Completion { token }, Barrier(tracker)) } diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs index b703e883de..5970836033 100644 --- a/libs/utils/src/generation.rs +++ b/libs/utils/src/generation.rs @@ -9,20 +9,11 @@ use serde::{Deserialize, Serialize}; /// numbers are used. 
#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)] pub enum Generation { - // Generations with this magic value will not add a suffix to S3 keys, and will not - // be included in persisted index_part.json. This value is only to be used - // during migration from pre-generation metadata to generation-aware metadata, - // and should eventually go away. - // - // A special Generation is used rather than always wrapping Generation in an Option, - // so that code handling generations doesn't have to be aware of the legacy - // case everywhere it touches a generation. + // The None Generation is used in the metadata of layers written before generations were + // introduced. A running Tenant always has a valid generation, but the layer metadata may + // include None generations. None, - // Generations with this magic value may never be used to construct S3 keys: - // we will panic if someone tries to. This is for Tenants in the "Broken" state, - // so that we can satisfy their constructor with a Generation without risking - // a code bug using it in an S3 write (broken tenants should never write) - Broken, + Valid(u32), } @@ -42,11 +33,6 @@ impl Generation { Self::None } - // Create a new generation that will panic if you try to use get_suffix - pub fn broken() -> Self { - Self::Broken - } - pub const fn new(v: u32) -> Self { Self::Valid(v) } @@ -60,9 +46,6 @@ impl Generation { match self { Self::Valid(v) => GenerationFileSuffix(Some(*v)), Self::None => GenerationFileSuffix(None), - Self::Broken => { - panic!("Tried to use a broken generation"); - } } } @@ -86,7 +69,6 @@ impl Generation { } } Self::None => Self::None, - Self::Broken => panic!("Attempted to use a broken generation"), } } @@ -95,7 +77,6 @@ impl Generation { match self { Self::Valid(n) => Self::Valid(*n + 1), Self::None => Self::Valid(1), - Self::Broken => panic!("Attempted to use a broken generation"), } } @@ -128,7 +109,7 @@ impl Serialize for Generation { if let Self::Valid(v) = self { 
v.serialize(serializer) } else { - // We should never be asked to serialize a None or Broken. Structures + // We should never be asked to serialize a None. Structures // that include an optional generation should convert None to an // Option::None Err(serde::ser::Error::custom( @@ -159,9 +140,6 @@ impl Debug for Generation { Self::None => { write!(f, "") } - Self::Broken => { - write!(f, "") - } } } } diff --git a/libs/utils/src/history_buffer.rs b/libs/utils/src/history_buffer.rs deleted file mode 100644 index bd35e2bad6..0000000000 --- a/libs/utils/src/history_buffer.rs +++ /dev/null @@ -1,196 +0,0 @@ -//! A heapless buffer for events of sorts. - -use std::ops; - -use heapless::HistoryBuffer; - -#[derive(Debug, Clone)] -pub struct HistoryBufferWithDropCounter { - buffer: HistoryBuffer, - drop_count: u64, -} - -impl HistoryBufferWithDropCounter { - pub fn write(&mut self, data: T) { - let len_before = self.buffer.len(); - self.buffer.write(data); - let len_after = self.buffer.len(); - self.drop_count += u64::from(len_before == len_after); - } - pub fn drop_count(&self) -> u64 { - self.drop_count - } - pub fn map U>(&self, f: F) -> HistoryBufferWithDropCounter { - let mut buffer = HistoryBuffer::new(); - buffer.extend(self.buffer.oldest_ordered().map(f)); - HistoryBufferWithDropCounter:: { - buffer, - drop_count: self.drop_count, - } - } -} - -impl Default for HistoryBufferWithDropCounter { - fn default() -> Self { - Self { - buffer: HistoryBuffer::default(), - drop_count: 0, - } - } -} - -impl ops::Deref for HistoryBufferWithDropCounter { - type Target = HistoryBuffer; - - fn deref(&self) -> &Self::Target { - &self.buffer - } -} - -#[derive(serde::Serialize, serde::Deserialize)] -struct SerdeRepr { - buffer: Vec, - buffer_size: usize, - drop_count: u64, -} - -impl<'a, T, const L: usize> From<&'a HistoryBufferWithDropCounter> for SerdeRepr -where - T: Clone + serde::Serialize, -{ - fn from(value: &'a HistoryBufferWithDropCounter) -> Self { - let 
HistoryBufferWithDropCounter { buffer, drop_count } = value; - SerdeRepr { - buffer: buffer.iter().cloned().collect(), - buffer_size: L, - drop_count: *drop_count, - } - } -} - -impl serde::Serialize for HistoryBufferWithDropCounter -where - T: Clone + serde::Serialize, -{ - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - SerdeRepr::from(self).serialize(serializer) - } -} - -impl<'de, T, const L: usize> serde::de::Deserialize<'de> for HistoryBufferWithDropCounter -where - T: Clone + serde::Deserialize<'de>, -{ - fn deserialize(deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - let SerdeRepr { - buffer: des_buffer, - drop_count, - buffer_size, - } = SerdeRepr::::deserialize(deserializer)?; - if buffer_size != L { - use serde::de::Error; - return Err(D::Error::custom(format!( - "invalid buffer_size, expecting {L} got {buffer_size}" - ))); - } - let mut buffer = HistoryBuffer::new(); - buffer.extend(des_buffer); - Ok(HistoryBufferWithDropCounter { buffer, drop_count }) - } -} - -#[cfg(test)] -mod test { - use super::HistoryBufferWithDropCounter; - - #[test] - fn test_basics() { - let mut b = HistoryBufferWithDropCounter::::default(); - b.write(1); - b.write(2); - b.write(3); - assert!(b.iter().any(|e| *e == 2)); - assert!(b.iter().any(|e| *e == 3)); - assert!(!b.iter().any(|e| *e == 1)); - - // round-trip serde - let round_tripped: HistoryBufferWithDropCounter = - serde_json::from_str(&serde_json::to_string(&b).unwrap()).unwrap(); - assert_eq!( - round_tripped.iter().cloned().collect::>(), - b.iter().cloned().collect::>() - ); - } - - #[test] - fn test_drop_count_works() { - let mut b = HistoryBufferWithDropCounter::<_, 2>::default(); - b.write(1); - assert_eq!(b.drop_count(), 0); - b.write(2); - assert_eq!(b.drop_count(), 0); - b.write(3); - assert_eq!(b.drop_count(), 1); - b.write(4); - assert_eq!(b.drop_count(), 2); - } - - #[test] - fn test_clone_works() { - let mut b = HistoryBufferWithDropCounter::<_, 
2>::default(); - b.write(1); - b.write(2); - b.write(3); - assert_eq!(b.drop_count(), 1); - let mut c = b.clone(); - assert_eq!(c.drop_count(), 1); - assert!(c.iter().any(|e| *e == 2)); - assert!(c.iter().any(|e| *e == 3)); - assert!(!c.iter().any(|e| *e == 1)); - - c.write(4); - assert!(c.iter().any(|e| *e == 4)); - assert!(!b.iter().any(|e| *e == 4)); - } - - #[test] - fn test_map() { - let mut b = HistoryBufferWithDropCounter::<_, 2>::default(); - - b.write(1); - assert_eq!(b.drop_count(), 0); - { - let c = b.map(|i| i + 10); - assert_eq!(c.oldest_ordered().cloned().collect::>(), vec![11]); - assert_eq!(c.drop_count(), 0); - } - - b.write(2); - assert_eq!(b.drop_count(), 0); - { - let c = b.map(|i| i + 10); - assert_eq!( - c.oldest_ordered().cloned().collect::>(), - vec![11, 12] - ); - assert_eq!(c.drop_count(), 0); - } - - b.write(3); - assert_eq!(b.drop_count(), 1); - { - let c = b.map(|i| i + 10); - assert_eq!( - c.oldest_ordered().cloned().collect::>(), - vec![12, 13] - ); - assert_eq!(c.drop_count(), 1); - } - } -} diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index f8a5f68131..8ee5abd434 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -52,17 +52,17 @@ struct RequestId(String); /// There could be other ways to implement similar functionality: /// /// * procmacros placed on top of all handler methods -/// With all the drawbacks of procmacros, brings no difference implementation-wise, -/// and little code reduction compared to the existing approach. +/// With all the drawbacks of procmacros, brings no difference implementation-wise, +/// and little code reduction compared to the existing approach. /// /// * Another `TraitExt` with e.g. the `get_with_span`, `post_with_span` methods to do similar logic, -/// implemented for [`RouterBuilder`]. -/// Could be simpler, but we don't want to depend on [`routerify`] more, targeting to use other library later. +/// implemented for [`RouterBuilder`]. 
+/// Could be simpler, but we don't want to depend on [`routerify`] more, targeting to use other library later. /// /// * In theory, a span guard could've been created in a pre-request middleware and placed into a global collection, to be dropped -/// later, in a post-response middleware. -/// Due to suspendable nature of the futures, would give contradictive results which is exactly the opposite of what `tracing-futures` -/// tries to achive with its `.instrument` used in the current approach. +/// later, in a post-response middleware. +/// Due to suspendable nature of the futures, would give contradictive results which is exactly the opposite of what `tracing-futures` +/// tries to achive with its `.instrument` used in the current approach. /// /// If needed, a declarative macro to substitute the |r| ... closure boilerplate could be introduced. pub async fn request_span(request: Request, handler: H) -> R::Output diff --git a/libs/utils/src/http/json.rs b/libs/utils/src/http/json.rs index 7ca62561fe..6c25440b42 100644 --- a/libs/utils/src/http/json.rs +++ b/libs/utils/src/http/json.rs @@ -8,22 +8,15 @@ use super::error::ApiError; pub async fn json_request Deserialize<'de>>( request: &mut Request, ) -> Result { - json_request_or_empty_body(request) - .await? 
- .context("missing request body") - .map_err(ApiError::BadRequest) -} - -/// Will be removed as part of -pub async fn json_request_or_empty_body Deserialize<'de>>( - request: &mut Request, -) -> Result, ApiError> { let body = hyper::body::aggregate(request.body_mut()) .await .context("Failed to read request body") .map_err(ApiError::BadRequest)?; + if body.remaining() == 0 { - return Ok(None); + return Err(ApiError::BadRequest(anyhow::anyhow!( + "missing request body" + ))); } let mut deser = serde_json::de::Deserializer::from_reader(body.reader()); @@ -31,7 +24,6 @@ pub async fn json_request_or_empty_body Deserialize<'de>>( serde_path_to_error::deserialize(&mut deser) // intentionally stringify because the debug version is not helpful in python logs .map_err(|e| anyhow::anyhow!("Failed to parse json request: {e}")) - .map(Some) .map_err(ApiError::BadRequest) } diff --git a/libs/utils/src/http/request.rs b/libs/utils/src/http/request.rs index 766bbfc9df..8b8ed5a67f 100644 --- a/libs/utils/src/http/request.rs +++ b/libs/utils/src/http/request.rs @@ -74,6 +74,15 @@ pub fn parse_query_param>( .transpose() } +pub fn must_parse_query_param>( + request: &Request, + param_name: &str, +) -> Result { + parse_query_param(request, param_name)?.ok_or_else(|| { + ApiError::BadRequest(anyhow!("no {param_name} specified in query parameters")) + }) +} + pub async fn ensure_no_body(request: &mut Request) -> Result<(), ApiError> { match request.body_mut().data().await { Some(_) => Err(ApiError::BadRequest(anyhow!("Unexpected request body"))), diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs index 0409001f4f..db468e3054 100644 --- a/libs/utils/src/id.rs +++ b/libs/utils/src/id.rs @@ -302,17 +302,6 @@ pub struct TenantId(Id); id_newtype!(TenantId); -/// Neon Connection Id identifies long-lived connections (for example a pagestream -/// connection with the page_service). 
Is used for better logging and tracing -/// -/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look -/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`. -/// See [`Id`] for alternative ways to serialize it. -#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)] -pub struct ConnectionId(Id); - -id_newtype!(ConnectionId); - // A pair uniquely identifying Neon instance. #[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct TenantTimelineId { diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 2953f0aad4..f4fc0ba57b 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -26,6 +26,8 @@ pub mod auth; // utility functions and helper traits for unified unique id generation/serialization etc. pub mod id; +pub mod shard; + mod hex; pub use hex::Hex; @@ -57,8 +59,6 @@ pub mod signals; pub mod fs_ext; -pub mod history_buffer; - pub mod measured_stream; pub mod serde_percent; @@ -94,6 +94,10 @@ pub mod env; pub mod poison; +pub mod toml_edit_ext; + +pub mod circuit_breaker; + /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// /// we have several cases: @@ -124,7 +128,7 @@ pub mod poison; /// /// ############################################################################################# /// TODO this macro is not the way the library is intended to be used, see for details. -/// We use `cachepot` to reduce our current CI build times: +/// We used `cachepot` to reduce our current CI build times: /// Yet, it seems to ignore the GIT_VERSION env variable, passed to Docker build, even with build.rs that contains /// `println!("cargo:rerun-if-env-changed=GIT_VERSION");` code for cachepot cache invalidation. /// The problem needs further investigation and regular `const` declaration instead of a macro. 
diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs new file mode 100644 index 0000000000..f6b430657e --- /dev/null +++ b/libs/utils/src/shard.rs @@ -0,0 +1,452 @@ +//! See `pageserver_api::shard` for description on sharding. + +use std::{ops::RangeInclusive, str::FromStr}; + +use hex::FromHex; +use serde::{Deserialize, Serialize}; + +use crate::id::TenantId; + +#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] +pub struct ShardNumber(pub u8); + +#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] +pub struct ShardCount(pub u8); + +/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant, +/// when we need to know which shard we're dealing with, but do not need to know the full +/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know +/// the fully qualified TenantShardId. +#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] +pub struct ShardIndex { + pub shard_number: ShardNumber, + pub shard_count: ShardCount, +} + +/// Formatting helper, for generating the `shard_id` label in traces. +pub struct ShardSlug<'a>(&'a TenantShardId); + +/// TenantShardId globally identifies a particular shard in a particular tenant. +/// +/// These are written as `-`, for example: +/// # The second shard in a two-shard tenant +/// 072f1291a5310026820b2fe4b2968934-0102 +/// +/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without +/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables +/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`]. +/// +/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs, +/// is both forward and backward compatible with TenantId: a legacy TenantId can be +/// decoded as a TenantShardId, and when re-encoded it will be parseable +/// as a TenantId. 
+#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] +pub struct TenantShardId { + pub tenant_id: TenantId, + pub shard_number: ShardNumber, + pub shard_count: ShardCount, +} + +impl ShardCount { + pub const MAX: Self = Self(u8::MAX); + pub const MIN: Self = Self(0); + + /// The internal value of a ShardCount may be zero, which means "1 shard, but use + /// legacy format for TenantShardId that excludes the shard suffix", also known + /// as [`TenantShardId::unsharded`]. + /// + /// This method returns the actual number of shards, i.e. if our internal value is + /// zero, we return 1 (unsharded tenants have 1 shard). + pub fn count(&self) -> u8 { + if self.0 > 0 { + self.0 + } else { + 1 + } + } + + /// The literal internal value: this is **not** the number of shards in the + /// tenant, as we have a special zero value for legacy unsharded tenants. Use + /// [`Self::count`] if you want to know the cardinality of shards. + pub fn literal(&self) -> u8 { + self.0 + } + + /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but + /// uses the legacy format for `TenantShardId`. See also the documentation for + /// [`Self::count`]. + pub fn is_unsharded(&self) -> bool { + self.0 == 0 + } + + /// `v` may be zero, or the number of shards in the tenant. `v` is what + /// [`Self::literal`] would return. + pub const fn new(val: u8) -> Self { + Self(val) + } +} + +impl ShardNumber { + pub const MAX: Self = Self(u8::MAX); +} + +impl TenantShardId { + pub fn unsharded(tenant_id: TenantId) -> Self { + Self { + tenant_id, + shard_number: ShardNumber(0), + shard_count: ShardCount(0), + } + } + + /// The range of all TenantShardId that belong to a particular TenantId. This is useful when + /// you have a BTreeMap of TenantShardId, and are querying by TenantId. 
+ pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive { + RangeInclusive::new( + Self { + tenant_id, + shard_number: ShardNumber(0), + shard_count: ShardCount(0), + }, + Self { + tenant_id, + shard_number: ShardNumber::MAX, + shard_count: ShardCount::MAX, + }, + ) + } + + pub fn shard_slug(&self) -> impl std::fmt::Display + '_ { + ShardSlug(self) + } + + /// Convenience for code that has special behavior on the 0th shard. + pub fn is_shard_zero(&self) -> bool { + self.shard_number == ShardNumber(0) + } + + /// The "unsharded" value is distinct from simply having a single shard: it represents + /// a tenant which is not shard-aware at all, and whose storage paths will not include + /// a shard suffix. + pub fn is_unsharded(&self) -> bool { + self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded() + } + + /// Convenience for dropping the tenant_id and just getting the ShardIndex: this + /// is useful when logging from code that is already in a span that includes tenant ID, to + /// keep messages reasonably terse. + pub fn to_index(&self) -> ShardIndex { + ShardIndex { + shard_number: self.shard_number, + shard_count: self.shard_count, + } + } + + /// Calculate the children of this TenantShardId when splitting the overall tenant into + /// the given number of shards. + pub fn split(&self, new_shard_count: ShardCount) -> Vec { + let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1); + let mut child_shards = Vec::new(); + for shard_number in 0..ShardNumber(new_shard_count.0).0 { + // Key mapping is based on a round robin mapping of key hash modulo shard count, + // so our child shards are the ones which the same keys would map to. 
+ if shard_number % effective_old_shard_count == self.shard_number.0 { + child_shards.push(TenantShardId { + tenant_id: self.tenant_id, + shard_number: ShardNumber(shard_number), + shard_count: new_shard_count, + }) + } + } + + child_shards + } +} + +impl<'a> std::fmt::Display for ShardSlug<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{:02x}{:02x}", + self.0.shard_number.0, self.0.shard_count.0 + ) + } +} + +impl std::fmt::Display for TenantShardId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if self.shard_count != ShardCount(0) { + write!(f, "{}-{}", self.tenant_id, self.shard_slug()) + } else { + // Legacy case (shard_count == 0) -- format as just the tenant id. Note that this + // is distinct from the normal single shard case (shard count == 1). + self.tenant_id.fmt(f) + } + } +} + +impl std::fmt::Debug for TenantShardId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // Debug is the same as Display: the compact hex representation + write!(f, "{}", self) + } +} + +impl std::str::FromStr for TenantShardId { + type Err = hex::FromHexError; + + fn from_str(s: &str) -> Result { + // Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count + if s.len() == 32 { + // Legacy case: no shard specified + Ok(Self { + tenant_id: TenantId::from_str(s)?, + shard_number: ShardNumber(0), + shard_count: ShardCount(0), + }) + } else if s.len() == 37 { + let bytes = s.as_bytes(); + let tenant_id = TenantId::from_hex(&bytes[0..32])?; + let mut shard_parts: [u8; 2] = [0u8; 2]; + hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?; + Ok(Self { + tenant_id, + shard_number: ShardNumber(shard_parts[0]), + shard_count: ShardCount(shard_parts[1]), + }) + } else { + Err(hex::FromHexError::InvalidStringLength) + } + } +} + +impl From<[u8; 18]> for TenantShardId { + fn from(b: [u8; 18]) -> Self { + let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap(); + + 
Self { + tenant_id: TenantId::from(tenant_id_bytes), + shard_number: ShardNumber(b[16]), + shard_count: ShardCount(b[17]), + } + } +} + +impl ShardIndex { + pub fn new(number: ShardNumber, count: ShardCount) -> Self { + Self { + shard_number: number, + shard_count: count, + } + } + pub fn unsharded() -> Self { + Self { + shard_number: ShardNumber(0), + shard_count: ShardCount(0), + } + } + + /// The "unsharded" value is distinct from simply having a single shard: it represents + /// a tenant which is not shard-aware at all, and whose storage paths will not include + /// a shard suffix. + pub fn is_unsharded(&self) -> bool { + self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0) + } + + /// For use in constructing remote storage paths: concatenate this with a TenantId + /// to get a fully qualified TenantShardId. + /// + /// Backward compat: this function returns an empty string if Self::is_unsharded, such + /// that the legacy pre-sharding remote key format is preserved. 
+ pub fn get_suffix(&self) -> String { + if self.is_unsharded() { + "".to_string() + } else { + format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0) + } + } +} + +impl std::fmt::Display for ShardIndex { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0) + } +} + +impl std::fmt::Debug for ShardIndex { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // Debug is the same as Display: the compact hex representation + write!(f, "{}", self) + } +} + +impl std::str::FromStr for ShardIndex { + type Err = hex::FromHexError; + + fn from_str(s: &str) -> Result { + // Expect format: 1 byte shard number, 1 byte shard count + if s.len() == 4 { + let bytes = s.as_bytes(); + let mut shard_parts: [u8; 2] = [0u8; 2]; + hex::decode_to_slice(bytes, &mut shard_parts)?; + Ok(Self { + shard_number: ShardNumber(shard_parts[0]), + shard_count: ShardCount(shard_parts[1]), + }) + } else { + Err(hex::FromHexError::InvalidStringLength) + } + } +} + +impl From<[u8; 2]> for ShardIndex { + fn from(b: [u8; 2]) -> Self { + Self { + shard_number: ShardNumber(b[0]), + shard_count: ShardCount(b[1]), + } + } +} + +impl Serialize for TenantShardId { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + if serializer.is_human_readable() { + serializer.collect_str(self) + } else { + // Note: while human encoding of [`TenantShardId`] is backward and forward + // compatible, this binary encoding is not. 
+ let mut packed: [u8; 18] = [0; 18]; + packed[0..16].clone_from_slice(&self.tenant_id.as_arr()); + packed[16] = self.shard_number.0; + packed[17] = self.shard_count.0; + + packed.serialize(serializer) + } + } +} + +impl<'de> Deserialize<'de> for TenantShardId { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + struct IdVisitor { + is_human_readable_deserializer: bool, + } + + impl<'de> serde::de::Visitor<'de> for IdVisitor { + type Value = TenantShardId; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + if self.is_human_readable_deserializer { + formatter.write_str("value in form of hex string") + } else { + formatter.write_str("value in form of integer array([u8; 18])") + } + } + + fn visit_seq(self, seq: A) -> Result + where + A: serde::de::SeqAccess<'de>, + { + let s = serde::de::value::SeqAccessDeserializer::new(seq); + let id: [u8; 18] = Deserialize::deserialize(s)?; + Ok(TenantShardId::from(id)) + } + + fn visit_str(self, v: &str) -> Result + where + E: serde::de::Error, + { + TenantShardId::from_str(v).map_err(E::custom) + } + } + + if deserializer.is_human_readable() { + deserializer.deserialize_str(IdVisitor { + is_human_readable_deserializer: true, + }) + } else { + deserializer.deserialize_tuple( + 18, + IdVisitor { + is_human_readable_deserializer: false, + }, + ) + } + } +} + +impl Serialize for ShardIndex { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + if serializer.is_human_readable() { + serializer.collect_str(self) + } else { + // Binary encoding is not used in index_part.json, but is included in anticipation of + // switching various structures (e.g. inter-process communication, remote metadata) to more + // compact binary encodings in future. 
+ let mut packed: [u8; 2] = [0; 2]; + packed[0] = self.shard_number.0; + packed[1] = self.shard_count.0; + packed.serialize(serializer) + } + } +} + +impl<'de> Deserialize<'de> for ShardIndex { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + struct IdVisitor { + is_human_readable_deserializer: bool, + } + + impl<'de> serde::de::Visitor<'de> for IdVisitor { + type Value = ShardIndex; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + if self.is_human_readable_deserializer { + formatter.write_str("value in form of hex string") + } else { + formatter.write_str("value in form of integer array([u8; 2])") + } + } + + fn visit_seq(self, seq: A) -> Result + where + A: serde::de::SeqAccess<'de>, + { + let s = serde::de::value::SeqAccessDeserializer::new(seq); + let id: [u8; 2] = Deserialize::deserialize(s)?; + Ok(ShardIndex::from(id)) + } + + fn visit_str(self, v: &str) -> Result + where + E: serde::de::Error, + { + ShardIndex::from_str(v).map_err(E::custom) + } + } + + if deserializer.is_human_readable() { + deserializer.deserialize_str(IdVisitor { + is_human_readable_deserializer: true, + }) + } else { + deserializer.deserialize_tuple( + 2, + IdVisitor { + is_human_readable_deserializer: false, + }, + ) + } + } +} diff --git a/libs/utils/src/sync/gate.rs b/libs/utils/src/sync/gate.rs index 156b99a010..16ec563fa7 100644 --- a/libs/utils/src/sync/gate.rs +++ b/libs/utils/src/sync/gate.rs @@ -78,8 +78,9 @@ impl Drop for GateGuard { } } -#[derive(Debug)] +#[derive(Debug, thiserror::Error)] pub enum GateError { + #[error("gate is closed")] GateClosed, } diff --git a/libs/utils/src/toml_edit_ext.rs b/libs/utils/src/toml_edit_ext.rs new file mode 100644 index 0000000000..ab5f7bdd95 --- /dev/null +++ b/libs/utils/src/toml_edit_ext.rs @@ -0,0 +1,22 @@ +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error("item is not a document")] + ItemIsNotADocument, + #[error(transparent)] + 
Serde(toml_edit::de::Error), +} + +pub fn deserialize_item(item: &toml_edit::Item) -> Result +where + T: serde::de::DeserializeOwned, +{ + let document: toml_edit::Document = match item { + toml_edit::Item::Table(toml) => toml.clone().into(), + toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => { + toml.clone().into_table().into() + } + _ => return Err(Error::ItemIsNotADocument), + }; + + toml_edit::de::from_document(document).map_err(Error::Serde) +} diff --git a/libs/walproposer/Cargo.toml b/libs/walproposer/Cargo.toml index 73aa073c44..2d442dc429 100644 --- a/libs/walproposer/Cargo.toml +++ b/libs/walproposer/Cargo.toml @@ -9,8 +9,6 @@ anyhow.workspace = true utils.workspace = true postgres_ffi.workspace = true -workspace_hack.workspace = true - [build-dependencies] anyhow.workspace = true bindgen.workspace = true diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index f7b72b205f..37b1e0fa87 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -1,3 +1,5 @@ +#![allow(clippy::todo)] + use std::ffi::CString; use crate::{ diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 4335f38f1e..0e748ee3db 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -49,6 +49,7 @@ postgres_backend.workspace = true postgres-protocol.workspace = true postgres-types.workspace = true rand.workspace = true +range-set-blaze = { version = "0.1.16", features = ["alloc"] } regex.workspace = true scopeguard.workspace = true serde.workspace = true @@ -62,6 +63,7 @@ sync_wrapper.workspace = true sysinfo.workspace = true tokio-tar.workspace = true thiserror.workspace = true +tikv-jemallocator.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] } tokio-epoll-uring.workspace = true tokio-io-timeout.workspace = true @@ -106,3 +108,7 @@ harness = false [[bench]] name = "bench_walredo" harness = false + +[[bench]] +name = 
"bench_ingest" +harness = false diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs new file mode 100644 index 0000000000..bd99f5289d --- /dev/null +++ b/pageserver/benches/bench_ingest.rs @@ -0,0 +1,254 @@ +use std::{env, num::NonZeroUsize}; + +use bytes::Bytes; +use camino::Utf8PathBuf; +use criterion::{criterion_group, criterion_main, Criterion}; +use pageserver::{ + config::PageServerConf, + context::{DownloadBehavior, RequestContext}, + l0_flush::{L0FlushConfig, L0FlushGlobalState}, + page_cache, + repository::Value, + task_mgr::TaskKind, + tenant::storage_layer::inmemory_layer::SerializedBatch, + tenant::storage_layer::InMemoryLayer, + virtual_file, +}; +use pageserver_api::{key::Key, shard::TenantShardId}; +use utils::{ + bin_ser::BeSer, + id::{TenantId, TimelineId}, +}; + +// A very cheap hash for generating non-sequential keys. +fn murmurhash32(mut h: u32) -> u32 { + h ^= h >> 16; + h = h.wrapping_mul(0x85ebca6b); + h ^= h >> 13; + h = h.wrapping_mul(0xc2b2ae35); + h ^= h >> 16; + h +} + +enum KeyLayout { + /// Sequential unique keys + Sequential, + /// Random unique keys + Random, + /// Random keys, but only use the bits from the mask of them + RandomReuse(u32), +} + +enum WriteDelta { + Yes, + No, +} + +async fn ingest( + conf: &'static PageServerConf, + put_size: usize, + put_count: usize, + key_layout: KeyLayout, + write_delta: WriteDelta, +) -> anyhow::Result<()> { + let mut lsn = utils::lsn::Lsn(1000); + let mut key = Key::from_i128(0x0); + + let timeline_id = TimelineId::generate(); + let tenant_id = TenantId::generate(); + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + + tokio::fs::create_dir_all(conf.timeline_path(&tenant_shard_id, &timeline_id)).await?; + + let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); + + let gate = utils::sync::gate::Gate::default(); + let entered = gate.enter().unwrap(); + + let layer = + InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, 
entered, &ctx).await?; + + let data = Value::Image(Bytes::from(vec![0u8; put_size])); + let data_ser_size = data.serialized_size().unwrap() as usize; + let ctx = RequestContext::new( + pageserver::task_mgr::TaskKind::WalReceiverConnectionHandler, + pageserver::context::DownloadBehavior::Download, + ); + + const BATCH_SIZE: usize = 16; + let mut batch = Vec::new(); + + for i in 0..put_count { + lsn += put_size as u64; + + // Generate lots of keys within a single relation, which simulates the typical bulk ingest case: people + // usually care the most about write performance when they're blasting a huge batch of data into a huge table. + match key_layout { + KeyLayout::Sequential => { + // Use sequential order to illustrate the experience a user is likely to have + // when ingesting bulk data. + key.field6 = i as u32; + } + KeyLayout::Random => { + // Use random-order keys to avoid giving a false advantage to data structures that are + // faster when inserting on the end. + key.field6 = murmurhash32(i as u32); + } + KeyLayout::RandomReuse(mask) => { + // Use low bits only, to limit cardinality + key.field6 = murmurhash32(i as u32) & mask; + } + } + + batch.push((key.to_compact(), lsn, data_ser_size, data.clone())); + if batch.len() >= BATCH_SIZE { + let this_batch = std::mem::take(&mut batch); + let serialized = SerializedBatch::from_values(this_batch); + layer.put_batch(serialized, &ctx).await?; + } + } + if !batch.is_empty() { + let this_batch = std::mem::take(&mut batch); + let serialized = SerializedBatch::from_values(this_batch); + layer.put_batch(serialized, &ctx).await?; + } + layer.freeze(lsn + 1).await; + + if matches!(write_delta, WriteDelta::Yes) { + let l0_flush_state = L0FlushGlobalState::new(L0FlushConfig::Direct { + max_concurrency: NonZeroUsize::new(1).unwrap(), + }); + let (_desc, path) = layer + .write_to_disk(&ctx, None, l0_flush_state.inner()) + .await? 
+ .unwrap(); + tokio::fs::remove_file(path).await?; + } + + Ok(()) +} + +/// Wrapper to instantiate a tokio runtime +fn ingest_main( + conf: &'static PageServerConf, + put_size: usize, + put_count: usize, + key_layout: KeyLayout, + write_delta: WriteDelta, +) { + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + + runtime.block_on(async move { + let r = ingest(conf, put_size, put_count, key_layout, write_delta).await; + if let Err(e) = r { + panic!("{e:?}"); + } + }); +} + +/// Declare a series of benchmarks for the Pageserver's ingest write path. +/// +/// This benchmark does not include WAL decode: it starts at InMemoryLayer::put_value, and ends either +/// at freezing the ephemeral layer, or writing the ephemeral layer out to an L0 (depending on whether WriteDelta is set). +/// +/// Genuine disk I/O is used, so expect results to differ depending on storage. However, when running on +/// a fast disk, CPU is the bottleneck at time of writing. 
+fn criterion_benchmark(c: &mut Criterion) { + let temp_dir_parent: Utf8PathBuf = env::current_dir().unwrap().try_into().unwrap(); + let temp_dir = camino_tempfile::tempdir_in(temp_dir_parent).unwrap(); + eprintln!("Data directory: {}", temp_dir.path()); + + let conf: &'static PageServerConf = Box::leak(Box::new( + pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()), + )); + virtual_file::init(16384, virtual_file::io_engine_for_bench()); + page_cache::init(conf.page_cache_size); + + { + let mut group = c.benchmark_group("ingest-small-values"); + let put_size = 100usize; + let put_count = 128 * 1024 * 1024 / put_size; + group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64)); + group.sample_size(10); + group.bench_function("ingest 128MB/100b seq", |b| { + b.iter(|| { + ingest_main( + conf, + put_size, + put_count, + KeyLayout::Sequential, + WriteDelta::Yes, + ) + }) + }); + group.bench_function("ingest 128MB/100b rand", |b| { + b.iter(|| { + ingest_main( + conf, + put_size, + put_count, + KeyLayout::Random, + WriteDelta::Yes, + ) + }) + }); + group.bench_function("ingest 128MB/100b rand-1024keys", |b| { + b.iter(|| { + ingest_main( + conf, + put_size, + put_count, + KeyLayout::RandomReuse(0x3ff), + WriteDelta::Yes, + ) + }) + }); + group.bench_function("ingest 128MB/100b seq, no delta", |b| { + b.iter(|| { + ingest_main( + conf, + put_size, + put_count, + KeyLayout::Sequential, + WriteDelta::No, + ) + }) + }); + } + + { + let mut group = c.benchmark_group("ingest-big-values"); + let put_size = 8192usize; + let put_count = 128 * 1024 * 1024 / put_size; + group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64)); + group.sample_size(10); + group.bench_function("ingest 128MB/8k seq", |b| { + b.iter(|| { + ingest_main( + conf, + put_size, + put_count, + KeyLayout::Sequential, + WriteDelta::Yes, + ) + }) + }); + group.bench_function("ingest 128MB/8k seq, no delta", |b| { + b.iter(|| { + ingest_main( + 
conf, + put_size, + put_count, + KeyLayout::Sequential, + WriteDelta::No, + ) + }) + }); + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 1d02aa7709..1353e79f7c 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -1,3 +1,4 @@ +use criterion::measurement::WallTime; use pageserver::keyspace::{KeyPartitioning, KeySpace}; use pageserver::repository::Key; use pageserver::tenant::layer_map::LayerMap; @@ -15,7 +16,11 @@ use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; -use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use criterion::{black_box, criterion_group, criterion_main, BenchmarkGroup, Criterion}; + +fn fixture_path(relative: &str) -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative) +} fn build_layer_map(filename_dump: PathBuf) -> LayerMap { let mut layer_map = LayerMap::default(); @@ -109,7 +114,7 @@ fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning // between each test run. 
fn bench_from_captest_env(c: &mut Criterion) { // TODO consider compressing this file - let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt")); + let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt")); let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map); // Test with uniform query pattern @@ -139,7 +144,7 @@ fn bench_from_captest_env(c: &mut Criterion) { fn bench_from_real_project(c: &mut Criterion) { // Init layer map let now = Instant::now(); - let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt")); + let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt")); println!("Finished layer map init in {:?}", now.elapsed()); // Choose uniformly distributed queries @@ -242,7 +247,72 @@ fn bench_sequential(c: &mut Criterion) { group.finish(); } +fn bench_visibility_with_map( + group: &mut BenchmarkGroup, + layer_map: LayerMap, + read_points: Vec, + bench_name: &str, +) { + group.bench_function(bench_name, |b| { + b.iter(|| black_box(layer_map.get_visibility(read_points.clone()))); + }); +} + +// Benchmark using synthetic data. Arrange image layers on stacked diagonal lines. +fn bench_visibility(c: &mut Criterion) { + let mut group = c.benchmark_group("visibility"); + { + // Init layer map. Create 100_000 layers arranged in 1000 diagonal lines. 
+ let now = Instant::now(); + let mut layer_map = LayerMap::default(); + let mut updates = layer_map.batch_update(); + for i in 0..100_000 { + let i32 = (i as u32) % 100; + let zero = Key::from_hex("000000000000000000000000000000000000").unwrap(); + let layer = PersistentLayerDesc::new_img( + TenantShardId::unsharded(TenantId::generate()), + TimelineId::generate(), + zero.add(10 * i32)..zero.add(10 * i32 + 1), + Lsn(i), + 0, + ); + updates.insert_historic(layer); + } + updates.flush(); + println!("Finished layer map init in {:?}", now.elapsed()); + + let mut read_points = Vec::new(); + for i in (0..100_000).step_by(1000) { + read_points.push(Lsn(i)); + } + + bench_visibility_with_map(&mut group, layer_map, read_points, "sequential"); + } + + { + let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt")); + let read_points = vec![Lsn(0x1C760FA190)]; + bench_visibility_with_map(&mut group, layer_map, read_points, "real_map"); + + let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt")); + let read_points = vec![ + Lsn(0x1C760FA190), + Lsn(0x000000931BEAD539), + Lsn(0x000000931BF63011), + Lsn(0x000000931B33AE68), + Lsn(0x00000038E67ABFA0), + Lsn(0x000000931B33AE68), + Lsn(0x000000914E3F38F0), + Lsn(0x000000931B33AE68), + ]; + bench_visibility_with_map(&mut group, layer_map, read_points, "real_map_many_branches"); + } + + group.finish(); +} + criterion_group!(group_1, bench_from_captest_env); criterion_group!(group_2, bench_from_real_project); criterion_group!(group_3, bench_sequential); -criterion_main!(group_1, group_2, group_3); +criterion_group!(group_4, bench_visibility); +criterion_main!(group_1, group_2, group_3, group_4); diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index 5aab10e5d9..edc09d0bf2 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -48,6 +48,7 @@ //! medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms] //! 
``` +use anyhow::Context; use bytes::{Buf, Bytes}; use criterion::{BenchmarkId, Criterion}; use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager}; @@ -188,6 +189,7 @@ impl Request { manager .request_redo(*key, *lsn, base_img.clone(), records.clone(), *pg_version) .await + .context("request_redo") } fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord { diff --git a/pageserver/client/Cargo.toml b/pageserver/client/Cargo.toml index 0ed27602cd..a938367334 100644 --- a/pageserver/client/Cargo.toml +++ b/pageserver/client/Cargo.toml @@ -8,7 +8,7 @@ license.workspace = true pageserver_api.workspace = true thiserror.workspace = true async-trait.workspace = true -reqwest.workspace = true +reqwest = { workspace = true, features = [ "stream" ] } utils.workspace = true serde.workspace = true workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 69b86d9c46..ac3ff1bb89 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -1,6 +1,7 @@ use std::collections::HashMap; use bytes::Bytes; +use detach_ancestor::AncestorDetached; use pageserver_api::{models::*, shard::TenantShardId}; use reqwest::{IntoUrl, Method, StatusCode}; use utils::{ @@ -9,6 +10,8 @@ use utils::{ lsn::Lsn, }; +pub use reqwest::Body as ReqwestBody; + pub mod util; #[derive(Debug, Clone)] @@ -20,6 +23,9 @@ pub struct Client { #[derive(thiserror::Error, Debug)] pub enum Error { + #[error("send request: {0}")] + SendRequest(reqwest::Error), + #[error("receive body: {0}")] ReceiveBody(reqwest::Error), @@ -173,19 +179,30 @@ impl Client { self.request(Method::GET, uri, ()).await } + fn start_request( + &self, + method: Method, + uri: U, + ) -> reqwest::RequestBuilder { + let req = self.client.request(method, uri); + if let Some(value) = &self.authorization_header { + req.header(reqwest::header::AUTHORIZATION, value) + } else 
{ + req + } + } + async fn request_noerror( &self, method: Method, uri: U, body: B, ) -> Result { - let req = self.client.request(method, uri); - let req = if let Some(value) = &self.authorization_header { - req.header(reqwest::header::AUTHORIZATION, value) - } else { - req - }; - req.json(&body).send().await.map_err(Error::ReceiveBody) + self.start_request(method, uri) + .json(&body) + .send() + .await + .map_err(Error::ReceiveBody) } async fn request( @@ -205,15 +222,6 @@ impl Client { Ok(()) } - pub async fn tenant_create(&self, req: &TenantCreateRequest) -> Result { - let uri = format!("{}/v1/tenant", self.mgmt_api_endpoint); - self.request(Method::POST, &uri, req) - .await? - .json() - .await - .map_err(Error::ReceiveBody) - } - /// The tenant deletion API can return 202 if deletion is incomplete, or /// 404 if it is complete. Callers are responsible for checking the status /// code and retrying. Error codes other than 404 will return Err(). @@ -411,6 +419,23 @@ impl Client { } } + pub async fn timeline_detach_ancestor( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor", + self.mgmt_api_endpoint + ); + + self.request(Method::PUT, &uri, ()) + .await? 
+ .json() + .await + .map_err(Error::ReceiveBody) + } + pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> { let uri = format!( "{}/v1/tenant/{}/reset", @@ -618,4 +643,53 @@ impl Client { }), } } + + pub async fn import_basebackup( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + base_lsn: Lsn, + end_lsn: Lsn, + pg_version: u32, + basebackup_tarball: ReqwestBody, + ) -> Result<()> { + let uri = format!( + "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_basebackup?base_lsn={base_lsn}&end_lsn={end_lsn}&pg_version={pg_version}", + self.mgmt_api_endpoint, + ); + self.start_request(Method::PUT, uri) + .body(basebackup_tarball) + .send() + .await + .map_err(Error::SendRequest)? + .error_from_body() + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn import_wal( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + start_lsn: Lsn, + end_lsn: Lsn, + wal_tarball: ReqwestBody, + ) -> Result<()> { + let uri = format!( + "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_wal?start_lsn={start_lsn}&end_lsn={end_lsn}", + self.mgmt_api_endpoint, + ); + self.start_request(Method::PUT, uri) + .body(wal_tarball) + .send() + .await + .map_err(Error::SendRequest)? + .error_from_body() + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } } diff --git a/pageserver/compaction/src/interface.rs b/pageserver/compaction/src/interface.rs index 35519b5d0a..5bc9b5ca1d 100644 --- a/pageserver/compaction/src/interface.rs +++ b/pageserver/compaction/src/interface.rs @@ -131,7 +131,7 @@ impl CompactionKey for Key { pub type CompactionKeySpace = Vec>; /// Functions needed from all layers. 
-pub trait CompactionLayer { +pub trait CompactionLayer { fn key_range(&self) -> &Range; fn lsn_range(&self) -> &Range; diff --git a/pageserver/ctl/src/draw_timeline_dir.rs b/pageserver/ctl/src/draw_timeline_dir.rs index 389519c65a..bc939f9688 100644 --- a/pageserver/ctl/src/draw_timeline_dir.rs +++ b/pageserver/ctl/src/draw_timeline_dir.rs @@ -83,10 +83,18 @@ fn parse_filename(name: &str) -> (Range, Range) { let keys: Vec<&str> = split[0].split('-').collect(); let mut lsns: Vec<&str> = split[1].split('-').collect(); + // The current format of the layer file name: 000000067F0000000400000B150100000000-000000067F0000000400000D350100000000__00000000014B7AC8-v1-00000001 + + // Handle generation number `-00000001` part if lsns.last().expect("should").len() == 8 { lsns.pop(); } + // Handle version number `-v1` part + if lsns.last().expect("should").starts_with('v') { + lsns.pop(); + } + if lsns.len() == 1 { lsns.push(lsns[0]); } diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index 50c3ac4c61..3fabf62987 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -178,8 +178,8 @@ async fn main() -> anyhow::Result<()> { let toml_item = toml_document .get("remote_storage") .expect("need remote_storage"); - let config = RemoteStorageConfig::from_toml(toml_item)?.expect("incomplete config"); - let storage = remote_storage::GenericRemoteStorage::from_config(&config); + let config = RemoteStorageConfig::from_toml(toml_item)?; + let storage = remote_storage::GenericRemoteStorage::from_config(&config).await; let cancel = CancellationToken::new(); storage .unwrap() diff --git a/pageserver/src/auth.rs b/pageserver/src/auth.rs index 4785c8c4c5..9e3dedb75a 100644 --- a/pageserver/src/auth.rs +++ b/pageserver/src/auth.rs @@ -14,12 +14,14 @@ pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result< } (Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope (Scope::PageServerApi, Some(_)) => Ok(()), // 
access to tenant api using PageServerApi scope - (Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi, _) => Err(AuthError( - format!( - "JWT scope '{:?}' is ineligible for Pageserver auth", - claims.scope - ) - .into(), - )), + (Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi | Scope::Scrubber, _) => { + Err(AuthError( + format!( + "JWT scope '{:?}' is ineligible for Pageserver auth", + claims.scope + ) + .into(), + )) + } } } diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 0f057a4368..207f781e1b 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -348,35 +348,36 @@ where self.add_rel(rel, rel).await?; } } - - for (path, content) in self - .timeline - .list_aux_files(self.lsn, self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))? - { - if path.starts_with("pg_replslot") { - let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN; - let restart_lsn = Lsn(u64::from_le_bytes( - content[offs..offs + 8].try_into().unwrap(), - )); - info!("Replication slot {} restart LSN={}", path, restart_lsn); - min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn); - } else if path == "pg_logical/replorigin_checkpoint" { - // replorigin_checkoint is written only on compute shutdown, so it contains - // deteriorated values. So we generate our own version of this file for the particular LSN - // based on information about replorigins extracted from transaction commit records. - // In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all, - // but now we should handle (skip) it for backward compatibility. - continue; - } - let header = new_tar_header(&path, content.len() as u64)?; - self.ar - .append(&header, &*content) - .await - .context("could not add aux file to basebackup tarball")?; - } } + + for (path, content) in self + .timeline + .list_aux_files(self.lsn, self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))? 
+ { + if path.starts_with("pg_replslot") { + let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN; + let restart_lsn = Lsn(u64::from_le_bytes( + content[offs..offs + 8].try_into().unwrap(), + )); + info!("Replication slot {} restart LSN={}", path, restart_lsn); + min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn); + } else if path == "pg_logical/replorigin_checkpoint" { + // replorigin_checkoint is written only on compute shutdown, so it contains + // deteriorated values. So we generate our own version of this file for the particular LSN + // based on information about replorigins extracted from transaction commit records. + // In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all, + // but now we should handle (skip) it for backward compatibility. + continue; + } + let header = new_tar_header(&path, content.len() as u64)?; + self.ar + .append(&header, &*content) + .await + .context("could not add aux file to basebackup tarball")?; + } + if min_restart_lsn != Lsn::MAX { info!( "Min restart LSN for logical replication is {}", diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index ba5b2608bd..da0c11d9bf 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -2,35 +2,36 @@ //! Main entry point for the Page Server executable. 
+use std::env; use std::env::{var, VarError}; use std::io::Read; use std::sync::Arc; use std::time::Duration; -use std::{env, ops::ControlFlow, str::FromStr}; use anyhow::{anyhow, Context}; use camino::Utf8Path; use clap::{Arg, ArgAction, Command}; use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp}; +use pageserver::config::PageserverIdentity; use pageserver::control_plane_client::ControlPlaneClient; use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; -use pageserver::task_mgr::WALRECEIVER_RUNTIME; +use pageserver::task_mgr::{COMPUTE_REQUEST_RUNTIME, WALRECEIVER_RUNTIME}; use pageserver::tenant::{secondary, TenantSharedResources}; +use pageserver::{CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener}; use remote_storage::GenericRemoteStorage; use tokio::signal::unix::SignalKind; use tokio::time::Instant; +use tokio_util::sync::CancellationToken; use tracing::*; use metrics::set_build_info_metric; use pageserver::{ - config::{defaults::*, PageServerConf}, - context::{DownloadBehavior, RequestContext}, + config::PageServerConf, deletion_queue::DeletionQueue, http, page_cache, page_service, task_mgr, - task_mgr::TaskKind, - task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME}, + task_mgr::{BACKGROUND_RUNTIME, MGMT_REQUEST_RUNTIME}, tenant::mgr, virtual_file, }; @@ -47,6 +48,9 @@ use utils::{ project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + const PID_FILE_NAME: &str = "pageserver.pid"; const FEATURES: &[&str] = &[ @@ -81,18 +85,13 @@ fn main() -> anyhow::Result<()> { .with_context(|| format!("Error opening workdir '{workdir}'"))?; let cfg_file_path = workdir.join("pageserver.toml"); + let identity_file_path = workdir.join("identity.toml"); // Set CWD to workdir for non-daemon modes 
env::set_current_dir(&workdir) .with_context(|| format!("Failed to set application's current dir to '{workdir}'"))?; - let conf = match initialize_config(&cfg_file_path, arg_matches, &workdir)? { - ControlFlow::Continue(conf) => conf, - ControlFlow::Break(()) => { - info!("Pageserver config init successful"); - return Ok(()); - } - }; + let conf = initialize_config(&identity_file_path, &cfg_file_path, &workdir)?; // Initialize logging. // @@ -124,8 +123,8 @@ fn main() -> anyhow::Result<()> { // after setting up logging, log the effective IO engine choice and read path implementations info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); - info!(?conf.get_impl, "starting with get page implementation"); - info!(?conf.get_vectored_impl, "starting with vectored get page implementation"); + info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings"); + info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access"); let tenants_path = conf.tenants_path(); if !tenants_path.exists() { @@ -147,70 +146,55 @@ fn main() -> anyhow::Result<()> { } fn initialize_config( + identity_file_path: &Utf8Path, cfg_file_path: &Utf8Path, - arg_matches: clap::ArgMatches, workdir: &Utf8Path, -) -> anyhow::Result> { - let init = arg_matches.get_flag("init"); - - let file_contents: Option = match std::fs::File::open(cfg_file_path) { +) -> anyhow::Result<&'static PageServerConf> { + // The deployment orchestrator writes out an indentity file containing the node id + // for all pageservers. This file is the source of truth for the node id. In order + // to allow for rolling back pageserver releases, the node id is also included in + // the pageserver config that the deployment orchestrator writes to disk for the pageserver. + // A rolled back version of the pageserver will get the node id from the pageserver.toml + // config file. 
+ let identity = match std::fs::File::open(identity_file_path) { Ok(mut f) => { - if init { - anyhow::bail!("config file already exists: {cfg_file_path}"); + let md = f.metadata().context("stat config file")?; + if !md.is_file() { + anyhow::bail!("Pageserver found identity file but it is a dir entry: {identity_file_path}. Aborting start up ..."); } + + let mut s = String::new(); + f.read_to_string(&mut s).context("read identity file")?; + toml_edit::de::from_str::(&s)? + } + Err(e) => { + anyhow::bail!("Pageserver could not read identity file: {identity_file_path}: {e}. Aborting start up ..."); + } + }; + + let config: toml_edit::Document = match std::fs::File::open(cfg_file_path) { + Ok(mut f) => { let md = f.metadata().context("stat config file")?; if md.is_file() { let mut s = String::new(); f.read_to_string(&mut s).context("read config file")?; - Some(s.parse().context("parse config file toml")?) + s.parse().context("parse config file toml")? } else { anyhow::bail!("directory entry exists but is not a file: {cfg_file_path}"); } } - Err(e) if e.kind() == std::io::ErrorKind::NotFound => None, Err(e) => { anyhow::bail!("open pageserver config: {e}: {cfg_file_path}"); } }; - let mut effective_config = file_contents.unwrap_or_else(|| { - DEFAULT_CONFIG_FILE - .parse() - .expect("unit tests ensure this works") - }); - - // Patch with overrides from the command line - if let Some(values) = arg_matches.get_many::("config-override") { - for option_line in values { - let doc = toml_edit::Document::from_str(option_line).with_context(|| { - format!("Option '{option_line}' could not be parsed as a toml document") - })?; - - for (key, item) in doc.iter() { - effective_config.insert(key, item.clone()); - } - } - } - - debug!("Resulting toml: {effective_config}"); + debug!("Using pageserver toml: {config}"); // Construct the runtime representation - let conf = PageServerConf::parse_and_validate(&effective_config, workdir) + let conf = 
PageServerConf::parse_and_validate(identity.id, &config, workdir) .context("Failed to parse pageserver configuration")?; - if init { - info!("Writing pageserver config to '{cfg_file_path}'"); - - std::fs::write(cfg_file_path, effective_config.to_string()) - .with_context(|| format!("Failed to write pageserver config to '{cfg_file_path}'"))?; - info!("Config successfully written to '{cfg_file_path}'") - } - - Ok(if init { - ControlFlow::Break(()) - } else { - ControlFlow::Continue(Box::leak(Box::new(conf))) - }) + Ok(Box::leak(Box::new(conf))) } struct WaitForPhaseResult { @@ -302,6 +286,7 @@ fn start_pageserver( // Create and lock PID file. This ensures that there cannot be more than one // pageserver process running at the same time. let lock_file_path = conf.workdir.join(PID_FILE_NAME); + info!("Claiming pid file at {lock_file_path:?}..."); let lock_file = utils::pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?; info!("Claimed pid file at {lock_file_path:?}"); @@ -382,7 +367,7 @@ fn start_pageserver( let shutdown_pageserver = tokio_util::sync::CancellationToken::new(); // Set up remote storage client - let remote_storage = create_remote_storage_client(conf)?; + let remote_storage = BACKGROUND_RUNTIME.block_on(create_remote_storage_client(conf))?; // Set up deletion queue let (deletion_queue, deletion_workers) = DeletionQueue::new( @@ -421,14 +406,21 @@ fn start_pageserver( background_jobs_can_start: background_jobs_barrier.clone(), }; + info!(config=?conf.l0_flush, "using l0_flush config"); + let l0_flush_global_state = + pageserver::l0_flush::L0FlushGlobalState::new(conf.l0_flush.clone()); + // Scan the local 'tenants/' directory and start loading the tenants let deletion_queue_client = deletion_queue.new_client(); + let background_purges = mgr::BackgroundPurges::default(); let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr( conf, + background_purges.clone(), TenantSharedResources { broker_client: 
broker_client.clone(), remote_storage: remote_storage.clone(), deletion_queue_client, + l0_flush_global_state, }, order, shutdown_pageserver.clone(), @@ -515,7 +507,7 @@ fn start_pageserver( } }); - let secondary_controller = secondary::spawn_tasks( + let (secondary_controller, secondary_controller_tasks) = secondary::spawn_tasks( tenant_manager.clone(), remote_storage.clone(), background_jobs_barrier.clone(), @@ -528,18 +520,19 @@ fn start_pageserver( // been configured. let disk_usage_eviction_state: Arc = Arc::default(); - launch_disk_usage_global_eviction_task( + let disk_usage_eviction_task = launch_disk_usage_global_eviction_task( conf, remote_storage.clone(), disk_usage_eviction_state.clone(), tenant_manager.clone(), background_jobs_barrier.clone(), - )?; + ); // Start up the service to handle HTTP mgmt API request. We created the // listener earlier already. - { - let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); + let http_endpoint_listener = { + let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); // for hyper + let cancel = CancellationToken::new(); let router_state = Arc::new( http::routes::State::new( @@ -560,110 +553,49 @@ fn start_pageserver( let service = utils::http::RouterService::new(router).unwrap(); let server = hyper::Server::from_tcp(http_listener)? 
.serve(service) - .with_graceful_shutdown(task_mgr::shutdown_watcher()); + .with_graceful_shutdown({ + let cancel = cancel.clone(); + async move { cancel.clone().cancelled().await } + }); - task_mgr::spawn( - MGMT_REQUEST_RUNTIME.handle(), - TaskKind::HttpEndpointListener, - None, - None, + let task = MGMT_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( "http endpoint listener", - true, - async { - server.await?; - Ok(()) - }, - ); - } + server, + )); + HttpEndpointListener(CancellableTask { task, cancel }) + }; - if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint { - let metrics_ctx = RequestContext::todo_child( - TaskKind::MetricsCollection, - // This task itself shouldn't download anything. - // The actual size calculation does need downloads, and - // creates a child context with the right DownloadBehavior. - DownloadBehavior::Error, - ); + let consumption_metrics_tasks = { + let cancel = shutdown_pageserver.child_token(); + let task = crate::BACKGROUND_RUNTIME.spawn({ + let tenant_manager = tenant_manager.clone(); + let cancel = cancel.clone(); + async move { + // first wait until background jobs are cleared to launch. + // + // this is because we only process active tenants and timelines, and the + // Timeline::get_current_logical_size will spawn the logical size calculation, + // which will not be rate-limited. + tokio::select! { + _ = cancel.cancelled() => { return; }, + _ = background_jobs_barrier.wait() => {} + }; - let local_disk_storage = conf.workdir.join("last_consumption_metrics.json"); - - task_mgr::spawn( - crate::BACKGROUND_RUNTIME.handle(), - TaskKind::MetricsCollection, - None, - None, - "consumption metrics collection", - true, - { - let tenant_manager = tenant_manager.clone(); - async move { - // first wait until background jobs are cleared to launch. 
- // - // this is because we only process active tenants and timelines, and the - // Timeline::get_current_logical_size will spawn the logical size calculation, - // which will not be rate-limited. - let cancel = task_mgr::shutdown_token(); - - tokio::select! { - _ = cancel.cancelled() => { return Ok(()); }, - _ = background_jobs_barrier.wait() => {} - }; - - pageserver::consumption_metrics::collect_metrics( - tenant_manager, - metric_collection_endpoint, - &conf.metric_collection_bucket, - conf.metric_collection_interval, - conf.cached_metric_collection_interval, - conf.synthetic_size_calculation_interval, - conf.id, - local_disk_storage, - cancel, - metrics_ctx, - ) - .instrument(info_span!("metrics_collection")) - .await?; - Ok(()) - } - }, - ); - } + pageserver::consumption_metrics::run(conf, tenant_manager, cancel).await; + } + }); + ConsumptionMetricsTasks(CancellableTask { task, cancel }) + }; // Spawn a task to listen for libpq connections. It will spawn further tasks // for each connection. We created the listener earlier already. - { - let libpq_ctx = RequestContext::todo_child( - TaskKind::LibpqEndpointListener, - // listener task shouldn't need to download anything. (We will - // create a separate sub-contexts for each connection, with their - // own download behavior. This context is used only to listen and - // accept connections.) 
- DownloadBehavior::Error, - ); - task_mgr::spawn( - COMPUTE_REQUEST_RUNTIME.handle(), - TaskKind::LibpqEndpointListener, - None, - None, - "libpq endpoint listener", - true, - { - let tenant_manager = tenant_manager.clone(); - async move { - page_service::libpq_listener_main( - tenant_manager, - broker_client, - pg_auth, - pageserver_listener, - conf.pg_auth_type, - libpq_ctx, - task_mgr::shutdown_token(), - ) - .await - } - }, - ); - } + let page_service = page_service::spawn(conf, tenant_manager.clone(), pg_auth, { + let _entered = COMPUTE_REQUEST_RUNTIME.enter(); // TcpListener::from_std requires it + pageserver_listener + .set_nonblocking(true) + .context("set listener to nonblocking")?; + tokio::net::TcpListener::from_std(pageserver_listener).context("create tokio listener")? + }); let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard()); @@ -689,13 +621,24 @@ fn start_pageserver( // Right now that tree doesn't reach very far, and `task_mgr` is used instead. // The plan is to change that over time. shutdown_pageserver.take(); - pageserver::shutdown_pageserver(&tenant_manager, deletion_queue.clone(), 0).await; + pageserver::shutdown_pageserver( + http_endpoint_listener, + page_service, + consumption_metrics_tasks, + disk_usage_eviction_task, + &tenant_manager, + background_purges, + deletion_queue.clone(), + secondary_controller_tasks, + 0, + ) + .await; unreachable!() }) } } -fn create_remote_storage_client( +async fn create_remote_storage_client( conf: &'static PageServerConf, ) -> anyhow::Result { let config = if let Some(config) = &conf.remote_storage_config { @@ -705,7 +648,7 @@ fn create_remote_storage_client( }; // Create the client - let mut remote_storage = GenericRemoteStorage::from_config(config)?; + let mut remote_storage = GenericRemoteStorage::from_config(config).await?; // If `test_remote_failures` is non-zero, wrap the client with a // wrapper that simulates failures. 
@@ -728,28 +671,12 @@ fn cli() -> Command { Command::new("Neon page server") .about("Materializes WAL stream to pages and serves them to the postgres") .version(version()) - .arg( - Arg::new("init") - .long("init") - .action(ArgAction::SetTrue) - .help("Initialize pageserver with all given config overrides"), - ) .arg( Arg::new("workdir") .short('D') .long("workdir") .help("Working directory for the pageserver"), ) - // See `settings.md` for more details on the extra configuration patameters pageserver can process - .arg( - Arg::new("config-override") - .long("config-override") - .short('c') - .num_args(1) - .action(ArgAction::Append) - .help("Additional configuration overrides of the ones from the toml config file (or new ones to add there). \ - Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"), - ) .arg( Arg::new("enabled-features") .long("enabled-features") diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index b4a0d1ac02..0ebaf78840 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -5,14 +5,13 @@ //! See also `settings.md` for better description on every parameter. 
use anyhow::{anyhow, bail, ensure, Context, Result}; -use pageserver_api::shard::TenantShardId; +use pageserver_api::{models::ImageCompressionAlgorithm, shard::TenantShardId}; use remote_storage::{RemotePath, RemoteStorageConfig}; -use serde; use serde::de::IntoDeserializer; +use serde::{self, Deserialize}; use std::env; use storage_broker::Uri; use utils::crashsafe::path_with_suffix_extension; -use utils::id::ConnectionId; use utils::logging::SecretString; use once_cell::sync::OnceCell; @@ -30,18 +29,14 @@ use utils::{ logging::LogFormat, }; -use crate::tenant::timeline::GetVectoredImpl; +use crate::l0_flush::L0FlushConfig; +use crate::tenant::config::TenantConfOpt; +use crate::tenant::timeline::compaction::CompactL0Phase1ValueAccess; use crate::tenant::vectored_blob_io::MaxVectoredReadBytes; -use crate::tenant::{config::TenantConfOpt, timeline::GetImpl}; -use crate::tenant::{ - TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME, -}; +use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine}; use crate::{tenant::config::TenantConf, virtual_file}; -use crate::{ - IGNORED_TENANT_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME, - TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, -}; +use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX}; use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP; @@ -57,7 +52,7 @@ pub mod defaults { }; pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT; - pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s"; + pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s"; pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; pub const DEFAULT_SUPERUSER: &str = "cloud_admin"; @@ -73,7 +68,6 @@ pub mod defaults { super::ConfigurableSemaphore::DEFAULT_INITIAL.get(); pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min"; - pub const 
DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "0s"; pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option = None; pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min"; pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s"; @@ -89,13 +83,15 @@ pub mod defaults { #[cfg(not(target_os = "linux"))] pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs"; - pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential"; + pub const DEFAULT_GET_VECTORED_IMPL: &str = "vectored"; - pub const DEFAULT_GET_IMPL: &str = "legacy"; + pub const DEFAULT_GET_IMPL: &str = "vectored"; pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB - pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true; + pub const DEFAULT_IMAGE_COMPRESSION: &str = "zstd(1)"; + + pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false; pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; @@ -125,7 +121,6 @@ pub mod defaults { #concurrent_tenant_warmup = '{DEFAULT_CONCURRENT_TENANT_WARMUP}' #metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}' -#cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}' #synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}' #disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}} @@ -136,14 +131,8 @@ pub mod defaults { #virtual_file_io_engine = '{DEFAULT_VIRTUAL_FILE_IO_ENGINE}' -#get_vectored_impl = '{DEFAULT_GET_VECTORED_IMPL}' - -#get_impl = '{DEFAULT_GET_IMPL}' - #max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}' -#validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}' - [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -164,7 +153,7 @@ pub mod defaults { #ephemeral_bytes_per_memory_kb = {DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB} -[remote_storage] +#[remote_storage] "# ); @@ -240,7 +229,6 @@ pub struct PageServerConf { // How 
often to collect metrics and send them to the metrics endpoint. pub metric_collection_interval: Duration, // How often to send unchanged cached metrics to the metrics endpoint. - pub cached_metric_collection_interval: Duration, pub metric_collection_endpoint: Option, pub metric_collection_bucket: Option, pub synthetic_size_calculation_interval: Duration, @@ -282,13 +270,9 @@ pub struct PageServerConf { pub virtual_file_io_engine: virtual_file::IoEngineKind, - pub get_vectored_impl: GetVectoredImpl, - - pub get_impl: GetImpl, - pub max_vectored_read_bytes: MaxVectoredReadBytes, - pub validate_vectored_get: bool, + pub image_compression: ImageCompressionAlgorithm, /// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. When this /// is exceeded, we start proactively closing ephemeral layers to limit the total amount @@ -296,6 +280,15 @@ pub struct PageServerConf { /// /// Setting this to zero disables limits on total ephemeral layer size. pub ephemeral_bytes_per_memory_kb: usize, + + pub l0_flush: L0FlushConfig, + + /// This flag is temporary and will be removed after gradual rollout. + /// See . 
+ pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess, + + /// Direct IO settings + pub virtual_file_direct_io: virtual_file::DirectIoMode, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -357,8 +350,6 @@ struct PageServerConfigBuilder { auth_validation_public_key_path: BuilderValue>, remote_storage_config: BuilderValue>, - id: BuilderValue, - broker_endpoint: BuilderValue, broker_keepalive_interval: BuilderValue, @@ -368,7 +359,6 @@ struct PageServerConfigBuilder { concurrent_tenant_size_logical_size_queries: BuilderValue, metric_collection_interval: BuilderValue, - cached_metric_collection_interval: BuilderValue, metric_collection_endpoint: BuilderValue>, synthetic_size_calculation_interval: BuilderValue, metric_collection_bucket: BuilderValue>, @@ -392,18 +382,24 @@ struct PageServerConfigBuilder { virtual_file_io_engine: BuilderValue, - get_vectored_impl: BuilderValue, - - get_impl: BuilderValue, - max_vectored_read_bytes: BuilderValue, - validate_vectored_get: BuilderValue, + image_compression: BuilderValue, ephemeral_bytes_per_memory_kb: BuilderValue, + + l0_flush: BuilderValue, + + compact_level0_phase1_value_access: BuilderValue, + + virtual_file_direct_io: BuilderValue, } impl PageServerConfigBuilder { + fn new() -> Self { + Self::default() + } + #[inline(always)] fn default_values() -> Self { use self::BuilderValue::*; @@ -429,7 +425,6 @@ impl PageServerConfigBuilder { pg_auth_type: Set(AuthType::Trust), auth_validation_public_key_path: Set(None), remote_storage_config: Set(None), - id: NotSet, broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT .parse() .expect("failed to parse default broker endpoint")), @@ -448,10 +443,6 @@ impl PageServerConfigBuilder { DEFAULT_METRIC_COLLECTION_INTERVAL, ) .expect("cannot parse default metric collection interval")), - cached_metric_collection_interval: Set(humantime::parse_duration( - DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL, - ) - .expect("cannot parse 
default cached_metric_collection_interval")), synthetic_size_calculation_interval: Set(humantime::parse_duration( DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL, ) @@ -482,13 +473,14 @@ impl PageServerConfigBuilder { virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()), - get_vectored_impl: Set(DEFAULT_GET_VECTORED_IMPL.parse().unwrap()), - get_impl: Set(DEFAULT_GET_IMPL.parse().unwrap()), max_vectored_read_bytes: Set(MaxVectoredReadBytes( NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), )), - validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET), + image_compression: Set(DEFAULT_IMAGE_COMPRESSION.parse().unwrap()), ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), + l0_flush: Set(L0FlushConfig::default()), + compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()), + virtual_file_direct_io: Set(virtual_file::DirectIoMode::default()), } } } @@ -561,10 +553,6 @@ impl PageServerConfigBuilder { self.broker_keepalive_interval = BuilderValue::Set(broker_keepalive_interval) } - pub fn id(&mut self, node_id: NodeId) { - self.id = BuilderValue::Set(node_id) - } - pub fn log_format(&mut self, log_format: LogFormat) { self.log_format = BuilderValue::Set(log_format) } @@ -581,14 +569,6 @@ impl PageServerConfigBuilder { self.metric_collection_interval = BuilderValue::Set(metric_collection_interval) } - pub fn cached_metric_collection_interval( - &mut self, - cached_metric_collection_interval: Duration, - ) { - self.cached_metric_collection_interval = - BuilderValue::Set(cached_metric_collection_interval) - } - pub fn metric_collection_endpoint(&mut self, metric_collection_endpoint: Option) { self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint) } @@ -656,27 +636,31 @@ impl PageServerConfigBuilder { self.virtual_file_io_engine = BuilderValue::Set(value); } - pub fn get_vectored_impl(&mut self, value: GetVectoredImpl) { - self.get_vectored_impl = 
BuilderValue::Set(value); - } - - pub fn get_impl(&mut self, value: GetImpl) { - self.get_impl = BuilderValue::Set(value); - } - pub fn get_max_vectored_read_bytes(&mut self, value: MaxVectoredReadBytes) { self.max_vectored_read_bytes = BuilderValue::Set(value); } - pub fn get_validate_vectored_get(&mut self, value: bool) { - self.validate_vectored_get = BuilderValue::Set(value); + pub fn get_image_compression(&mut self, value: ImageCompressionAlgorithm) { + self.image_compression = BuilderValue::Set(value); } pub fn get_ephemeral_bytes_per_memory_kb(&mut self, value: usize) { self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value); } - pub fn build(self) -> anyhow::Result { + pub fn l0_flush(&mut self, value: L0FlushConfig) { + self.l0_flush = BuilderValue::Set(value); + } + + pub fn compact_level0_phase1_value_access(&mut self, value: CompactL0Phase1ValueAccess) { + self.compact_level0_phase1_value_access = BuilderValue::Set(value); + } + + pub fn virtual_file_direct_io(&mut self, value: virtual_file::DirectIoMode) { + self.virtual_file_direct_io = BuilderValue::Set(value); + } + + pub fn build(self, id: NodeId) -> anyhow::Result { let default = Self::default_values(); macro_rules! 
conf { @@ -709,12 +693,10 @@ impl PageServerConfigBuilder { pg_auth_type, auth_validation_public_key_path, remote_storage_config, - id, broker_endpoint, broker_keepalive_interval, log_format, metric_collection_interval, - cached_metric_collection_interval, metric_collection_endpoint, metric_collection_bucket, synthetic_size_calculation_interval, @@ -728,14 +710,16 @@ impl PageServerConfigBuilder { heatmap_upload_concurrency, secondary_download_concurrency, ingest_batch_size, - get_vectored_impl, - get_impl, max_vectored_read_bytes, - validate_vectored_get, + image_compression, ephemeral_bytes_per_memory_kb, + l0_flush, + compact_level0_phase1_value_access, + virtual_file_direct_io, } CUSTOM LOGIC { + id: id, // TenantConf is handled separately default_tenant_conf: TenantConf::default(), concurrent_tenant_warmup: ConfigurableSemaphore::new({ @@ -811,21 +795,12 @@ impl PageServerConf { self.tenants_path().join(tenant_shard_id.to_string()) } - pub fn tenant_ignore_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { - self.tenant_path(tenant_shard_id) - .join(IGNORED_TENANT_FILE_NAME) - } - /// Points to a place in pageserver's local directory, - /// where certain tenant's tenantconf file should be located. - /// - /// Legacy: superseded by tenant_location_config_path. Eventually - /// remove this function. - pub fn tenant_config_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { - self.tenant_path(tenant_shard_id).join(TENANT_CONFIG_NAME) - } - - pub fn tenant_location_config_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { + /// where certain tenant's LocationConf be stored. 
+ pub(crate) fn tenant_location_config_path( + &self, + tenant_shard_id: &TenantShardId, + ) -> Utf8PathBuf { self.tenant_path(tenant_shard_id) .join(TENANT_LOCATION_CONFIG_NAME) } @@ -860,30 +835,6 @@ impl PageServerConf { ) } - pub(crate) fn tenant_deleted_mark_file_path( - &self, - tenant_shard_id: &TenantShardId, - ) -> Utf8PathBuf { - self.tenant_path(tenant_shard_id) - .join(TENANT_DELETED_MARKER_FILE_NAME) - } - - pub fn traces_path(&self) -> Utf8PathBuf { - self.workdir.join("traces") - } - - pub fn trace_path( - &self, - tenant_shard_id: &TenantShardId, - timeline_id: &TimelineId, - connection_id: &ConnectionId, - ) -> Utf8PathBuf { - self.traces_path() - .join(tenant_shard_id.to_string()) - .join(timeline_id.to_string()) - .join(connection_id.to_string()) - } - /// Turns storage remote path of a file into its local path. pub fn local_path(&self, remote_path: &RemotePath) -> Utf8PathBuf { remote_path.with_base(&self.workdir) @@ -913,8 +864,12 @@ impl PageServerConf { /// validating the input and failing on errors. /// /// This leaves any options not present in the file in the built-in defaults. - pub fn parse_and_validate(toml: &Document, workdir: &Utf8Path) -> anyhow::Result { - let mut builder = PageServerConfigBuilder::default(); + pub fn parse_and_validate( + node_id: NodeId, + toml: &Document, + workdir: &Utf8Path, + ) -> anyhow::Result { + let mut builder = PageServerConfigBuilder::new(); builder.workdir(workdir.to_owned()); let mut t_conf = TenantConfOpt::default(); @@ -940,12 +895,11 @@ impl PageServerConf { "http_auth_type" => builder.http_auth_type(parse_toml_from_str(key, item)?), "pg_auth_type" => builder.pg_auth_type(parse_toml_from_str(key, item)?), "remote_storage" => { - builder.remote_storage_config(RemoteStorageConfig::from_toml(item)?) 
+ builder.remote_storage_config(Some(RemoteStorageConfig::from_toml(item).context("remote_storage")?)) } "tenant_config" => { t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?; } - "id" => builder.id(NodeId(parse_toml_u64(key, item)?)), "broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?), "broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?), "log_format" => builder.log_format( @@ -962,13 +916,12 @@ impl PageServerConf { NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")? }), "metric_collection_interval" => builder.metric_collection_interval(parse_toml_duration(key, item)?), - "cached_metric_collection_interval" => builder.cached_metric_collection_interval(parse_toml_duration(key, item)?), "metric_collection_endpoint" => { let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?; builder.metric_collection_endpoint(Some(endpoint)); }, "metric_collection_bucket" => { - builder.metric_collection_bucket(RemoteStorageConfig::from_toml(item)?) + builder.metric_collection_bucket(Some(RemoteStorageConfig::from_toml(item)?)) } "synthetic_size_calculation_interval" => builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?), @@ -1011,29 +964,32 @@ impl PageServerConf { "virtual_file_io_engine" => { builder.virtual_file_io_engine(parse_toml_from_str("virtual_file_io_engine", item)?) } - "get_vectored_impl" => { - builder.get_vectored_impl(parse_toml_from_str("get_vectored_impl", item)?) - } - "get_impl" => { - builder.get_impl(parse_toml_from_str("get_impl", item)?) - } "max_vectored_read_bytes" => { let bytes = parse_toml_u64("max_vectored_read_bytes", item)? 
as usize; builder.get_max_vectored_read_bytes( MaxVectoredReadBytes( NonZeroUsize::new(bytes).expect("Max byte size of vectored read must be greater than 0"))) } - "validate_vectored_get" => { - builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?) + "image_compression" => { + builder.get_image_compression(parse_toml_from_str("image_compression", item)?) } "ephemeral_bytes_per_memory_kb" => { builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize) } + "l0_flush" => { + builder.l0_flush(utils::toml_edit_ext::deserialize_item(item).context("l0_flush")?) + } + "compact_level0_phase1_value_access" => { + builder.compact_level0_phase1_value_access(utils::toml_edit_ext::deserialize_item(item).context("compact_level0_phase1_value_access")?) + } + "virtual_file_direct_io" => { + builder.virtual_file_direct_io(utils::toml_edit_ext::deserialize_item(item).context("virtual_file_direct_io")?) + } _ => bail!("unrecognized pageserver option '{key}'"), } } - let mut conf = builder.build().context("invalid config")?; + let mut conf = builder.build(node_id).context("invalid config")?; if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT { let auth_validation_public_key_path = conf @@ -1089,7 +1045,6 @@ impl PageServerConf { eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default( ), metric_collection_interval: Duration::from_secs(60), - cached_metric_collection_interval: Duration::from_secs(60 * 60), metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, metric_collection_bucket: None, synthetic_size_calculation_interval: Duration::from_secs(60), @@ -1104,18 +1059,25 @@ impl PageServerConf { secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), - get_vectored_impl: 
defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), - get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(), max_vectored_read_bytes: MaxVectoredReadBytes( NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) .expect("Invalid default constant"), ), - validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, + image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(), ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + l0_flush: L0FlushConfig::default(), + compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), + virtual_file_direct_io: virtual_file::DirectIoMode::default(), } } } +#[derive(Deserialize)] +#[serde(deny_unknown_fields)] +pub struct PageserverIdentity { + pub id: NodeId, +} + // Helper functions to parse a toml Item fn parse_toml_string(name: &str, item: &Item) -> Result { @@ -1263,10 +1225,8 @@ max_file_descriptors = 333 # initial superuser role name to use when creating a new tenant initial_superuser_name = 'zzzz' -id = 10 metric_collection_interval = '222 s' -cached_metric_collection_interval = '22200 s' metric_collection_endpoint = 'http://localhost:80/metrics' synthetic_size_calculation_interval = '333 s' @@ -1281,12 +1241,11 @@ background_task_maximum_delay = '334 s' let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; let broker_endpoint = storage_broker::DEFAULT_ENDPOINT; // we have to create dummy values to overcome the validation errors - let config_string = format!( - "pg_distrib_dir='{pg_distrib_dir}'\nid=10\nbroker_endpoint = '{broker_endpoint}'", - ); + let config_string = + format!("pg_distrib_dir='{pg_distrib_dir}'\nbroker_endpoint = '{broker_endpoint}'",); let toml = config_string.parse()?; - let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir) + let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}")); assert_eq!( @@ -1322,9 
+1281,6 @@ background_task_maximum_delay = '334 s' metric_collection_interval: humantime::parse_duration( defaults::DEFAULT_METRIC_COLLECTION_INTERVAL )?, - cached_metric_collection_interval: humantime::parse_duration( - defaults::DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL - )?, metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, metric_collection_bucket: None, synthetic_size_calculation_interval: humantime::parse_duration( @@ -1343,14 +1299,15 @@ background_task_maximum_delay = '334 s' secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), - get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), - get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(), max_vectored_read_bytes: MaxVectoredReadBytes( NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) .expect("Invalid default constant") ), - validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, + image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(), ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + l0_flush: L0FlushConfig::default(), + compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), + virtual_file_direct_io: virtual_file::DirectIoMode::default(), }, "Correct defaults should be used when no config values are provided" ); @@ -1369,7 +1326,7 @@ background_task_maximum_delay = '334 s' ); let toml = config_string.parse()?; - let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir) + let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}")); assert_eq!( @@ -1401,7 +1358,6 @@ background_task_maximum_delay = '334 s' eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default(), 
metric_collection_interval: Duration::from_secs(222), - cached_metric_collection_interval: Duration::from_secs(22200), metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?), metric_collection_bucket: None, synthetic_size_calculation_interval: Duration::from_secs(333), @@ -1416,14 +1372,15 @@ background_task_maximum_delay = '334 s' secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, ingest_batch_size: 100, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), - get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), - get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(), max_vectored_read_bytes: MaxVectoredReadBytes( NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) .expect("Invalid default constant") ), - validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, + image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(), ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + l0_flush: L0FlushConfig::default(), + compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), + virtual_file_direct_io: virtual_file::DirectIoMode::default(), }, "Should be able to parse all basic config values correctly" ); @@ -1458,17 +1415,18 @@ broker_endpoint = '{broker_endpoint}' let toml = config_string.parse()?; - let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir) - .unwrap_or_else(|e| { - panic!("Failed to parse config '{config_string}', reason: {e:?}") - }) - .remote_storage_config - .expect("Should have remote storage config for the local FS"); + let parsed_remote_storage_config = + PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) + .unwrap_or_else(|e| { + panic!("Failed to parse config '{config_string}', reason: {e:?}") + }) + .remote_storage_config + .expect("Should have remote storage config for the local FS"); assert_eq!( parsed_remote_storage_config, 
RemoteStorageConfig { - storage: RemoteStorageKind::LocalFs(local_storage_path.clone()), + storage: RemoteStorageKind::LocalFs { local_path: local_storage_path.clone() }, timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }, "Remote storage config should correctly parse the local FS config and fill other storage defaults" @@ -1519,12 +1477,13 @@ broker_endpoint = '{broker_endpoint}' let toml = config_string.parse()?; - let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir) - .unwrap_or_else(|e| { - panic!("Failed to parse config '{config_string}', reason: {e:?}") - }) - .remote_storage_config - .expect("Should have remote storage config for S3"); + let parsed_remote_storage_config = + PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) + .unwrap_or_else(|e| { + panic!("Failed to parse config '{config_string}', reason: {e:?}") + }) + .remote_storage_config + .expect("Should have remote storage config for S3"); assert_eq!( parsed_remote_storage_config, @@ -1546,34 +1505,6 @@ broker_endpoint = '{broker_endpoint}' Ok(()) } - #[test] - fn parse_tenant_config() -> anyhow::Result<()> { - let tempdir = tempdir()?; - let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; - - let broker_endpoint = "http://127.0.0.1:7777"; - let trace_read_requests = true; - - let config_string = format!( - r#"{ALL_BASE_VALUES_TOML} -pg_distrib_dir='{pg_distrib_dir}' -broker_endpoint = '{broker_endpoint}' - -[tenant_config] -trace_read_requests = {trace_read_requests}"#, - ); - - let toml = config_string.parse()?; - - let conf = PageServerConf::parse_and_validate(&toml, &workdir)?; - assert_eq!( - conf.default_tenant_conf.trace_read_requests, trace_read_requests, - "Tenant config from pageserver config file should be parsed and udpated values used as defaults for all tenants", - ); - - Ok(()) - } - #[test] fn parse_incorrect_tenant_config() -> anyhow::Result<()> { let config_string = r#" @@ -1614,7 +1545,6 @@ trace_read_requests = 
{trace_read_requests}"#, r#"pg_distrib_dir = "{pg_distrib_dir}" metric_collection_endpoint = "http://sample.url" metric_collection_interval = "10min" -id = 222 [disk_usage_based_eviction] max_usage_pct = 80 @@ -1631,7 +1561,7 @@ threshold = "20m" "#, ); let toml: Document = pageserver_conf_toml.parse()?; - let conf = PageServerConf::parse_and_validate(&toml, &workdir)?; + let conf = PageServerConf::parse_and_validate(NodeId(333), &toml, &workdir)?; assert_eq!(conf.pg_distrib_dir, pg_distrib_dir); assert_eq!( @@ -1647,7 +1577,11 @@ threshold = "20m" .evictions_low_residence_duration_metric_threshold, Duration::from_secs(20 * 60) ); - assert_eq!(conf.id, NodeId(222)); + + // Assert that the node id provided by the identity file (threaded + // through the call to [`PageServerConf::parse_and_validate`]) is + // used. + assert_eq!(conf.id, NodeId(333)); assert_eq!( conf.disk_usage_based_eviction, Some(DiskUsageEvictionTaskConfig { @@ -1656,7 +1590,7 @@ threshold = "20m" period: Duration::from_secs(10), #[cfg(feature = "testing")] mock_statvfs: None, - eviction_order: crate::disk_usage_eviction_task::EvictionOrder::AbsoluteAccessed, + eviction_order: Default::default(), }) ); @@ -1680,7 +1614,6 @@ threshold = "20m" r#"pg_distrib_dir = "{pg_distrib_dir}" metric_collection_endpoint = "http://sample.url" metric_collection_interval = "10min" -id = 222 [tenant_config] evictions_low_residence_duration_metric_threshold = "20m" @@ -1692,7 +1625,7 @@ threshold = "20m" "#, ); let toml: Document = pageserver_conf_toml.parse().unwrap(); - let conf = PageServerConf::parse_and_validate(&toml, &workdir).unwrap(); + let conf = PageServerConf::parse_and_validate(NodeId(222), &toml, &workdir).unwrap(); match &conf.default_tenant_conf.eviction_policy { EvictionPolicy::OnlyImitiate(t) => { @@ -1703,6 +1636,19 @@ threshold = "20m" } } + #[test] + fn empty_remote_storage_is_error() { + let tempdir = tempdir().unwrap(); + let (workdir, _) = prepare_fs(&tempdir).unwrap(); + let input = r#" 
+remote_storage = {} + "#; + let doc = toml_edit::Document::from_str(input).unwrap(); + let err = PageServerConf::parse_and_validate(NodeId(222), &doc, &workdir) + .expect_err("empty remote_storage field should fail, don't specify it if you want no remote_storage"); + assert!(format!("{err}").contains("remote_storage"), "{err}"); + } + fn prepare_fs(tempdir: &Utf8TempDir) -> anyhow::Result<(Utf8PathBuf, Utf8PathBuf)> { let tempdir_path = tempdir.path(); diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 18c1a6cd9b..f94d945d46 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -1,5 +1,6 @@ //! Periodically collect consumption metrics for all active tenants //! and push them to a HTTP endpoint. +use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::size::CalculateSyntheticSizeError; @@ -39,56 +40,74 @@ type RawMetric = (MetricsKey, (EventType, u64)); /// for deduplication, but that is no longer needed. type Cache = HashMap; +pub async fn run( + conf: &'static PageServerConf, + tenant_manager: Arc, + cancel: CancellationToken, +) { + let Some(metric_collection_endpoint) = conf.metric_collection_endpoint.as_ref() else { + return; + }; + + let local_disk_storage = conf.workdir.join("last_consumption_metrics.json"); + + let metrics_ctx = RequestContext::todo_child( + TaskKind::MetricsCollection, + // This task itself shouldn't download anything. + // The actual size calculation does need downloads, and + // creates a child context with the right DownloadBehavior. 
+ DownloadBehavior::Error, + ); + let collect_metrics = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( + "consumption metrics collection", + collect_metrics( + tenant_manager.clone(), + metric_collection_endpoint, + &conf.metric_collection_bucket, + conf.metric_collection_interval, + conf.id, + local_disk_storage, + cancel.clone(), + metrics_ctx, + ) + .instrument(info_span!("metrics_collection")), + )); + + let worker_ctx = + RequestContext::todo_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download); + let synthetic_size_worker = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( + "synthetic size calculation", + calculate_synthetic_size_worker( + tenant_manager.clone(), + conf.synthetic_size_calculation_interval, + cancel.clone(), + worker_ctx, + ) + .instrument(info_span!("synthetic_size_worker")), + )); + + let (collect_metrics, synthetic_size_worker) = + futures::future::join(collect_metrics, synthetic_size_worker).await; + collect_metrics + .expect("unreachable: exit_on_panic_or_error would catch the panic and exit the process"); + synthetic_size_worker + .expect("unreachable: exit_on_panic_or_error would catch the panic and exit the process"); +} + /// Main thread that serves metrics collection #[allow(clippy::too_many_arguments)] -pub async fn collect_metrics( +async fn collect_metrics( tenant_manager: Arc, metric_collection_endpoint: &Url, metric_collection_bucket: &Option, metric_collection_interval: Duration, - _cached_metric_collection_interval: Duration, - synthetic_size_calculation_interval: Duration, node_id: NodeId, local_disk_storage: Utf8PathBuf, cancel: CancellationToken, ctx: RequestContext, ) -> anyhow::Result<()> { - if _cached_metric_collection_interval != Duration::ZERO { - tracing::warn!( - "cached_metric_collection_interval is no longer used, please set it to zero." 
- ) - } - - // spin up background worker that caclulates tenant sizes - let worker_ctx = - ctx.detached_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download); - task_mgr::spawn( - BACKGROUND_RUNTIME.handle(), - TaskKind::CalculateSyntheticSize, - None, - None, - "synthetic size calculation", - false, - { - let tenant_manager = tenant_manager.clone(); - async move { - calculate_synthetic_size_worker( - tenant_manager, - synthetic_size_calculation_interval, - &cancel, - &worker_ctx, - ) - .instrument(info_span!("synthetic_size_worker")) - .await?; - Ok(()) - } - }, - ); - let path: Arc = Arc::new(local_disk_storage); - let cancel = task_mgr::shutdown_token(); - let restore_and_reschedule = restore_and_reschedule(&path, metric_collection_interval); let mut cached_metrics = tokio::select! { @@ -103,7 +122,7 @@ pub async fn collect_metrics( .expect("Failed to create http client with timeout"); let bucket_client = if let Some(bucket_config) = metric_collection_bucket { - match GenericRemoteStorage::from_config(bucket_config) { + match GenericRemoteStorage::from_config(bucket_config).await { Ok(client) => Some(client), Err(e) => { // Non-fatal error: if we were given an invalid config, we will proceed @@ -175,11 +194,9 @@ pub async fn collect_metrics( BackgroundLoopKind::ConsumptionMetricsCollectMetrics, ); - let res = tokio::time::timeout_at( - started_at + metric_collection_interval, - task_mgr::shutdown_token().cancelled(), - ) - .await; + let res = + tokio::time::timeout_at(started_at + metric_collection_interval, cancel.cancelled()) + .await; if res.is_ok() { return Ok(()); } @@ -279,8 +296,8 @@ async fn reschedule( async fn calculate_synthetic_size_worker( tenant_manager: Arc, synthetic_size_calculation_interval: Duration, - cancel: &CancellationToken, - ctx: &RequestContext, + cancel: CancellationToken, + ctx: RequestContext, ) -> anyhow::Result<()> { info!("starting calculate_synthetic_size_worker"); scopeguard::defer! 
{ @@ -320,7 +337,7 @@ async fn calculate_synthetic_size_worker( // there is never any reason to exit calculate_synthetic_size_worker following any // return value -- we don't need to care about shutdown because no tenant is found when // pageserver is shut down. - calculate_and_log(&tenant, cancel, ctx).await; + calculate_and_log(&tenant, &cancel, &ctx).await; } crate::tenant::tasks::warn_when_period_overrun( diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index 86d0390c30..0b07e07524 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -59,6 +59,7 @@ //! 1. It should be easy to forward the context to callees. //! 2. To propagate more data from high-level to low-level code, the functions in //! the middle should not need to be modified. +//! //! The solution is to have a container structure ([`RequestContext`]) that //! carries the information. Functions that don't care about what's in it //! pass it along to callees. diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index 26e7cc7ef8..b5d9267d79 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -171,14 +171,14 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { register, }; - fail::fail_point!("control-plane-client-re-attach"); - let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?; tracing::info!( "Received re-attach response with {} tenants", response.tenants.len() ); + failpoint_support::sleep_millis_async!("control-plane-client-re-attach"); + Ok(response .tenants .into_iter() diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 3960fc1b99..22f7d5b824 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -382,17 +382,6 @@ pub enum DeletionQueueError { } impl DeletionQueueClient { - pub(crate) fn broken() -> Self { - // Channels whose receivers are immediately dropped. 
- let (tx, _rx) = tokio::sync::mpsc::unbounded_channel(); - let (executor_tx, _executor_rx) = tokio::sync::mpsc::channel(1); - Self { - tx, - executor_tx, - lsn_table: Arc::default(), - } - } - /// This is cancel-safe. If you drop the future before it completes, the message /// is not pushed, although in the context of the deletion queue it doesn't matter: once /// we decide to do a deletion the decision is always final. @@ -839,9 +828,9 @@ mod test { } } - fn setup(test_name: &str) -> anyhow::Result { + async fn setup(test_name: &str) -> anyhow::Result { let test_name = Box::leak(Box::new(format!("deletion_queue__{test_name}"))); - let harness = TenantHarness::create(test_name)?; + let harness = TenantHarness::create(test_name).await?; // We do not load() the harness: we only need its config and remote_storage @@ -850,10 +839,14 @@ mod test { std::fs::create_dir_all(remote_fs_dir)?; let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?; let storage_config = RemoteStorageConfig { - storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), + storage: RemoteStorageKind::LocalFs { + local_path: remote_fs_dir.clone(), + }, timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }; - let storage = GenericRemoteStorage::from_config(&storage_config).unwrap(); + let storage = GenericRemoteStorage::from_config(&storage_config) + .await + .unwrap(); let mock_control_plane = MockControlPlane::new(); @@ -931,7 +924,9 @@ mod test { #[tokio::test] async fn deletion_queue_smoke() -> anyhow::Result<()> { // Basic test that the deletion queue processes the deletions we pass into it - let ctx = setup("deletion_queue_smoke").expect("Failed test setup"); + let ctx = setup("deletion_queue_smoke") + .await + .expect("Failed test setup"); let client = ctx.deletion_queue.new_client(); client.recover(HashMap::new())?; @@ -1001,7 +996,9 @@ mod test { #[tokio::test] async fn deletion_queue_validation() -> anyhow::Result<()> { - let ctx = 
setup("deletion_queue_validation").expect("Failed test setup"); + let ctx = setup("deletion_queue_validation") + .await + .expect("Failed test setup"); let client = ctx.deletion_queue.new_client(); client.recover(HashMap::new())?; @@ -1060,7 +1057,9 @@ mod test { #[tokio::test] async fn deletion_queue_recovery() -> anyhow::Result<()> { // Basic test that the deletion queue processes the deletions we pass into it - let mut ctx = setup("deletion_queue_recovery").expect("Failed test setup"); + let mut ctx = setup("deletion_queue_recovery") + .await + .expect("Failed test setup"); let client = ctx.deletion_queue.new_client(); client.recover(HashMap::new())?; diff --git a/pageserver/src/deletion_queue/validator.rs b/pageserver/src/deletion_queue/validator.rs index bf06c78e67..d215fd2b7d 100644 --- a/pageserver/src/deletion_queue/validator.rs +++ b/pageserver/src/deletion_queue/validator.rs @@ -190,7 +190,7 @@ where } } else { // If we failed validation, then do not apply any of the projected updates - warn!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation); + info!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation); metrics::DELETION_QUEUE.dropped_lsn_updates.inc(); } } @@ -225,7 +225,7 @@ where && (tenant.generation == *validated_generation); if !this_list_valid { - warn!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation); + info!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation); metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64); mutated = true; } else { diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 90bd4294bb..5e4a49bc56 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -59,13 
+59,14 @@ use utils::{completion, id::TimelineId}; use crate::{ config::PageServerConf, metrics::disk_usage_based_eviction::METRICS, - task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, + task_mgr::{self, BACKGROUND_RUNTIME}, tenant::{ mgr::TenantManager, remote_timeline_client::LayerFileMetadata, secondary::SecondaryTenant, - storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName}, + storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName, LayerVisibilityHint}, }, + CancellableTask, DiskUsageEvictionTask, }; #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -83,17 +84,9 @@ pub struct DiskUsageEvictionTaskConfig { /// Selects the sort order for eviction candidates *after* per tenant `min_resident_size` /// partitioning. -#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] #[serde(tag = "type", content = "args")] pub enum EvictionOrder { - /// Order the layers to be evicted by how recently they have been accessed in absolute - /// time. - /// - /// This strategy is unfair when some tenants grow faster than others towards the slower - /// growing. - #[default] - AbsoluteAccessed, - /// Order the layers to be evicted by how recently they have been accessed relatively within /// the set of resident layers of a tenant. 
RelativeAccessed { @@ -108,20 +101,23 @@ pub enum EvictionOrder { }, } +impl Default for EvictionOrder { + fn default() -> Self { + Self::RelativeAccessed { + highest_layer_count_loses_first: true, + } + } +} + fn default_highest_layer_count_loses_first() -> bool { true } impl EvictionOrder { - fn sort(&self, candidates: &mut [(MinResidentSizePartition, EvictionCandidate)]) { + fn sort(&self, candidates: &mut [(EvictionPartition, EvictionCandidate)]) { use EvictionOrder::*; match self { - AbsoluteAccessed => { - candidates.sort_unstable_by_key(|(partition, candidate)| { - (*partition, candidate.last_activity_ts) - }); - } RelativeAccessed { .. } => candidates.sort_unstable_by_key(|(partition, candidate)| { (*partition, candidate.relative_last_activity) }), @@ -134,7 +130,6 @@ impl EvictionOrder { use EvictionOrder::*; match self { - AbsoluteAccessed => finite_f32::FiniteF32::ZERO, RelativeAccessed { highest_layer_count_loses_first, } => { @@ -192,36 +187,34 @@ pub fn launch_disk_usage_global_eviction_task( state: Arc, tenant_manager: Arc, background_jobs_barrier: completion::Barrier, -) -> anyhow::Result<()> { +) -> Option { let Some(task_config) = &conf.disk_usage_based_eviction else { info!("disk usage based eviction task not configured"); - return Ok(()); + return None; }; info!("launching disk usage based eviction task"); - task_mgr::spawn( - BACKGROUND_RUNTIME.handle(), - TaskKind::DiskUsageEviction, - None, - None, + let cancel = CancellationToken::new(); + let task = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( "disk usage based eviction", - false, - async move { - let cancel = task_mgr::shutdown_token(); + { + let cancel = cancel.clone(); + async move { + // wait until initial load is complete, because we cannot evict from loading tenants. + tokio::select! 
{ + _ = cancel.cancelled() => { return anyhow::Ok(()); }, + _ = background_jobs_barrier.wait() => { } + }; - // wait until initial load is complete, because we cannot evict from loading tenants. - tokio::select! { - _ = cancel.cancelled() => { return Ok(()); }, - _ = background_jobs_barrier.wait() => { } - }; - - disk_usage_eviction_task(&state, task_config, &storage, tenant_manager, cancel).await; - Ok(()) + disk_usage_eviction_task(&state, task_config, &storage, tenant_manager, cancel) + .await; + anyhow::Ok(()) + } }, - ); + )); - Ok(()) + Some(DiskUsageEvictionTask(CancellableTask { cancel, task })) } #[instrument(skip_all)] @@ -651,6 +644,7 @@ pub(crate) struct EvictionCandidate { pub(crate) layer: EvictionLayer, pub(crate) last_activity_ts: SystemTime, pub(crate) relative_last_activity: finite_f32::FiniteF32, + pub(crate) visibility: LayerVisibilityHint, } impl std::fmt::Display for EvictionLayer { @@ -692,14 +686,22 @@ impl std::fmt::Debug for EvictionCandidate { } #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] -enum MinResidentSizePartition { +enum EvictionPartition { + // A layer that is un-wanted by the tenant: evict all these first, before considering + // any other layers + EvictNow, + + // Above the minimum size threshold: this layer is a candidate for eviction. Above, + + // Below the minimum size threshold: this layer should only be evicted if all the + // tenants' layers above the minimum size threshold have already been considered. Below, } enum EvictionCandidates { Cancelled, - Finished(Vec<(MinResidentSizePartition, EvictionCandidate)>), + Finished(Vec<(EvictionPartition, EvictionCandidate)>), } /// Gather the eviction candidates. @@ -897,8 +899,10 @@ async fn collect_eviction_candidates( max_layer_size }; - // Sort layers most-recently-used first, then partition by - // cumsum above/below min_resident_size. 
+ // Sort layers most-recently-used first, then calculate [`EvictionPartition`] for each layer, + // where the inputs are: + // - whether the layer is visible + // - whether the layer is above/below the min_resident_size cutline tenant_candidates .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts)); let mut cumsum: i128 = 0; @@ -915,12 +919,23 @@ async fn collect_eviction_candidates( candidate.relative_last_activity = eviction_order.relative_last_activity(total, i); - let partition = if cumsum > min_resident_size as i128 { - MinResidentSizePartition::Above - } else { - MinResidentSizePartition::Below + let partition = match candidate.visibility { + LayerVisibilityHint::Covered => { + // Covered layers are evicted first + EvictionPartition::EvictNow + } + LayerVisibilityHint::Visible => { + cumsum += i128::from(candidate.layer.get_file_size()); + + if cumsum > min_resident_size as i128 { + EvictionPartition::Above + } else { + // The most recent layers below the min_resident_size threshold + // are the last to be evicted. + EvictionPartition::Below + } + } }; - cumsum += i128::from(candidate.layer.get_file_size()); (partition, candidate) }); @@ -988,7 +1003,7 @@ async fn collect_eviction_candidates( // Secondary locations' layers are always considered above the min resident size, // i.e. secondary locations are permitted to be trimmed to zero layers if all // the layers have sufficiently old access times. 
- MinResidentSizePartition::Above, + EvictionPartition::Above, candidate, ) }); @@ -1016,7 +1031,9 @@ async fn collect_eviction_candidates( } } - debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below, + debug_assert!(EvictionPartition::Above < EvictionPartition::Below, + "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first"); + debug_assert!(EvictionPartition::EvictNow < EvictionPartition::Above, "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first"); eviction_order.sort(&mut candidates); @@ -1029,7 +1046,7 @@ async fn collect_eviction_candidates( /// /// Returns the amount of candidates selected, with the planned usage. fn select_victims( - candidates: &[(MinResidentSizePartition, EvictionCandidate)], + candidates: &[(EvictionPartition, EvictionCandidate)], usage_pre: U, ) -> VictimSelection { let mut usage_when_switched = None; @@ -1041,7 +1058,7 @@ fn select_victims( break; } - if partition == &MinResidentSizePartition::Below && usage_when_switched.is_none() { + if partition == &EvictionPartition::Below && usage_when_switched.is_none() { usage_when_switched = Some((usage_planned, i)); } diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 71b486a4d3..42086dc2e6 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -78,29 +78,14 @@ paths: delete: description: | - Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved. - 404 means that deletion successfully finished" + Attempts to delete specified tenant. 500, 503 and 409 errors should be retried. Deleting + a non-existent tenant is considered successful (returns 200). responses: "200": description: Tenant was successfully deleted, or was already not found. - "404": - description: Tenant not found. 
This is a success result, equivalent to 200. - content: - application/json: - schema: - $ref: "#/components/schemas/NotFoundError" - "409": - description: Deletion is already in progress, continue polling - content: - application/json: - schema: - $ref: "#/components/schemas/ConflictError" - "412": - description: Deletion may not proceed, tenant is not in Active state - content: - application/json: - schema: - $ref: "#/components/schemas/PreconditionFailedError" + "503": + description: Service is unavailable, or tenant is already being modified (perhaps concurrently deleted) + /v1/tenant/{tenant_id}/time_travel_remote_storage: parameters: @@ -251,6 +236,13 @@ paths: type: string format: date-time description: A timestamp to get the LSN + - name: with_lease + in: query + required: false + schema: + type: boolean + description: Whether to grant a lease to the corresponding LSN. Default to false. + responses: "200": description: OK @@ -273,15 +265,19 @@ paths: type: string format: hex post: - description: Obtain lease for the given LSN - parameters: - - name: lsn - in: query - required: true - schema: - type: string - format: hex - description: A LSN to obtain the lease for + description: Obtains a lease for the given LSN. + requestBody: + content: + application/json: + schema: + type: object + required: + - lsn + properties: + lsn: + description: A LSN to obtain the lease for. 
+ type: string + format: hex responses: "200": description: OK @@ -312,6 +308,45 @@ paths: application/json: schema: type: string + + /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/block_gc: + parameters: + - name: tenant_shard_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + format: hex + post: + description: Persistently add a gc blocking at the tenant level because of this timeline + responses: + "200": + description: OK + + /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/unblock_gc: + parameters: + - name: tenant_shard_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + format: hex + post: + description: Persistently remove a tenant level gc blocking for this timeline + responses: + "200": + description: OK + /v1/tenant/{tenant_shard_id}/location_config: parameters: - name: tenant_shard_id @@ -375,64 +410,13 @@ paths: $ref: "#/components/schemas/TenantLocationConfigResponse" "409": description: | - The tenant is already known to Pageserver in some way, - and hence this `/attach` call has been rejected. - - Some examples of how this can happen: - - tenant was created on this pageserver - - tenant attachment was started by an earlier call to `/attach`. - - Callers should poll the tenant status's `attachment_status` field, - like for status 202. See the longer description for `POST /attach` - for details. + The tenant is already being modified, perhaps by a concurrent call to this API content: application/json: schema: $ref: "#/components/schemas/ConflictError" - /v1/tenant/{tenant_id}/ignore: - parameters: - - name: tenant_id - in: path - required: true - schema: - type: string - post: - description: | - Remove tenant data (including all corresponding timelines) from pageserver's memory. - Files on local disk and remote storage are not affected. 
- Future pageserver restarts won't load the data back until `load` is called on such tenant. - responses: - "200": - description: Tenant ignored - - - /v1/tenant/{tenant_id}/load: - parameters: - - name: tenant_id - in: path - required: true - schema: - type: string - post: - description: | - Schedules an operation that attempts to load a tenant from the local disk and - synchronise it with the remote storage (if enabled), repeating pageserver's restart logic for tenant load. - If the tenant was ignored before, removes the ignore mark and continues with load scheduling. - - Errors if the tenant is absent on disk, already present in memory or fails to schedule its load. - Scheduling a load does not mean that the tenant would load successfully, check tenant status to ensure load correctness. - requestBody: - required: false - content: - application/json: - schema: - $ref: "#/components/schemas/TenantLoadRequest" - responses: - "202": - description: Tenant scheduled to load successfully - - /v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive: + /v1/tenant/{tenant_id}/timeline/{timeline_id}/preserve_initdb_archive: parameters: - name: tenant_id in: path @@ -452,6 +436,51 @@ paths: "202": description: Tenant scheduled to load successfully + /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/archival_config: + parameters: + - name: tenant_shard_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + put: + description: | + Either archives or unarchives the given timeline. + An archived timeline may not have any non-archived children. 
+ requestBody: + required: true + content: + application/json: + schema: + $ref: "#/components/schemas/ArchivalConfigRequest" + responses: + "200": + description: Timeline (un)archived successfully + "409": + description: | + The tenant/timeline is already being modified, perhaps by a concurrent call to this API + content: + application/json: + schema: + $ref: "#/components/schemas/ConflictError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "503": + description: Temporarily unavailable, please retry. + content: + application/json: + schema: + $ref: "#/components/schemas/ServiceUnavailableError" + /v1/tenant/{tenant_id}/synthetic_size: parameters: - name: tenant_id @@ -484,7 +513,9 @@ paths: schema: $ref: "#/components/schemas/SyntheticSizeResponse" text/html: - description: SVG representation of the tenant and it's timelines. + schema: + type: string + description: SVG representation of the tenant and its timelines. "401": description: Unauthorized Error content: @@ -623,7 +654,7 @@ paths: type: string - name: timeline_id in: path - Å•equired: true + required: true schema: type: string @@ -812,8 +843,6 @@ components: For example this can be caused by s3 being unreachable. The retry may be implemented with call to detach, though it would be better to not automate it and inspec failed state manually before proceeding with a retry. - - See the tenant `/attach` endpoint for more information. type: object required: - slug @@ -831,15 +860,13 @@ components: TenantCreateRequest: allOf: - $ref: '#/components/schemas/TenantConfig' + - $ref: '#/components/schemas/TenantLoadRequest' - type: object required: - new_tenant_id properties: new_tenant_id: type: string - generation: - type: integer - description: Attachment generation number. 
TenantLoadRequest: type: object properties: @@ -903,6 +930,15 @@ components: warm: type: boolean description: Whether to poll remote storage for layers to download. If false, secondary locations don't download anything. + ArchivalConfigRequest: + type: object + required: + - state + properties: + state: + description: The archival state of a timeline + type: string + enum: ["Archived", "Unarchived"] TenantConfig: type: object properties: @@ -930,8 +966,6 @@ components: type: string max_lsn_wal_lag: type: integer - trace_read_requests: - type: boolean heatmap_period: type: string TenantConfigResponse: @@ -1086,6 +1120,10 @@ components: kind: type: string enum: [past, present, future, nodata] + valid_until: + type: string + format: date-time + description: The expiration time of the granted lease. LsnLease: type: object @@ -1161,7 +1199,7 @@ components: reparented_timelines: type: array description: Set of reparented timeline ids - properties: + items: type: string format: hex description: TimelineId diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 482879630a..a4da8506d6 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -10,6 +10,7 @@ use std::time::Duration; use anyhow::{anyhow, Context, Result}; use enumset::EnumSet; +use futures::StreamExt; use futures::TryFutureExt; use humantime::format_rfc3339; use hyper::header; @@ -17,12 +18,17 @@ use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; use pageserver_api::models::AuxFilePolicy; +use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest; use pageserver_api::models::IngestAuxFilesRequest; use pageserver_api::models::ListAuxFilesRequest; use pageserver_api::models::LocationConfig; use pageserver_api::models::LocationConfigListResponse; +use pageserver_api::models::LocationConfigMode; +use pageserver_api::models::LsnLease; +use pageserver_api::models::LsnLeaseRequest; use 
pageserver_api::models::ShardParameters; use pageserver_api::models::TenantDetails; +use pageserver_api::models::TenantLocationConfigRequest; use pageserver_api::models::TenantLocationConfigResponse; use pageserver_api::models::TenantScanRemoteStorageResponse; use pageserver_api::models::TenantScanRemoteStorageShard; @@ -30,32 +36,28 @@ use pageserver_api::models::TenantShardLocation; use pageserver_api::models::TenantShardSplitRequest; use pageserver_api::models::TenantShardSplitResponse; use pageserver_api::models::TenantSorting; -use pageserver_api::models::TenantState; +use pageserver_api::models::TimelineArchivalConfigRequest; use pageserver_api::models::TopTenantShardItem; use pageserver_api::models::TopTenantShardsRequest; use pageserver_api::models::TopTenantShardsResponse; -use pageserver_api::models::{ - DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest, - TenantLoadRequest, TenantLocationConfigRequest, -}; use pageserver_api::shard::ShardCount; use pageserver_api::shard::TenantShardId; use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; use remote_storage::TimeTravelError; -use tenant_size_model::{SizeResult, StorageModel}; +use tenant_size_model::{svg::SvgBranchKind, SizeResult, StorageModel}; +use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; use tracing::*; use utils::auth::JwtAuth; use utils::failpoint_support::failpoints_handler; use utils::http::endpoint::prometheus_metrics_handler; use utils::http::endpoint::request_span; -use utils::http::json::json_request_or_empty_body; +use utils::http::request::must_parse_query_param; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; use crate::context::{DownloadBehavior, RequestContext}; use crate::deletion_queue::DeletionQueueClient; -use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use 
crate::tenant::config::{LocationConf, TenantConfOpt}; @@ -77,13 +79,12 @@ use crate::tenant::timeline::CompactFlags; use crate::tenant::timeline::CompactionError; use crate::tenant::timeline::Timeline; use crate::tenant::GetTimelineError; -use crate::tenant::SpawnMode; use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; use crate::{config::PageServerConf, tenant::mgr}; use crate::{disk_usage_eviction_task, tenant}; use pageserver_api::models::{ - StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, - TimelineCreateRequest, TimelineGcRequest, TimelineInfo, + StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest, + TimelineInfo, }; use utils::{ auth::SwappableJwtAuth, @@ -177,10 +178,8 @@ fn check_permission(request: &Request, tenant_id: Option) -> Res impl From for ApiError { fn from(pre: PageReconstructError) -> ApiError { match pre { - PageReconstructError::Other(pre) => ApiError::InternalServerError(pre), - PageReconstructError::MissingKey(e) => { - ApiError::InternalServerError(anyhow::anyhow!("{e}")) - } + PageReconstructError::Other(other) => ApiError::InternalServerError(other), + PageReconstructError::MissingKey(e) => ApiError::InternalServerError(e.into()), PageReconstructError::Cancelled => ApiError::Cancelled, PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()), PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre), @@ -205,7 +204,6 @@ impl From for ApiError { NotFound(tenant_id) => { ApiError::NotFound(anyhow::anyhow!("NotFound: tenant {tenant_id}").into()) } - e @ AlreadyExists(_, _) => ApiError::Conflict(format!("{e}")), InProgress => { ApiError::ResourceUnavailable("Tenant is being modified concurrently".into()) } @@ -232,7 +230,7 @@ impl From for ApiError { BadRequest(e) => ApiError::BadRequest(e), Unavailable(_) => ApiError::ShuttingDown, e @ InProgress => ApiError::Conflict(format!("{e}")), - Flush(e) 
| Other(e) => ApiError::InternalServerError(e), + Flush(e) | InternalError(e) => ApiError::InternalServerError(e), } } } @@ -296,6 +294,11 @@ impl From for ApiError { GetActiveTenantError::WaitForActiveTimeout { .. } => { ApiError::ResourceUnavailable(format!("{}", e).into()) } + GetActiveTenantError::SwitchedTenant => { + // in our HTTP handlers, this error doesn't happen + // TODO: separate error types + ApiError::ResourceUnavailable("switched tenant".into()) + } } } } @@ -330,18 +333,12 @@ impl From for ApiError { } } -impl From for ApiError { - fn from(value: crate::tenant::delete::DeleteTenantError) -> Self { - use crate::tenant::delete::DeleteTenantError::*; +impl From for ApiError { + fn from(value: crate::tenant::mgr::DeleteTenantError) -> Self { + use crate::tenant::mgr::DeleteTenantError::*; match value { - Get(g) => ApiError::from(g), - e @ AlreadyInProgress => ApiError::Conflict(e.to_string()), - Timeline(t) => ApiError::from(t), - NotAttached => ApiError::NotFound(anyhow::anyhow!("Tenant is not attached").into()), SlotError(e) => e.into(), - SlotUpsertError(e) => e.into(), Other(o) => ApiError::InternalServerError(o), - e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()), Cancelled => ApiError::ShuttingDown, } } @@ -417,6 +414,8 @@ async fn build_timeline_info_common( let walreceiver_status = timeline.walreceiver_status(); + let (pitr_history_size, within_ancestor_pitr) = timeline.get_pitr_history_stats(); + let info = TimelineInfo { tenant_id: timeline.tenant_shard_id, timeline_id: timeline.timeline_id, @@ -437,6 +436,8 @@ async fn build_timeline_info_common( directory_entries_counts: timeline.get_directory_metrics().to_vec(), current_physical_size, current_logical_size_non_incremental: None, + pitr_history_size, + within_ancestor_pitr, timeline_dir_layer_file_size_sum: None, wal_source_connstr, last_received_msg_lsn, @@ -667,6 +668,39 @@ async fn timeline_preserve_initdb_handler( json_response(StatusCode::OK, ()) } 
+async fn timeline_archival_config_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + + let request_data: TimelineArchivalConfigRequest = json_request(&mut request).await?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); + + async { + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + tenant + .apply_timeline_archival_config(timeline_id, request_data.state) + .await + .context("applying archival config") + .map_err(ApiError::InternalServerError)?; + Ok::<_, ApiError>(()) + } + .instrument(info_span!("timeline_archival_config", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug(), + state = ?request_data.state, + %timeline_id)) + .await?; + + json_response(StatusCode::OK, ()) +} + async fn timeline_detail_handler( request: Request, _cancel: CancellationToken, @@ -735,6 +769,8 @@ async fn get_lsn_by_timestamp_handler( .map_err(ApiError::BadRequest)?; let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp); + let with_lease = parse_query_param(&request, "with_lease")?.unwrap_or(false); + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let timeline = @@ -743,10 +779,15 @@ async fn get_lsn_by_timestamp_handler( let result = timeline .find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx) .await?; + #[derive(serde::Serialize, Debug)] struct Result { lsn: Lsn, kind: &'static str, + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(flatten)] + lease: Option, } let (lsn, kind) = match result { LsnForTimestamp::Present(lsn) => (lsn, "present"), @@ -754,11 +795,28 @@ async fn get_lsn_by_timestamp_handler( LsnForTimestamp::Past(lsn) => (lsn, "past"), LsnForTimestamp::NoData(lsn) => (lsn, 
"nodata"), }; - let result = Result { lsn, kind }; + + let lease = if with_lease { + timeline + .make_lsn_lease(lsn, timeline.get_lsn_lease_length_for_ts(), &ctx) + .inspect_err(|_| { + warn!("fail to grant a lease to {}", lsn); + }) + .ok() + } else { + None + }; + + let result = Result { lsn, kind, lease }; + let valid_until = result + .lease + .as_ref() + .map(|l| humantime::format_rfc3339_millis(l.valid_until).to_string()); tracing::info!( lsn=?result.lsn, kind=%result.kind, timestamp=%timestamp_raw, + valid_until=?valid_until, "lsn_by_timestamp finished" ); json_response(StatusCode::OK, result) @@ -803,58 +861,6 @@ async fn get_timestamp_of_lsn_handler( } } -async fn tenant_attach_handler( - mut request: Request, - _cancel: CancellationToken, -) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; - - let maybe_body: Option = json_request_or_empty_body(&mut request).await?; - let tenant_conf = match &maybe_body { - Some(request) => TenantConfOpt::try_from(&*request.config).map_err(ApiError::BadRequest)?, - None => TenantConfOpt::default(), - }; - - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); - - info!("Handling tenant attach {tenant_id}"); - - let state = get_state(&request); - - let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?; - - let tenant_shard_id = TenantShardId::unsharded(tenant_id); - let shard_params = ShardParameters::default(); - let location_conf = LocationConf::attached_single(tenant_conf, generation, &shard_params); - - let tenant = state - .tenant_manager - .upsert_location(tenant_shard_id, location_conf, None, SpawnMode::Eager, &ctx) - .await?; - - let Some(tenant) = tenant else { - // This should never happen: indicates a bug in upsert_location - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "Upsert succeeded but didn't return tenant!" 
- ))); - }; - - // We might have successfully constructed a Tenant, but it could still - // end up in a broken state: - if let TenantState::Broken { - reason, - backtrace: _, - } = tenant.current_state() - { - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "Tenant state is Broken: {reason}" - ))); - } - - json_response(StatusCode::ACCEPTED, ()) -} - async fn timeline_delete_handler( request: Request, _cancel: CancellationToken, @@ -885,33 +891,6 @@ async fn timeline_delete_handler( json_response(StatusCode::ACCEPTED, ()) } -async fn tenant_detach_handler( - request: Request, - _cancel: CancellationToken, -) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; - let detach_ignored: Option = parse_query_param(&request, "detach_ignored")?; - - // This is a legacy API (`/location_conf` is the replacement). It only supports unsharded tenants - let tenant_shard_id = TenantShardId::unsharded(tenant_id); - - let state = get_state(&request); - let conf = state.conf; - state - .tenant_manager - .detach_tenant( - conf, - tenant_shard_id, - detach_ignored.unwrap_or(false), - &state.deletion_queue_client, - ) - .instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug())) - .await?; - - json_response(StatusCode::OK, ()) -} - async fn tenant_reset_handler( request: Request, _cancel: CancellationToken, @@ -932,54 +911,6 @@ async fn tenant_reset_handler( json_response(StatusCode::OK, ()) } -async fn tenant_load_handler( - mut request: Request, - _cancel: CancellationToken, -) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; - - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); - - let maybe_body: Option = json_request_or_empty_body(&mut request).await?; - - let state = get_state(&request); - - // The /load request is only 
usable when control_plane_api is not set. Once it is set, callers - // should always use /attach instead. - let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?; - - mgr::load_tenant( - state.conf, - tenant_id, - generation, - state.broker_client.clone(), - state.remote_storage.clone(), - state.deletion_queue_client.clone(), - &ctx, - ) - .instrument(info_span!("load", %tenant_id)) - .await?; - - json_response(StatusCode::ACCEPTED, ()) -} - -async fn tenant_ignore_handler( - request: Request, - _cancel: CancellationToken, -) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; - - let state = get_state(&request); - let conf = state.conf; - mgr::ignore_tenant(conf, tenant_id) - .instrument(info_span!("ignore_tenant", %tenant_id)) - .await?; - - json_response(StatusCode::OK, ()) -} - async fn tenant_list_handler( request: Request, _cancel: CancellationToken, @@ -999,7 +930,10 @@ async fn tenant_list_handler( state: state.clone(), current_physical_size: None, attachment_status: state.attachment_status(), - generation: (*gen).into(), + generation: (*gen) + .into() + .expect("Tenants are always attached with a generation"), + gc_blocking: None, }) .collect::>(); @@ -1047,7 +981,11 @@ async fn tenant_status( state: state.clone(), current_physical_size: Some(current_physical_size), attachment_status: state.attachment_status(), - generation: tenant.generation().into(), + generation: tenant + .generation() + .into() + .expect("Tenants are always attached with a generation"), + gc_blocking: tenant.gc_block.summary().map(|x| format!("{x:?}")), }, walredo: tenant.wal_redo_manager_status(), timelines: tenant.list_timeline_ids(), @@ -1071,23 +1009,16 @@ async fn tenant_delete_handler( let state = get_state(&request); - let status = state + state .tenant_manager - .delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT) + 
.delete_tenant(tenant_shard_id) .instrument(info_span!("tenant_delete_handler", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug() )) .await?; - // Callers use 404 as success for deletions, for historical reasons. - if status == StatusCode::NOT_FOUND { - return Err(ApiError::NotFound( - anyhow::anyhow!("Deletion complete").into(), - )); - } - - json_response(status, ()) + json_response(StatusCode::OK, ()) } /// HTTP endpoint to query the current tenant_size of a tenant. @@ -1229,7 +1160,10 @@ async fn layer_map_info_handler( let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; - let layer_map_info = timeline.layer_map_info(reset).await; + let layer_map_info = timeline + .layer_map_info(reset) + .await + .map_err(|_shutdown| ApiError::ShuttingDown)?; json_response(StatusCode::OK, layer_map_info) } @@ -1295,6 +1229,72 @@ async fn evict_timeline_layer_handler( } } +async fn timeline_gc_blocking_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + block_or_unblock_gc(request, true).await +} + +async fn timeline_gc_unblocking_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + block_or_unblock_gc(request, false).await +} + +/// Adding a block is `POST ../block_gc`, removing a block is `POST ../unblock_gc`. +/// +/// Both are technically unsafe because they might fire off index uploads, thus they are POST. 
+async fn block_or_unblock_gc( + request: Request, + block: bool, +) -> Result, ApiError> { + use crate::tenant::{ + remote_timeline_client::WaitCompletionError, upload_queue::NotInitialized, + }; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let state = get_state(&request); + + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + + let timeline = tenant.get_timeline(timeline_id, true)?; + + let fut = async { + if block { + timeline.block_gc(&tenant).await.map(|_| ()) + } else { + timeline.unblock_gc(&tenant).await + } + }; + + let span = tracing::info_span!( + "block_or_unblock_gc", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug(), + timeline_id = %timeline_id, + block = block, + ); + + let res = fut.instrument(span).await; + + res.map_err(|e| { + if e.is::() || e.is::() { + ApiError::ShuttingDown + } else { + ApiError::InternalServerError(e) + } + })?; + + json_response(StatusCode::OK, ()) +} + /// Get tenant_size SVG graph along with the JSON data. 
fn synthetic_size_html_response( inputs: ModelInputs, @@ -1307,10 +1307,15 @@ fn synthetic_size_html_response( timeline_map.insert(ti.timeline_id, index); timeline_ids.push(ti.timeline_id.to_string()); } - let seg_to_branch: Vec = inputs + let seg_to_branch: Vec<(usize, SvgBranchKind)> = inputs .segments .iter() - .map(|seg| *timeline_map.get(&seg.timeline_id).unwrap()) + .map(|seg| { + ( + *timeline_map.get(&seg.timeline_id).unwrap(), + seg.kind.into(), + ) + }) .collect(); let svg = @@ -1351,75 +1356,6 @@ pub fn html_response(status: StatusCode, data: String) -> Result, Ok(response) } -/// Helper for requests that may take a generation, which is mandatory -/// when control_plane_api is set, but otherwise defaults to Generation::none() -fn get_request_generation(state: &State, req_gen: Option) -> Result { - if state.conf.control_plane_api.is_some() { - req_gen - .map(Generation::new) - .ok_or(ApiError::BadRequest(anyhow!( - "generation attribute missing" - ))) - } else { - // Legacy mode: all tenants operate with no generation - Ok(Generation::none()) - } -} - -async fn tenant_create_handler( - mut request: Request, - _cancel: CancellationToken, -) -> Result, ApiError> { - let request_data: TenantCreateRequest = json_request(&mut request).await?; - let target_tenant_id = request_data.new_tenant_id; - check_permission(&request, None)?; - - let _timer = STORAGE_TIME_GLOBAL - .get_metric_with_label_values(&[StorageTimeOperation::CreateTenant.into()]) - .expect("bug") - .start_timer(); - - let tenant_conf = - TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?; - - let state = get_state(&request); - - let generation = get_request_generation(state, request_data.generation)?; - - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); - - let location_conf = - LocationConf::attached_single(tenant_conf, generation, &request_data.shard_parameters); - - let new_tenant = state - .tenant_manager - .upsert_location( - 
target_tenant_id, - location_conf, - None, - SpawnMode::Create, - &ctx, - ) - .await?; - - let Some(new_tenant) = new_tenant else { - // This should never happen: indicates a bug in upsert_location - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "Upsert succeeded but didn't return tenant!" - ))); - }; - // We created the tenant. Existing API semantics are that the tenant - // is Active when this function returns. - new_tenant - .wait_to_become_active(ACTIVE_TENANT_TIMEOUT) - .await?; - - json_response( - StatusCode::CREATED, - TenantCreateResponse(new_tenant.tenant_shard_id().tenant_id), - ) -} - async fn get_tenant_config_handler( request: Request, _cancel: CancellationToken, @@ -1481,7 +1417,7 @@ async fn update_tenant_config_handler( crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf) .await - .map_err(ApiError::InternalServerError)?; + .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; tenant.set_new_tenant_config(new_tenant_conf); json_response(StatusCode::OK, ()) @@ -1507,7 +1443,7 @@ async fn put_tenant_location_config_handler( if let LocationConfigMode::Detached = request_data.config.mode { if let Err(e) = state .tenant_manager - .detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client) + .detach_tenant(conf, tenant_shard_id, &state.deletion_queue_client) .instrument(info_span!("tenant_detach", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug() @@ -1712,15 +1648,13 @@ async fn handle_tenant_break( // Obtains an lsn lease on the given timeline. async fn lsn_lease_handler( - request: Request, + mut request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; - - let lsn: Lsn = parse_query_param(&request, "lsn")? 
- .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?; + let lsn = json_request::(&mut request).await?.lsn; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); @@ -1771,6 +1705,14 @@ async fn timeline_compact_handler( if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? { flags |= CompactFlags::ForceImageLayerCreation; } + if Some(true) == parse_query_param::<_, bool>(&request, "enhanced_gc_bottom_most_compaction")? { + if !cfg!(feature = "testing") { + return Err(ApiError::InternalServerError(anyhow!( + "enhanced_gc_bottom_most_compaction is only available in testing mode" + ))); + } + flags |= CompactFlags::EnhancedGcBottomMostCompaction; + } let wait_until_uploaded = parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false); @@ -1782,7 +1724,9 @@ async fn timeline_compact_handler( .await .map_err(|e| ApiError::InternalServerError(e.into()))?; if wait_until_uploaded { - timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?; + timeline.remote_client.wait_completion().await + // XXX map to correct ApiError for the cases where it's due to shutdown + .context("wait completion").map_err(ApiError::InternalServerError)?; } json_response(StatusCode::OK, ()) } @@ -1808,6 +1752,10 @@ async fn timeline_checkpoint_handler( if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? { flags |= CompactFlags::ForceImageLayerCreation; } + + // By default, checkpoints come with a compaction, but this may be optionally disabled by tests that just want to flush + upload. 
+ let compact = parse_query_param::<_, bool>(&request, "compact")?.unwrap_or(true); + let wait_until_uploaded = parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false); @@ -1824,18 +1772,24 @@ async fn timeline_checkpoint_handler( } })?; - timeline - .compact(&cancel, flags, &ctx) - .await - .map_err(|e| - match e { - CompactionError::ShuttingDown => ApiError::ShuttingDown, - CompactionError::Other(e) => ApiError::InternalServerError(e) - } - )?; + if compact { + timeline + .compact(&cancel, flags, &ctx) + .await + .map_err(|e| + match e { + CompactionError::ShuttingDown => ApiError::ShuttingDown, + CompactionError::Other(e) => ApiError::InternalServerError(e) + } + )?; + } if wait_until_uploaded { - timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?; + tracing::info!("Waiting for uploads to complete..."); + timeline.remote_client.wait_completion().await + // XXX map to correct ApiError for the cases where it's due to shutdown + .context("wait completion").map_err(ApiError::InternalServerError)?; + tracing::info!("Uploads completed up to {}", timeline.get_remote_consistent_lsn_projected().unwrap_or(Lsn(0))); } json_response(StatusCode::OK, ()) @@ -1887,7 +1841,9 @@ async fn timeline_detach_ancestor_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - use crate::tenant::timeline::detach_ancestor::Options; + use crate::tenant::timeline::detach_ancestor; + use pageserver_api::models::detach_ancestor::AncestorDetached; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -1895,7 +1851,7 @@ async fn timeline_detach_ancestor_handler( let span = tracing::info_span!("detach_ancestor", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id); async move { - let mut options = 
Options::default(); + let mut options = detach_ancestor::Options::default(); let rewrite_concurrency = parse_query_param::<_, std::num::NonZeroUsize>(&request, "rewrite_concurrency")?; @@ -1923,27 +1879,35 @@ async fn timeline_detach_ancestor_handler( let timeline = tenant.get_timeline(timeline_id, true)?; - let (_guard, prepared) = timeline + let progress = timeline .prepare_to_detach_from_ancestor(&tenant, options, ctx) .await?; - let res = state - .tenant_manager - .complete_detaching_timeline_ancestor(tenant_shard_id, timeline_id, prepared, ctx) - .await; + // uncomment to allow early as possible Tenant::drop + // drop(tenant); - match res { - Ok(reparented_timelines) => { - let resp = pageserver_api::models::detach_ancestor::AncestorDetached { + let resp = match progress { + detach_ancestor::Progress::Prepared(attempt, prepared) => { + // it would be great to tag the guard on to the tenant activation future + let reparented_timelines = state + .tenant_manager + .complete_detaching_timeline_ancestor( + tenant_shard_id, + timeline_id, + prepared, + attempt, + ctx, + ) + .await?; + + AncestorDetached { reparented_timelines, - }; - - json_response(StatusCode::OK, resp) + } } - Err(e) => Err(ApiError::InternalServerError( - e.context("timeline detach completion"), - )), - } + detach_ancestor::Progress::Done(resp) => resp, + }; + + json_response(StatusCode::OK, resp) } .instrument(span) .await @@ -2240,14 +2204,24 @@ async fn secondary_download_handler( let timeout = wait.unwrap_or(Duration::MAX); - let status = match tokio::time::timeout( + let result = tokio::time::timeout( timeout, state.secondary_controller.download_tenant(tenant_shard_id), ) - .await - { - // Download job ran to completion. 
- Ok(Ok(())) => StatusCode::OK, + .await; + + let progress = secondary_tenant.progress.lock().unwrap().clone(); + + let status = match result { + Ok(Ok(())) => { + if progress.layers_downloaded >= progress.layers_total { + // Download job ran to completion + StatusCode::OK + } else { + // Download dropped out without errors because it ran out of time budget + StatusCode::ACCEPTED + } + } // Edge case: downloads aren't usually fallible: things like a missing heatmap are considered // okay. We could get an error here in the unlikely edge case that the tenant // was detached between our check above and executing the download job. @@ -2257,8 +2231,6 @@ async fn secondary_download_handler( Err(_) => StatusCode::ACCEPTED, }; - let progress = secondary_tenant.progress.lock().unwrap().clone(); - json_response(status, progress) } @@ -2384,8 +2356,9 @@ async fn get_utilization( // regenerate at most 1Hz to allow polling at any rate. if !still_valid { let path = state.conf.tenants_path(); - let doc = crate::utilization::regenerate(path.as_std_path()) - .map_err(ApiError::InternalServerError)?; + let doc = + crate::utilization::regenerate(state.conf, path.as_std_path(), &state.tenant_manager) + .map_err(ApiError::InternalServerError)?; let mut buf = Vec::new(); serde_json::to_writer(&mut buf, &doc) @@ -2573,6 +2546,189 @@ async fn post_top_tenants( ) } +async fn put_tenant_timeline_import_basebackup( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let base_lsn: Lsn = must_parse_query_param(&request, "base_lsn")?; + let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?; + let pg_version: u32 = must_parse_query_param(&request, "pg_version")?; + + check_permission(&request, Some(tenant_id))?; + + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + + let span = 
info_span!("import_basebackup", tenant_id=%tenant_id, timeline_id=%timeline_id, base_lsn=%base_lsn, end_lsn=%end_lsn, pg_version=%pg_version); + async move { + let state = get_state(&request); + let tenant = state + .tenant_manager + .get_attached_tenant_shard(TenantShardId::unsharded(tenant_id))?; + + let broker_client = state.broker_client.clone(); + + let mut body = StreamReader::new(request.into_body().map(|res| { + res.map_err(|error| { + std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error)) + }) + })); + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + + let timeline = tenant + .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx) + .map_err(ApiError::InternalServerError) + .await?; + + // TODO mark timeline as not ready until it reaches end_lsn. + // We might have some wal to import as well, and we should prevent compute + // from connecting before that and writing conflicting wal. + // + // This is not relevant for pageserver->pageserver migrations, since there's + // no wal to import. But should be fixed if we want to import from postgres. + + // TODO leave clean state on error. For now you can use detach to clean + // up broken state from a failed import. + + // Import basebackup provided via CopyData + info!("importing basebackup"); + + timeline + .import_basebackup_from_tar(tenant.clone(), &mut body, base_lsn, broker_client, &ctx) + .await + .map_err(ApiError::InternalServerError)?; + + // Read the end of the tar archive. + read_tar_eof(body) + .await + .map_err(ApiError::InternalServerError)?; + + // TODO check checksum + // Meanwhile you can verify client-side by taking fullbackup + // and checking that it matches in size with what was imported. + // It wouldn't work if base came from vanilla postgres though, + // since we discard some log files. 
+ + info!("done"); + json_response(StatusCode::OK, ()) + } + .instrument(span) + .await +} + +async fn put_tenant_timeline_import_wal( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let start_lsn: Lsn = must_parse_query_param(&request, "start_lsn")?; + let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?; + + check_permission(&request, Some(tenant_id))?; + + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + + let span = info_span!("import_wal", tenant_id=%tenant_id, timeline_id=%timeline_id, start_lsn=%start_lsn, end_lsn=%end_lsn); + async move { + let state = get_state(&request); + + let timeline = active_timeline_of_active_tenant(&state.tenant_manager, TenantShardId::unsharded(tenant_id), timeline_id).await?; + + let mut body = StreamReader::new(request.into_body().map(|res| { + res.map_err(|error| { + std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error)) + }) + })); + + let last_record_lsn = timeline.get_last_record_lsn(); + if last_record_lsn != start_lsn { + return Err(ApiError::InternalServerError(anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}"))); + } + + // TODO leave clean state on error. For now you can use detach to clean + // up broken state from a failed import. + + // Import wal provided via CopyData + info!("importing wal"); + crate::import_datadir::import_wal_from_tar(&timeline, &mut body, start_lsn, end_lsn, &ctx).await.map_err(ApiError::InternalServerError)?; + info!("wal import complete"); + + // Read the end of the tar archive. + read_tar_eof(body).await.map_err(ApiError::InternalServerError)?; + + // TODO Does it make sense to overshoot? 
+        if timeline.get_last_record_lsn() < end_lsn {
+            return Err(ApiError::InternalServerError(anyhow::anyhow!("WAL import ended at Lsn {} which is before the requested end Lsn {end_lsn}", timeline.get_last_record_lsn())));
+        }
+
+        // Flush data to disk, then upload to s3. No need for a forced checkpoint.
+        // We only want to persist the data, and it doesn't matter if it's in the
+        // shape of deltas or images.
+        info!("flushing layers");
+        timeline.freeze_and_flush().await.map_err(|e| match e {
+            tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown,
+            other => ApiError::InternalServerError(anyhow::anyhow!(other)),
+        })?;
+
+        info!("done");
+
+        json_response(StatusCode::OK, ())
+    }.instrument(span).await
+}
+
+/// Read the end of a tar archive.
+///
+/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
+/// `tokio_tar` already read the first such block. Read the second all-zeros block,
+/// and check that there is no more data after the EOF marker.
+///
+/// 'tar' command can also write extra blocks of zeros, up to a record
+/// size, controlled by the --record-size argument. Ignore them too.
+async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow::Result<()> {
+    use tokio::io::AsyncReadExt;
+    const TAR_BLOCK: usize = 512;
+    let mut block = [0u8; TAR_BLOCK];
+
+    // Fill exactly one block with the remaining EOF marker block.
+    // A zero-length read means the stream ended before the block was complete.
+    let mut filled = 0;
+    while filled < TAR_BLOCK {
+        match reader.read(&mut block[filled..]).await? {
+            0 => break,
+            n => filled += n,
+        }
+    }
+    if filled < TAR_BLOCK {
+        anyhow::bail!("incomplete or invalid tar EOF marker");
+    }
+    if block.iter().any(|&byte| byte != 0) {
+        anyhow::bail!("invalid tar EOF marker");
+    }
+
+    // Consume whatever follows the marker; only zero padding in whole blocks is accepted.
+    let mut trailing_bytes = 0;
+    let mut padding_is_dirty = false;
+    loop {
+        let n = reader.read(&mut block).await?;
+        if n == 0 {
+            break;
+        }
+        trailing_bytes += n;
+        // The whole buffer is scanned; it was all zeros before the first read of this loop.
+        padding_is_dirty |= block.iter().any(|&byte| byte != 0);
+    }
+    if padding_is_dirty {
+        anyhow::bail!("unexpected non-zero bytes after the tar archive");
+    }
+    if trailing_bytes % TAR_BLOCK != 0 {
+        anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive");
+    }
+    Ok(())
+}
+
 /// Common functionality of all the HTTP API handlers.
/// /// - Adds a tracing span to each request (by `request_span`) @@ -2717,7 +2873,6 @@ pub fn make_router( api_handler(r, reload_auth_validation_keys_handler) }) .get("/v1/tenant", |r| api_handler(r, tenant_list_handler)) - .post("/v1/tenant", |r| api_handler(r, tenant_create_handler)) .get("/v1/tenant/:tenant_shard_id", |r| { api_handler(r, tenant_status) }) @@ -2755,25 +2910,17 @@ pub fn make_router( .post("/v1/tenant/:tenant_shard_id/timeline", |r| { api_handler(r, timeline_create_handler) }) - .post("/v1/tenant/:tenant_id/attach", |r| { - api_handler(r, tenant_attach_handler) - }) - .post("/v1/tenant/:tenant_id/detach", |r| { - api_handler(r, tenant_detach_handler) - }) .post("/v1/tenant/:tenant_shard_id/reset", |r| { api_handler(r, tenant_reset_handler) }) - .post("/v1/tenant/:tenant_id/load", |r| { - api_handler(r, tenant_load_handler) - }) - .post("/v1/tenant/:tenant_id/ignore", |r| { - api_handler(r, tenant_ignore_handler) - }) .post( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive", |r| api_handler(r, timeline_preserve_initdb_handler), ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/archival_config", + |r| api_handler(r, timeline_archival_config_handler), + ) .get("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| { api_handler(r, timeline_detail_handler) }) @@ -2828,6 +2975,14 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name", |r| api_handler(r, evict_timeline_layer_handler), ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/block_gc", + |r| api_handler(r, timeline_gc_blocking_handler), + ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/unblock_gc", + |r| api_handler(r, timeline_gc_unblocking_handler), + ) .post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| { api_handler(r, secondary_upload_handler) }) @@ -2880,5 +3035,13 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/perf_info", |r| 
testing_api_handler("perf_info", r, perf_info), ) + .put( + "/v1/tenant/:tenant_id/timeline/:timeline_id/import_basebackup", + |r| api_handler(r, put_tenant_timeline_import_basebackup), + ) + .put( + "/v1/tenant/:tenant_id/timeline/:timeline_id/import_wal", + |r| api_handler(r, put_tenant_timeline_import_wal), + ) .any(handler_404)) } diff --git a/pageserver/src/l0_flush.rs b/pageserver/src/l0_flush.rs new file mode 100644 index 0000000000..313a7961a6 --- /dev/null +++ b/pageserver/src/l0_flush.rs @@ -0,0 +1,39 @@ +use std::{num::NonZeroUsize, sync::Arc}; + +#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)] +#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] +pub enum L0FlushConfig { + #[serde(rename_all = "snake_case")] + Direct { max_concurrency: NonZeroUsize }, +} + +impl Default for L0FlushConfig { + fn default() -> Self { + Self::Direct { + // TODO: using num_cpus results in different peak memory usage on different instance types. + max_concurrency: NonZeroUsize::new(usize::max(1, num_cpus::get())).unwrap(), + } + } +} + +#[derive(Clone)] +pub struct L0FlushGlobalState(Arc); + +pub enum Inner { + Direct { semaphore: tokio::sync::Semaphore }, +} + +impl L0FlushGlobalState { + pub fn new(config: L0FlushConfig) -> Self { + match config { + L0FlushConfig::Direct { max_concurrency } => { + let semaphore = tokio::sync::Semaphore::new(max_concurrency.get()); + Self(Arc::new(Inner::Direct { semaphore })) + } + } + } + + pub fn inner(&self) -> &Arc { + &self.0 + } +} diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index c69fb8c83b..dbfc9f3544 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -11,7 +11,11 @@ pub mod deletion_queue; pub mod disk_usage_eviction_task; pub mod http; pub mod import_datadir; +pub mod l0_flush; + +use futures::{stream::FuturesUnordered, StreamExt}; pub use pageserver_api::keyspace; +use tokio_util::sync::CancellationToken; pub mod aux_file; pub mod metrics; pub mod page_cache; @@ -22,18 
+26,19 @@ pub mod span; pub(crate) mod statvfs; pub mod task_mgr; pub mod tenant; -pub mod trace; pub mod utilization; pub mod virtual_file; pub mod walingest; pub mod walrecord; pub mod walredo; -use crate::task_mgr::TaskKind; use camino::Utf8Path; use deletion_queue::DeletionQueue; -use tenant::mgr::TenantManager; -use tracing::info; +use tenant::{ + mgr::{BackgroundPurges, TenantManager}, + secondary, +}; +use tracing::{info, info_span}; /// Current storage format version /// @@ -44,7 +49,7 @@ use tracing::info; /// backwards-compatible changes to the metadata format. pub const STORAGE_FORMAT_VERSION: u16 = 3; -pub const DEFAULT_PG_VERSION: u32 = 15; +pub const DEFAULT_PG_VERSION: u32 = 16; // Magic constants used to identify different kinds of files pub const IMAGE_FILE_MAGIC: u16 = 0x5A60; @@ -54,17 +59,113 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]); pub use crate::metrics::preinitialize_metrics; +pub struct CancellableTask { + pub task: tokio::task::JoinHandle<()>, + pub cancel: CancellationToken, +} +pub struct HttpEndpointListener(pub CancellableTask); +pub struct ConsumptionMetricsTasks(pub CancellableTask); +pub struct DiskUsageEvictionTask(pub CancellableTask); +impl CancellableTask { + pub async fn shutdown(self) { + self.cancel.cancel(); + self.task.await.unwrap(); + } +} + #[tracing::instrument(skip_all, fields(%exit_code))] +#[allow(clippy::too_many_arguments)] pub async fn shutdown_pageserver( + http_listener: HttpEndpointListener, + page_service: page_service::Listener, + consumption_metrics_worker: ConsumptionMetricsTasks, + disk_usage_eviction_task: Option, tenant_manager: &TenantManager, + background_purges: BackgroundPurges, mut deletion_queue: DeletionQueue, + secondary_controller_tasks: secondary::GlobalTasks, exit_code: i32, ) { use std::time::Duration; + + let started_at = std::time::Instant::now(); + + // If the orderly shutdown below takes too long, we still want to make + // sure that all walredo 
processes are killed and wait()ed on by us, not systemd. + // + // (Leftover walredo processes are the hypothesized trigger for the systemd freezes + // that we keep seeing in prod => https://github.com/neondatabase/cloud/issues/11387. + // + // We use a thread instead of a tokio task because the background runtime is likely busy + // with the final flushing / uploads. This activity here has priority, and due to lack + // of scheduling priority feature sin the tokio scheduler, using a separate thread is + // an effective priority booster. + let walredo_extraordinary_shutdown_thread_span = { + let span = info_span!(parent: None, "walredo_extraordinary_shutdown_thread"); + span.follows_from(tracing::Span::current()); + span + }; + let walredo_extraordinary_shutdown_thread_cancel = CancellationToken::new(); + let walredo_extraordinary_shutdown_thread = std::thread::spawn({ + let walredo_extraordinary_shutdown_thread_cancel = + walredo_extraordinary_shutdown_thread_cancel.clone(); + move || { + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + let _entered = rt.enter(); + let _entered = walredo_extraordinary_shutdown_thread_span.enter(); + if let Ok(()) = rt.block_on(tokio::time::timeout( + Duration::from_secs(8), + walredo_extraordinary_shutdown_thread_cancel.cancelled(), + )) { + info!("cancellation requested"); + return; + } + let managers = tenant::WALREDO_MANAGERS + .lock() + .unwrap() + // prevents new walredo managers from being inserted + .take() + .expect("only we take()"); + // Use FuturesUnordered to get in queue early for each manager's + // heavier_once_cell semaphore wait list. + // Also, for idle tenants that for some reason haven't + // shut down yet, it's quite likely that we're not going + // to get Poll::Pending once. 
+ let mut futs: FuturesUnordered<_> = managers + .into_iter() + .filter_map(|(_, mgr)| mgr.upgrade()) + .map(|mgr| async move { tokio::task::unconstrained(mgr.shutdown()).await }) + .collect(); + info!(count=%futs.len(), "built FuturesUnordered"); + let mut last_log_at = std::time::Instant::now(); + #[derive(Debug, Default)] + struct Results { + initiated: u64, + already: u64, + } + let mut results = Results::default(); + while let Some(we_initiated) = rt.block_on(futs.next()) { + if we_initiated { + results.initiated += 1; + } else { + results.already += 1; + } + if last_log_at.elapsed() > Duration::from_millis(100) { + info!(remaining=%futs.len(), ?results, "progress"); + last_log_at = std::time::Instant::now(); + } + } + info!(?results, "done"); + } + }); + // Shut down the libpq endpoint task. This prevents new connections from // being accepted. - timed( - task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None), + let remaining_connections = timed( + page_service.stop_accepting(), "shutdown LibpqEndpointListener", Duration::from_secs(1), ) @@ -82,7 +183,7 @@ pub async fn shutdown_pageserver( // Shut down any page service tasks: any in-progress work for particular timelines or tenants // should already have been canclled via mgr::shutdown_all_tenants timed( - task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None), + remaining_connections.shutdown(), "shutdown PageRequestHandlers", Duration::from_secs(1), ) @@ -91,16 +192,44 @@ pub async fn shutdown_pageserver( // Best effort to persist any outstanding deletions, to avoid leaking objects deletion_queue.shutdown(Duration::from_secs(5)).await; + timed( + consumption_metrics_worker.0.shutdown(), + "shutdown consumption metrics", + Duration::from_secs(1), + ) + .await; + + timed( + futures::future::OptionFuture::from(disk_usage_eviction_task.map(|t| t.0.shutdown())), + "shutdown disk usage eviction", + Duration::from_secs(1), + ) + .await; + + timed( + 
background_purges.shutdown(), + "shutdown background purges", + Duration::from_secs(1), + ) + .await; + // Shut down the HTTP endpoint last, so that you can still check the server's // status while it's shutting down. // FIXME: We should probably stop accepting commands like attach/detach earlier. timed( - task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None), + http_listener.0.shutdown(), "shutdown http", Duration::from_secs(1), ) .await; + timed( + secondary_controller_tasks.wait(), // cancellation happened in caller + "secondary controller wait", + Duration::from_secs(1), + ) + .await; + // There should be nothing left, but let's be sure timed( task_mgr::shutdown_tasks(None, None, None), @@ -108,16 +237,21 @@ pub async fn shutdown_pageserver( Duration::from_secs(1), ) .await; - info!("Shut down successfully completed"); + + info!("cancel & join walredo_extraordinary_shutdown_thread"); + walredo_extraordinary_shutdown_thread_cancel.cancel(); + walredo_extraordinary_shutdown_thread.join().unwrap(); + info!("walredo_extraordinary_shutdown_thread done"); + + info!( + elapsed_ms = started_at.elapsed().as_millis(), + "Shut down successfully completed" + ); std::process::exit(exit_code); } /// Per-tenant configuration file. -/// Full path: `tenants//config`. -pub(crate) const TENANT_CONFIG_NAME: &str = "config"; - -/// Per-tenant configuration file. -/// Full path: `tenants//config`. +/// Full path: `tenants//config-v1`. pub(crate) const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1"; /// Per-tenant copy of their remote heatmap, downloaded into the local @@ -136,13 +270,6 @@ pub(crate) const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit"; pub(crate) const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete"; -/// A marker file to prevent pageserver from loading a certain tenant on restart. 
-/// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding -/// `ignore` management API command, that expects the ignored tenant to be properly loaded -/// into pageserver's memory before being ignored. -/// Full path: `tenants//___ignored_tenant`. -pub const IGNORED_TENANT_FILE_NAME: &str = "___ignored_tenant"; - pub fn is_temporary(path: &Utf8Path) -> bool { match path.file_name() { Some(name) => name.ends_with(TEMP_FILE_SUFFIX), diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index e8a1e063c5..0a1a22b6e8 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -8,7 +8,7 @@ use metrics::{ }; use once_cell::sync::Lazy; use pageserver_api::shard::TenantShardId; -use strum::{EnumCount, IntoEnumIterator, VariantNames}; +use strum::{EnumCount, VariantNames}; use strum_macros::{EnumVariantNames, IntoStaticStr}; use tracing::warn; use utils::id::TimelineId; @@ -53,9 +53,6 @@ pub(crate) enum StorageTimeOperation { #[strum(serialize = "find gc cutoffs")] FindGcCutoffs, - - #[strum(serialize = "create tenant")] - CreateTenant, } pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy = Lazy::new(|| { @@ -145,14 +142,6 @@ impl ReconstructTimeMetrics { } } -pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy = Lazy::new(|| { - register_int_counter!( - "pageserver_materialized_cache_hits_direct_total", - "Number of cache hits from materialized page cache without redo", - ) - .expect("failed to define a metric") -}); - pub(crate) struct ReconstructDataTimeMetrics { singular: Histogram, vectored: Histogram, @@ -182,14 +171,6 @@ pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy = } }); -pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy = Lazy::new(|| { - register_int_counter!( - "pageserver_materialized_cache_hits_total", - "Number of cache hits from materialized page cache", - ) - .expect("failed to define a metric") -}); - pub(crate) struct GetVectoredLatency { map: EnumMap>, } @@ -298,12 +279,8 @@ 
pub(crate) static SCAN_LATENCY: Lazy = Lazy::new(|| { }); pub(crate) struct PageCacheMetricsForTaskKind { - pub read_accesses_materialized_page: IntCounter, pub read_accesses_immutable: IntCounter, - pub read_hits_immutable: IntCounter, - pub read_hits_materialized_page_exact: IntCounter, - pub read_hits_materialized_page_older_lsn: IntCounter, } pub(crate) struct PageCacheMetrics { @@ -336,16 +313,6 @@ pub(crate) static PAGE_CACHE: Lazy = Lazy::new(|| PageCacheMet let content_kind = ::from_usize(content_kind); let content_kind: &'static str = content_kind.into(); PageCacheMetricsForTaskKind { - read_accesses_materialized_page: { - PAGE_CACHE_READ_ACCESSES - .get_metric_with_label_values(&[ - task_kind, - "materialized_page", - content_kind, - ]) - .unwrap() - }, - read_accesses_immutable: { PAGE_CACHE_READ_ACCESSES .get_metric_with_label_values(&[task_kind, "immutable", content_kind]) @@ -357,28 +324,6 @@ pub(crate) static PAGE_CACHE: Lazy = Lazy::new(|| PageCacheMet .get_metric_with_label_values(&[task_kind, "immutable", content_kind, "-"]) .unwrap() }, - - read_hits_materialized_page_exact: { - PAGE_CACHE_READ_HITS - .get_metric_with_label_values(&[ - task_kind, - "materialized_page", - content_kind, - "exact", - ]) - .unwrap() - }, - - read_hits_materialized_page_older_lsn: { - PAGE_CACHE_READ_HITS - .get_metric_with_label_values(&[ - task_kind, - "materialized_page", - content_kind, - "older_lsn", - ]) - .unwrap() - }, } })) })), @@ -394,7 +339,6 @@ pub(crate) struct PageCacheSizeMetrics { pub max_bytes: UIntGauge, pub current_bytes_immutable: UIntGauge, - pub current_bytes_materialized_page: UIntGauge, } static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy = Lazy::new(|| { @@ -420,11 +364,6 @@ pub(crate) static PAGE_CACHE_SIZE: Lazy = .get_metric_with_label_values(&["immutable"]) .unwrap() }, - current_bytes_materialized_page: { - PAGE_CACHE_SIZE_CURRENT_BYTES - .get_metric_with_label_values(&["materialized_page"]) - .unwrap() - }, }); pub(crate) mod 
page_cache_eviction_metrics { @@ -525,6 +464,49 @@ static LAST_RECORD_LSN: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static PITR_HISTORY_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_pitr_history_size", + "Data written since PITR cutoff on this timeline", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + +#[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)] +#[strum(serialize_all = "kebab_case")] +pub(crate) enum MetricLayerKind { + Delta, + Image, +} + +static TIMELINE_LAYER_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_layer_bytes", + "Sum of layer physical sizes in bytes", + &["tenant_id", "shard_id", "timeline_id", "kind"] + ) + .expect("failed to define a metric") +}); + +static TIMELINE_LAYER_COUNT: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_layer_count", + "Number of layers that exist", + &["tenant_id", "shard_id", "timeline_id", "kind"] + ) + .expect("failed to define a metric") +}); + +static TIMELINE_ARCHIVE_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_archive_size", + "Timeline's logical size if it is considered eligible for archival (outside PITR window), else zero", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + static STANDBY_HORIZON: Lazy = Lazy::new(|| { register_int_gauge_vec!( "pageserver_standby_horizon", @@ -537,6 +519,15 @@ static STANDBY_HORIZON: Lazy = Lazy::new(|| { static RESIDENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_resident_physical_size", + "The size of the layer files present in the pageserver's filesystem, for attached locations.", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + +static VISIBLE_PHYSICAL_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_visible_physical_size", "The size of the 
layer files present in the pageserver's filesystem.", &["tenant_id", "shard_id", "timeline_id"] ) @@ -603,6 +594,63 @@ static AUX_FILE_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static VALID_LSN_LEASE_COUNT: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_valid_lsn_lease_count", + "The number of valid leases after refreshing gc info.", + &["tenant_id", "shard_id", "timeline_id"], + ) + .expect("failed to define a metric") +}); + +pub(crate) static CIRCUIT_BREAKERS_BROKEN: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_circuit_breaker_broken", + "How many times a circuit breaker has broken" + ) + .expect("failed to define a metric") +}); + +pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_circuit_breaker_unbroken", + "How many times a circuit breaker has been un-broken (recovered)" + ) + .expect("failed to define a metric") +}); + +pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_compression_image_in_bytes_total", + "Size of data written into image layers before compression" + ) + .expect("failed to define a metric") +}); + +pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_compression_image_in_bytes_considered", + "Size of potentially compressible data written into image layers before compression" + ) + .expect("failed to define a metric") +}); + +pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_compression_image_in_bytes_chosen", + "Size of data whose compressed form was written into image layers" + ) + .expect("failed to define a metric") +}); + +pub(crate) static COMPRESSION_IMAGE_OUTPUT_BYTES: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_compression_image_out_bytes_total", + "Size of compressed image layer written" + ) + .expect("failed 
to define a metric") +}); + pub(crate) mod initial_logical_size { use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; use once_cell::sync::Lazy; @@ -1128,21 +1176,12 @@ pub(crate) mod virtual_file_io_engine { }); } -#[derive(Debug)] -struct GlobalAndPerTimelineHistogram { - global: Histogram, - per_tenant_timeline: Histogram, -} - -impl GlobalAndPerTimelineHistogram { - fn observe(&self, value: f64) { - self.global.observe(value); - self.per_tenant_timeline.observe(value); - } -} - struct GlobalAndPerTimelineHistogramTimer<'a, 'c> { - h: &'a GlobalAndPerTimelineHistogram, + global_metric: &'a Histogram, + + // Optional because not all op types are tracked per-timeline + timeline_metric: Option<&'a Histogram>, + ctx: &'c RequestContext, start: std::time::Instant, op: SmgrQueryType, @@ -1173,7 +1212,10 @@ impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> { elapsed } }; - self.h.observe(ex_throttled.as_secs_f64()); + self.global_metric.observe(ex_throttled.as_secs_f64()); + if let Some(timeline_metric) = self.timeline_metric { + timeline_metric.observe(ex_throttled.as_secs_f64()); + } } } @@ -1198,7 +1240,8 @@ pub enum SmgrQueryType { #[derive(Debug)] pub(crate) struct SmgrQueryTimePerTimeline { - metrics: [GlobalAndPerTimelineHistogram; SmgrQueryType::COUNT], + global_metrics: [Histogram; SmgrQueryType::COUNT], + per_timeline_getpage: Histogram, } static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy = Lazy::new(|| { @@ -1276,27 +1319,32 @@ impl SmgrQueryTimePerTimeline { let tenant_id = tenant_shard_id.tenant_id.to_string(); let shard_slug = format!("{}", tenant_shard_id.shard_slug()); let timeline_id = timeline_id.to_string(); - let metrics = std::array::from_fn(|i| { + let global_metrics = std::array::from_fn(|i| { let op = SmgrQueryType::from_repr(i).unwrap(); - let global = SMGR_QUERY_TIME_GLOBAL + SMGR_QUERY_TIME_GLOBAL .get_metric_with_label_values(&[op.into()]) - .unwrap(); - let per_tenant_timeline = 
SMGR_QUERY_TIME_PER_TENANT_TIMELINE - .get_metric_with_label_values(&[op.into(), &tenant_id, &shard_slug, &timeline_id]) - .unwrap(); - GlobalAndPerTimelineHistogram { - global, - per_tenant_timeline, - } + .unwrap() }); - Self { metrics } + + let per_timeline_getpage = SMGR_QUERY_TIME_PER_TENANT_TIMELINE + .get_metric_with_label_values(&[ + SmgrQueryType::GetPageAtLsn.into(), + &tenant_id, + &shard_slug, + &timeline_id, + ]) + .unwrap(); + Self { + global_metrics, + per_timeline_getpage, + } } pub(crate) fn start_timer<'c: 'a, 'a>( &'a self, op: SmgrQueryType, ctx: &'c RequestContext, - ) -> impl Drop + '_ { - let metric = &self.metrics[op as usize]; + ) -> Option { + let global_metric = &self.global_metrics[op as usize]; let start = Instant::now(); match ctx.micros_spent_throttled.open() { Ok(()) => (), @@ -1315,12 +1363,20 @@ impl SmgrQueryTimePerTimeline { }); } } - GlobalAndPerTimelineHistogramTimer { - h: metric, + + let timeline_metric = if matches!(op, SmgrQueryType::GetPageAtLsn) { + Some(&self.per_timeline_getpage) + } else { + None + }; + + Some(GlobalAndPerTimelineHistogramTimer { + global_metric, + timeline_metric, ctx, start, op, - } + }) } } @@ -1367,17 +1423,9 @@ mod smgr_query_time_tests { let get_counts = || { let global: u64 = ops .iter() - .map(|op| metrics.metrics[*op as usize].global.get_sample_count()) + .map(|op| metrics.global_metrics[*op as usize].get_sample_count()) .sum(); - let per_tenant_timeline: u64 = ops - .iter() - .map(|op| { - metrics.metrics[*op as usize] - .per_tenant_timeline - .get_sample_count() - }) - .sum(); - (global, per_tenant_timeline) + (global, metrics.per_timeline_getpage.get_sample_count()) }; let (pre_global, pre_per_tenant_timeline) = get_counts(); @@ -1388,7 +1436,12 @@ mod smgr_query_time_tests { drop(timer); let (post_global, post_per_tenant_timeline) = get_counts(); - assert_eq!(post_per_tenant_timeline, 1); + if matches!(op, super::SmgrQueryType::GetPageAtLsn) { + // getpage ops are tracked per-timeline, 
others aren't + assert_eq!(post_per_tenant_timeline, 1); + } else { + assert_eq!(post_per_tenant_timeline, 0); + } assert!(post_global > pre_global); } } @@ -1405,17 +1458,23 @@ static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| { .map(|ms| (ms as f64) / 1000.0) }); -pub(crate) struct BasebackupQueryTime(HistogramVec); +pub(crate) struct BasebackupQueryTime { + ok: Histogram, + error: Histogram, +} + pub(crate) static BASEBACKUP_QUERY_TIME: Lazy = Lazy::new(|| { - BasebackupQueryTime({ - register_histogram_vec!( - "pageserver_basebackup_query_seconds", - "Histogram of basebackup queries durations, by result type", - &["result"], - COMPUTE_STARTUP_BUCKETS.to_vec(), - ) - .expect("failed to define a metric") - }) + let vec = register_histogram_vec!( + "pageserver_basebackup_query_seconds", + "Histogram of basebackup queries durations, by result type", + &["result"], + COMPUTE_STARTUP_BUCKETS.to_vec(), + ) + .expect("failed to define a metric"); + BasebackupQueryTime { + ok: vec.get_metric_with_label_values(&["ok"]).unwrap(), + error: vec.get_metric_with_label_values(&["error"]).unwrap(), + } }); pub(crate) struct BasebackupQueryTimeOngoingRecording<'a, 'c> { @@ -1470,25 +1529,62 @@ impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> { elapsed } }; - let label_value = if res.is_ok() { "ok" } else { "error" }; - let metric = self - .parent - .0 - .get_metric_with_label_values(&[label_value]) - .unwrap(); + let metric = if res.is_ok() { + &self.parent.ok + } else { + &self.parent.error + }; metric.observe(ex_throttled.as_secs_f64()); } } -pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy = Lazy::new(|| { - register_int_gauge_vec!( - "pageserver_live_connections", - "Number of live network connections", +pub(crate) static LIVE_CONNECTIONS: Lazy = Lazy::new(|| { + register_int_counter_pair_vec!( + "pageserver_live_connections_started", + "Number of network connections that we started handling", + "pageserver_live_connections_finished", + "Number of network 
connections that we finished handling", &["pageserver_connection_kind"] ) .expect("failed to define a metric") }); +#[derive(Clone, Copy, enum_map::Enum, IntoStaticStr)] +pub(crate) enum ComputeCommandKind { + PageStreamV2, + PageStream, + Basebackup, + Fullbackup, + LeaseLsn, +} + +pub(crate) struct ComputeCommandCounters { + map: EnumMap, +} + +pub(crate) static COMPUTE_COMMANDS_COUNTERS: Lazy = Lazy::new(|| { + let inner = register_int_counter_vec!( + "pageserver_compute_commands", + "Number of compute -> pageserver commands processed", + &["command"] + ) + .expect("failed to define a metric"); + + ComputeCommandCounters { + map: EnumMap::from_array(std::array::from_fn(|i| { + let command = ::from_usize(i); + let command_str: &'static str = command.into(); + inner.with_label_values(&[command_str]) + })), + } +}); + +impl ComputeCommandCounters { + pub(crate) fn for_command(&self, command: ComputeCommandKind) -> &IntCounter { + &self.map[command] + } +} + // remote storage metrics static REMOTE_TIMELINE_CLIENT_CALLS: Lazy = Lazy::new(|| { @@ -1698,6 +1794,24 @@ pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| { } }); +pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_secondary_resident_physical_size", + "The size of the layer files present in the pageserver's filesystem, for secondary locations.", + &["tenant_id", "shard_id"] + ) + .expect("failed to define a metric") +}); + +pub(crate) static SECONDARY_HEATMAP_TOTAL_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_secondary_heatmap_total_size", + "The total size in bytes of all layers in the most recently downloaded heatmap.", + &["tenant_id", "shard_id"] + ) + .expect("failed to define a metric") +}); + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum RemoteOpKind { Upload, @@ -1748,16 +1862,64 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy = Lazy::new(|| { .expect("Failed to register tenant_task_events 
metric") }); -pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE: Lazy = Lazy::new(|| { - register_int_counter_pair_vec!( - "pageserver_background_loop_semaphore_wait_start_count", - "Counter for background loop concurrency-limiting semaphore acquire calls started", - "pageserver_background_loop_semaphore_wait_finish_count", - "Counter for background loop concurrency-limiting semaphore acquire calls finished", - &["task"], - ) - .unwrap() -}); +pub struct BackgroundLoopSemaphoreMetrics { + counters: EnumMap, + durations: EnumMap, +} + +pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy = Lazy::new( + || { + let counters = register_int_counter_pair_vec!( + "pageserver_background_loop_semaphore_wait_start_count", + "Counter for background loop concurrency-limiting semaphore acquire calls started", + "pageserver_background_loop_semaphore_wait_finish_count", + "Counter for background loop concurrency-limiting semaphore acquire calls finished", + &["task"], + ) + .unwrap(); + + let durations = register_counter_vec!( + "pageserver_background_loop_semaphore_wait_duration_seconds", + "Sum of wall clock time spent waiting on the background loop concurrency-limiting semaphore acquire calls", + &["task"], + ) + .unwrap(); + + BackgroundLoopSemaphoreMetrics { + counters: enum_map::EnumMap::from_array(std::array::from_fn(|i| { + let kind = ::from_usize(i); + counters.with_label_values(&[kind.into()]) + })), + durations: enum_map::EnumMap::from_array(std::array::from_fn(|i| { + let kind = ::from_usize(i); + durations.with_label_values(&[kind.into()]) + })), + } + }, +); + +impl BackgroundLoopSemaphoreMetrics { + pub(crate) fn measure_acquisition(&self, task: BackgroundLoopKind) -> impl Drop + '_ { + struct Record<'a> { + metrics: &'a BackgroundLoopSemaphoreMetrics, + task: BackgroundLoopKind, + _counter_guard: metrics::IntCounterPairGuard, + start: Instant, + } + impl Drop for Record<'_> { + fn drop(&mut self) { + let elapsed = self.start.elapsed().as_secs_f64(); + 
self.metrics.durations[self.task].inc_by(elapsed); + } + } + Record { + metrics: self, + task, + _counter_guard: self.counters[task].guard(), + start: Instant::now(), + } + } +} pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy = Lazy::new(|| { register_int_counter_vec!( @@ -2100,14 +2262,23 @@ pub(crate) struct TimelineMetrics { pub garbage_collect_histo: StorageTimeMetrics, pub find_gc_cutoffs_histo: StorageTimeMetrics, pub last_record_gauge: IntGauge, + pub pitr_history_size: UIntGauge, + pub archival_size: UIntGauge, + pub(crate) layer_size_image: UIntGauge, + pub(crate) layer_count_image: UIntGauge, + pub(crate) layer_size_delta: UIntGauge, + pub(crate) layer_count_delta: UIntGauge, pub standby_horizon_gauge: IntGauge, pub resident_physical_size_gauge: UIntGauge, + pub visible_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size pub current_logical_size_gauge: UIntGauge, pub aux_file_size_gauge: IntGauge, pub directory_entries_count_gauge: Lazy UIntGauge>>, pub evictions: IntCounter, pub evictions_with_low_residence_duration: std::sync::RwLock, + /// Number of valid LSN leases. 
+ pub valid_lsn_lease_count_gauge: UIntGauge, shutdown: std::sync::atomic::AtomicBool, } @@ -2171,12 +2342,60 @@ impl TimelineMetrics { let last_record_gauge = LAST_RECORD_LSN .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + + let pitr_history_size = PITR_HISTORY_SIZE + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); + + let archival_size = TIMELINE_ARCHIVE_SIZE + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); + + let layer_size_image = TIMELINE_LAYER_SIZE + .get_metric_with_label_values(&[ + &tenant_id, + &shard_id, + &timeline_id, + MetricLayerKind::Image.into(), + ]) + .unwrap(); + + let layer_count_image = TIMELINE_LAYER_COUNT + .get_metric_with_label_values(&[ + &tenant_id, + &shard_id, + &timeline_id, + MetricLayerKind::Image.into(), + ]) + .unwrap(); + + let layer_size_delta = TIMELINE_LAYER_SIZE + .get_metric_with_label_values(&[ + &tenant_id, + &shard_id, + &timeline_id, + MetricLayerKind::Delta.into(), + ]) + .unwrap(); + + let layer_count_delta = TIMELINE_LAYER_COUNT + .get_metric_with_label_values(&[ + &tenant_id, + &shard_id, + &timeline_id, + MetricLayerKind::Delta.into(), + ]) + .unwrap(); + let standby_horizon_gauge = STANDBY_HORIZON .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + let visible_physical_size_gauge = VISIBLE_PHYSICAL_SIZE + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); // TODO: we shouldn't expose this metric let current_logical_size_gauge = CURRENT_LOGICAL_SIZE .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) @@ -2206,6 +2425,10 @@ impl TimelineMetrics { let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder .build(&tenant_id, &shard_id, &timeline_id); + let 
valid_lsn_lease_count_gauge = VALID_LSN_LEASE_COUNT + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); + TimelineMetrics { tenant_id, shard_id, @@ -2219,8 +2442,15 @@ impl TimelineMetrics { find_gc_cutoffs_histo, load_layer_map_histo, last_record_gauge, + pitr_history_size, + archival_size, + layer_size_image, + layer_count_image, + layer_size_delta, + layer_count_delta, standby_horizon_gauge, resident_physical_size_gauge, + visible_physical_size_gauge, current_logical_size_gauge, aux_file_size_gauge, directory_entries_count_gauge, @@ -2228,6 +2458,7 @@ impl TimelineMetrics { evictions_with_low_residence_duration: std::sync::RwLock::new( evictions_with_low_residence_duration, ), + valid_lsn_lease_count_gauge, shutdown: std::sync::atomic::AtomicBool::default(), } } @@ -2271,12 +2502,43 @@ impl TimelineMetrics { RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get()); let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); } + let _ = VISIBLE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) { let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]); } + + let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); + + let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + MetricLayerKind::Image.into(), + ]); + let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + MetricLayerKind::Image.into(), + ]); + let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + MetricLayerKind::Delta.into(), + ]); + let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, 
+ MetricLayerKind::Delta.into(), + ]); + let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]); self.evictions_with_low_residence_duration .write() @@ -2307,14 +2569,12 @@ impl TimelineMetrics { let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]); } - for op in SmgrQueryType::iter() { - let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[ - op.into(), - tenant_id, - shard_id, - timeline_id, - ]); - } + let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[ + SmgrQueryType::GetPageAtLsn.into(), + tenant_id, + shard_id, + timeline_id, + ]); } } @@ -2341,6 +2601,7 @@ use std::time::{Duration, Instant}; use crate::context::{PageContentKind, RequestContext}; use crate::task_mgr::TaskKind; use crate::tenant::mgr::TenantSlot; +use crate::tenant::tasks::BackgroundLoopKind; /// Maintain a per timeline gauge in addition to the global gauge. pub(crate) struct PerTimelineRemotePhysicalSizeGauge { @@ -2918,13 +3179,11 @@ pub fn preinitialize_metrics() { // FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of // order: // - global metrics reside in a Lazy - // - access via crate::metrics::PS_METRICS.materialized_page_cache_hit.inc() + // - access via crate::metrics::PS_METRICS.some_metric.inc() // - could move the statics into TimelineMetrics::new()? 
// counters [ - &MATERIALIZED_PAGE_CACHE_HIT, - &MATERIALIZED_PAGE_CACHE_HIT_DIRECT, &UNEXPECTED_ONDEMAND_DOWNLOADS, &WALRECEIVER_STARTED_CONNECTIONS, &WALRECEIVER_BROKER_UPDATES, @@ -2934,6 +3193,8 @@ pub fn preinitialize_metrics() { &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_SUCCESSES, &REMOTE_ONDEMAND_DOWNLOADED_LAYERS, &REMOTE_ONDEMAND_DOWNLOADED_BYTES, + &CIRCUIT_BREAKERS_BROKEN, + &CIRCUIT_BREAKERS_UNBROKEN, ] .into_iter() .for_each(|c| { @@ -2986,4 +3247,6 @@ pub fn preinitialize_metrics() { // Custom Lazy::force(&RECONSTRUCT_TIME); Lazy::force(&tenant_throttling::TIMELINE_GET); + Lazy::force(&BASEBACKUP_QUERY_TIME); + Lazy::force(&COMPUTE_COMMANDS_COUNTERS); } diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 529fb9bb07..f386c825b8 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -17,7 +17,6 @@ //! //! Two types of pages are supported: //! -//! * **Materialized pages**, filled & used by page reconstruction //! * **Immutable File pages**, filled & used by [`crate::tenant::block_io`] and [`crate::tenant::ephemeral_file`]. //! //! Note that [`crate::tenant::ephemeral_file::EphemeralFile`] is generally mutable, but, it's append-only. @@ -28,9 +27,6 @@ //! Page cache maps from a cache key to a buffer slot. //! The cache key uniquely identifies the piece of data that is being cached. //! -//! The cache key for **materialized pages** is [`TenantShardId`], [`TimelineId`], [`Key`], and [`Lsn`]. -//! Use [`PageCache::memorize_materialized_page`] and [`PageCache::lookup_materialized_page`] for fill & access. -//! //! The cache key for **immutable file** pages is [`FileId`] and a block number. //! Users of page cache that wish to page-cache an arbitrary (immutable!) on-disk file do the following: //! * Have a mechanism to deterministically associate the on-disk file with a [`FileId`]. 
@@ -82,13 +78,10 @@ use std::{ use anyhow::Context; use once_cell::sync::OnceCell; -use pageserver_api::shard::TenantShardId; -use utils::{id::TimelineId, lsn::Lsn}; use crate::{ context::RequestContext, metrics::{page_cache_eviction_metrics, PageCacheSizeMetrics}, - repository::Key, }; static PAGE_CACHE: OnceCell = OnceCell::new(); @@ -139,33 +132,7 @@ pub fn next_file_id() -> FileId { #[derive(Debug, PartialEq, Eq, Clone)] #[allow(clippy::enum_variant_names)] enum CacheKey { - MaterializedPage { - hash_key: MaterializedPageHashKey, - lsn: Lsn, - }, - ImmutableFilePage { - file_id: FileId, - blkno: u32, - }, -} - -#[derive(Debug, PartialEq, Eq, Hash, Clone)] -struct MaterializedPageHashKey { - /// Why is this TenantShardId rather than TenantId? - /// - /// Usually, the materialized value of a page@lsn is identical on any shard in the same tenant. However, this - /// this not the case for certain internally-generated pages (e.g. relation sizes). In future, we may make this - /// key smaller by omitting the shard, if we ensure that reads to such pages always skip the cache, or are - /// special-cased in some other way. - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, - key: Key, -} - -#[derive(Clone)] -struct Version { - lsn: Lsn, - slot_idx: usize, + ImmutableFilePage { file_id: FileId, blkno: u32 }, } struct Slot { @@ -236,17 +203,6 @@ impl SlotInner { } pub struct PageCache { - /// This contains the mapping from the cache key to buffer slot that currently - /// contains the page, if any. - /// - /// TODO: This is protected by a single lock. If that becomes a bottleneck, - /// this HashMap can be replaced with a more concurrent version, there are - /// plenty of such crates around. - /// - /// If you add support for caching different kinds of objects, each object kind - /// can have a separate mapping map, next to this field. 
- materialized_page_map: std::sync::RwLock>>, - immutable_page_map: std::sync::RwLock>, /// The actual buffers with their metadata. @@ -371,175 +327,14 @@ pub enum ReadBufResult<'a> { } impl PageCache { - // - // Section 1.1: Public interface functions for looking up and memorizing materialized page - // versions in the page cache - // - - /// Look up a materialized page version. - /// - /// The 'lsn' is an upper bound, this will return the latest version of - /// the given block, but not newer than 'lsn'. Returns the actual LSN of the - /// returned page. - pub async fn lookup_materialized_page( - &self, - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, - key: &Key, - lsn: Lsn, - ctx: &RequestContext, - ) -> Option<(Lsn, PageReadGuard)> { - let Ok(permit) = self.try_get_pinned_slot_permit().await else { - return None; - }; - - crate::metrics::PAGE_CACHE - .for_ctx(ctx) - .read_accesses_materialized_page - .inc(); - - let mut cache_key = CacheKey::MaterializedPage { - hash_key: MaterializedPageHashKey { - tenant_shard_id, - timeline_id, - key: *key, - }, - lsn, - }; - - if let Some(guard) = self - .try_lock_for_read(&mut cache_key, &mut Some(permit)) - .await - { - if let CacheKey::MaterializedPage { - hash_key: _, - lsn: available_lsn, - } = cache_key - { - if available_lsn == lsn { - crate::metrics::PAGE_CACHE - .for_ctx(ctx) - .read_hits_materialized_page_exact - .inc(); - } else { - crate::metrics::PAGE_CACHE - .for_ctx(ctx) - .read_hits_materialized_page_older_lsn - .inc(); - } - Some((available_lsn, guard)) - } else { - panic!("unexpected key type in slot"); - } - } else { - None - } - } - - /// - /// Store an image of the given page in the cache. 
- /// - pub async fn memorize_materialized_page( - &self, - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, - key: Key, - lsn: Lsn, - img: &[u8], - ) -> anyhow::Result<()> { - let cache_key = CacheKey::MaterializedPage { - hash_key: MaterializedPageHashKey { - tenant_shard_id, - timeline_id, - key, - }, - lsn, - }; - - let mut permit = Some(self.try_get_pinned_slot_permit().await?); - loop { - // First check if the key already exists in the cache. - if let Some(slot_idx) = self.search_mapping_exact(&cache_key) { - // The page was found in the mapping. Lock the slot, and re-check - // that it's still what we expected (because we don't released the mapping - // lock already, another thread could have evicted the page) - let slot = &self.slots[slot_idx]; - let inner = slot.inner.write().await; - if inner.key.as_ref() == Some(&cache_key) { - slot.inc_usage_count(); - debug_assert!( - { - let guard = inner.permit.lock().unwrap(); - guard.upgrade().is_none() - }, - "we hold a write lock, so, no one else should have a permit" - ); - debug_assert_eq!(inner.buf.len(), img.len()); - // We already had it in cache. Another thread must've put it there - // concurrently. Check that it had the same contents that we - // replayed. - assert!(inner.buf == img); - return Ok(()); - } - } - debug_assert!(permit.is_some()); - - // Not found. Find a victim buffer - let (slot_idx, mut inner) = self - .find_victim(permit.as_ref().unwrap()) - .await - .context("Failed to find evict victim")?; - - // Insert mapping for this. At this point, we may find that another - // thread did the same thing concurrently. In that case, we evicted - // our victim buffer unnecessarily. Put it into the free list and - // continue with the slot that the other thread chose. - if let Some(_existing_slot_idx) = self.try_insert_mapping(&cache_key, slot_idx) { - // TODO: put to free list - - // We now just loop back to start from beginning. 
This is not - // optimal, we'll perform the lookup in the mapping again, which - // is not really necessary because we already got - // 'existing_slot_idx'. But this shouldn't happen often enough - // to matter much. - continue; - } - - // Make the slot ready - let slot = &self.slots[slot_idx]; - inner.key = Some(cache_key.clone()); - slot.set_usage_count(1); - // Create a write guard for the slot so we go through the expected motions. - debug_assert!( - { - let guard = inner.permit.lock().unwrap(); - guard.upgrade().is_none() - }, - "we hold a write lock, so, no one else should have a permit" - ); - let mut write_guard = PageWriteGuard { - state: PageWriteGuardState::Invalid { - _permit: permit.take().unwrap(), - inner, - }, - }; - write_guard.copy_from_slice(img); - let _ = write_guard.mark_valid(); - return Ok(()); - } - } - - // Section 1.2: Public interface functions for working with immutable file pages. - pub async fn read_immutable_buf( &self, file_id: FileId, blkno: u32, ctx: &RequestContext, ) -> anyhow::Result { - let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno }; - - self.lock_for_read(&mut cache_key, ctx).await + self.lock_for_read(&(CacheKey::ImmutableFilePage { file_id, blkno }), ctx) + .await } // @@ -573,19 +368,11 @@ impl PageCache { /// Look up a page in the cache. /// - /// If the search criteria is not exact, *cache_key is updated with the key - /// for exact key of the returned page. (For materialized pages, that means - /// that the LSN in 'cache_key' is updated with the LSN of the returned page - /// version.) - /// - /// If no page is found, returns None and *cache_key is left unmodified. - /// async fn try_lock_for_read( &self, - cache_key: &mut CacheKey, + cache_key: &CacheKey, permit: &mut Option, ) -> Option { - let cache_key_orig = cache_key.clone(); if let Some(slot_idx) = self.search_mapping(cache_key) { // The page was found in the mapping. 
Lock the slot, and re-check // that it's still what we expected (because we released the mapping @@ -598,9 +385,6 @@ impl PageCache { _permit: inner.coalesce_readers_permit(permit.take().unwrap()), slot_guard: inner, }); - } else { - // search_mapping might have modified the search key; restore it. - *cache_key = cache_key_orig; } } None @@ -637,15 +421,12 @@ impl PageCache { /// async fn lock_for_read( &self, - cache_key: &mut CacheKey, + cache_key: &CacheKey, ctx: &RequestContext, ) -> anyhow::Result { let mut permit = Some(self.try_get_pinned_slot_permit().await?); let (read_access, hit) = match cache_key { - CacheKey::MaterializedPage { .. } => { - unreachable!("Materialized pages use lookup_materialized_page") - } CacheKey::ImmutableFilePage { .. } => ( &crate::metrics::PAGE_CACHE .for_ctx(ctx) @@ -717,52 +498,15 @@ impl PageCache { /// Search for a page in the cache using the given search key. /// - /// Returns the slot index, if any. If the search criteria is not exact, - /// *cache_key is updated with the actual key of the found page. + /// Returns the slot index, if any. /// /// NOTE: We don't hold any lock on the mapping on return, so the slot might /// get recycled for an unrelated page immediately after this function /// returns. The caller is responsible for re-checking that the slot still /// contains the page with the same key before using it. 
/// - fn search_mapping(&self, cache_key: &mut CacheKey) -> Option { + fn search_mapping(&self, cache_key: &CacheKey) -> Option { match cache_key { - CacheKey::MaterializedPage { hash_key, lsn } => { - let map = self.materialized_page_map.read().unwrap(); - let versions = map.get(hash_key)?; - - let version_idx = match versions.binary_search_by_key(lsn, |v| v.lsn) { - Ok(version_idx) => version_idx, - Err(0) => return None, - Err(version_idx) => version_idx - 1, - }; - let version = &versions[version_idx]; - *lsn = version.lsn; - Some(version.slot_idx) - } - CacheKey::ImmutableFilePage { file_id, blkno } => { - let map = self.immutable_page_map.read().unwrap(); - Some(*map.get(&(*file_id, *blkno))?) - } - } - } - - /// Search for a page in the cache using the given search key. - /// - /// Like 'search_mapping, but performs an "exact" search. Used for - /// allocating a new buffer. - fn search_mapping_exact(&self, key: &CacheKey) -> Option { - match key { - CacheKey::MaterializedPage { hash_key, lsn } => { - let map = self.materialized_page_map.read().unwrap(); - let versions = map.get(hash_key)?; - - if let Ok(version_idx) = versions.binary_search_by_key(lsn, |v| v.lsn) { - Some(versions[version_idx].slot_idx) - } else { - None - } - } CacheKey::ImmutableFilePage { file_id, blkno } => { let map = self.immutable_page_map.read().unwrap(); Some(*map.get(&(*file_id, *blkno))?) 
@@ -775,27 +519,6 @@ impl PageCache { /// fn remove_mapping(&self, old_key: &CacheKey) { match old_key { - CacheKey::MaterializedPage { - hash_key: old_hash_key, - lsn: old_lsn, - } => { - let mut map = self.materialized_page_map.write().unwrap(); - if let Entry::Occupied(mut old_entry) = map.entry(old_hash_key.clone()) { - let versions = old_entry.get_mut(); - - if let Ok(version_idx) = versions.binary_search_by_key(old_lsn, |v| v.lsn) { - versions.remove(version_idx); - self.size_metrics - .current_bytes_materialized_page - .sub_page_sz(1); - if versions.is_empty() { - old_entry.remove_entry(); - } - } - } else { - panic!("could not find old key in mapping") - } - } CacheKey::ImmutableFilePage { file_id, blkno } => { let mut map = self.immutable_page_map.write().unwrap(); map.remove(&(*file_id, *blkno)) @@ -812,30 +535,6 @@ impl PageCache { /// of the existing mapping and leaves it untouched. fn try_insert_mapping(&self, new_key: &CacheKey, slot_idx: usize) -> Option { match new_key { - CacheKey::MaterializedPage { - hash_key: new_key, - lsn: new_lsn, - } => { - let mut map = self.materialized_page_map.write().unwrap(); - let versions = map.entry(new_key.clone()).or_default(); - match versions.binary_search_by_key(new_lsn, |v| v.lsn) { - Ok(version_idx) => Some(versions[version_idx].slot_idx), - Err(version_idx) => { - versions.insert( - version_idx, - Version { - lsn: *new_lsn, - slot_idx, - }, - ); - self.size_metrics - .current_bytes_materialized_page - .add_page_sz(1); - None - } - } - } - CacheKey::ImmutableFilePage { file_id, blkno } => { let mut map = self.immutable_page_map.write().unwrap(); match map.entry((*file_id, *blkno)) { @@ -949,7 +648,6 @@ impl PageCache { let size_metrics = &crate::metrics::PAGE_CACHE_SIZE; size_metrics.max_bytes.set_page_sz(num_pages); size_metrics.current_bytes_immutable.set_page_sz(0); - size_metrics.current_bytes_materialized_page.set_page_sz(0); let slots = page_buffer .chunks_exact_mut(PAGE_SZ) @@ -968,7 +666,6 @@ impl 
PageCache { .collect(); Self { - materialized_page_map: Default::default(), immutable_page_map: Default::default(), slots, next_evict_slot: AtomicUsize::new(0), diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index ebc23e8945..81294291a9 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -4,11 +4,8 @@ use anyhow::Context; use async_compression::tokio::write::GzipEncoder; use bytes::Buf; -use bytes::Bytes; -use futures::stream::FuturesUnordered; -use futures::Stream; -use futures::StreamExt; -use pageserver_api::key::Key; +use futures::FutureExt; +use once_cell::sync::OnceCell; use pageserver_api::models::TenantState; use pageserver_api::models::{ PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, @@ -17,31 +14,23 @@ use pageserver_api::models::{ PagestreamGetSlruSegmentRequest, PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest, PagestreamNblocksResponse, PagestreamProtocolVersion, }; -use pageserver_api::shard::ShardIndex; -use pageserver_api::shard::ShardNumber; use pageserver_api::shard::TenantShardId; use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError}; use pq_proto::framed::ConnectionError; use pq_proto::FeStartupPacket; use pq_proto::{BeMessage, FeMessage, RowDescriptor}; use std::borrow::Cow; -use std::collections::HashMap; use std::io; -use std::net::TcpListener; -use std::pin::pin; use std::str; use std::str::FromStr; use std::sync::Arc; -use std::time::Duration; -use std::time::Instant; use std::time::SystemTime; +use std::time::{Duration, Instant}; use tokio::io::AsyncWriteExt; use tokio::io::{AsyncRead, AsyncWrite}; -use tokio_util::io::StreamReader; +use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::id::ConnectionId; -use utils::sync::gate::GateGuard; use utils::{ auth::{Claims, Scope, SwappableJwtAuth}, id::{TenantId, TimelineId}, @@ -52,147 +41,150 @@ use utils::{ use 
crate::auth::check_permission; use crate::basebackup; use crate::basebackup::BasebackupError; +use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; -use crate::import_datadir::import_wal_from_tar; use crate::metrics; -use crate::metrics::LIVE_CONNECTIONS_COUNT; +use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS}; use crate::pgdatadir_mapping::Version; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; -use crate::task_mgr; use crate::task_mgr::TaskKind; -use crate::tenant::mgr::GetActiveTenantError; -use crate::tenant::mgr::GetTenantError; -use crate::tenant::mgr::ShardResolveResult; +use crate::task_mgr::{self, COMPUTE_REQUEST_RUNTIME}; use crate::tenant::mgr::ShardSelector; use crate::tenant::mgr::TenantManager; -use crate::tenant::timeline::FlushLayerError; -use crate::tenant::timeline::WaitLsnError; +use crate::tenant::mgr::{GetActiveTenantError, GetTenantError, ShardResolveResult}; +use crate::tenant::timeline::{self, WaitLsnError}; use crate::tenant::GetTimelineError; use crate::tenant::PageReconstructError; -use crate::tenant::Tenant; use crate::tenant::Timeline; -use crate::trace::Tracer; use pageserver_api::key::rel_block_to_key; use pageserver_api::reltag::SlruKind; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use postgres_ffi::BLCKSZ; -// How long we may wait for a [`TenantSlot::InProgress`]` and/or a [`Tenant`] which -// is not yet in state [`TenantState::Active`]. +/// How long we may wait for a [`crate::tenant::mgr::TenantSlot::InProgress`]` and/or a [`crate::tenant::Tenant`] which +/// is not yet in state [`TenantState::Active`]. +/// +/// NB: this is a different value than [`crate::http::routes::ACTIVE_TENANT_TIMEOUT`]. const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000); -/// Read the end of a tar archive. 
-/// -/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each. -/// `tokio_tar` already read the first such block. Read the second all-zeros block, -/// and check that there is no more data after the EOF marker. -/// -/// 'tar' command can also write extra blocks of zeros, up to a record -/// size, controlled by the --record-size argument. Ignore them too. -async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()> { - use tokio::io::AsyncReadExt; - let mut buf = [0u8; 512]; +/////////////////////////////////////////////////////////////////////////////// - // Read the all-zeros block, and verify it - let mut total_bytes = 0; - while total_bytes < 512 { - let nbytes = reader.read(&mut buf[total_bytes..]).await?; - total_bytes += nbytes; - if nbytes == 0 { - break; - } - } - if total_bytes < 512 { - anyhow::bail!("incomplete or invalid tar EOF marker"); - } - if !buf.iter().all(|&x| x == 0) { - anyhow::bail!("invalid tar EOF marker"); - } - - // Drain any extra zero-blocks after the EOF marker - let mut trailing_bytes = 0; - let mut seen_nonzero_bytes = false; - loop { - let nbytes = reader.read(&mut buf).await?; - trailing_bytes += nbytes; - if !buf.iter().all(|&x| x == 0) { - seen_nonzero_bytes = true; - } - if nbytes == 0 { - break; - } - } - if seen_nonzero_bytes { - anyhow::bail!("unexpected non-zero bytes after the tar archive"); - } - if trailing_bytes % 512 != 0 { - anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive"); - } - Ok(()) +pub struct Listener { + cancel: CancellationToken, + /// Cancel the listener task through `listen_cancel` to shut down the listener + /// and get a handle on the existing connections. 
+ task: JoinHandle, } -/////////////////////////////////////////////////////////////////////////////// +pub struct Connections { + cancel: CancellationToken, + tasks: tokio::task::JoinSet, +} + +pub fn spawn( + conf: &'static PageServerConf, + tenant_manager: Arc, + pg_auth: Option>, + tcp_listener: tokio::net::TcpListener, +) -> Listener { + let cancel = CancellationToken::new(); + let libpq_ctx = RequestContext::todo_child( + TaskKind::LibpqEndpointListener, + // listener task shouldn't need to download anything. (We will + // create a separate sub-contexts for each connection, with their + // own download behavior. This context is used only to listen and + // accept connections.) + DownloadBehavior::Error, + ); + let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( + "libpq listener", + libpq_listener_main( + tenant_manager, + pg_auth, + tcp_listener, + conf.pg_auth_type, + libpq_ctx, + cancel.clone(), + ) + .map(anyhow::Ok), + )); + + Listener { cancel, task } +} + +impl Listener { + pub async fn stop_accepting(self) -> Connections { + self.cancel.cancel(); + self.task + .await + .expect("unreachable: we wrap the listener task in task_mgr::exit_on_panic_or_error") + } +} +impl Connections { + pub(crate) async fn shutdown(self) { + let Self { cancel, mut tasks } = self; + cancel.cancel(); + while let Some(res) = tasks.join_next().await { + Self::handle_connection_completion(res); + } + } + + fn handle_connection_completion(res: Result, tokio::task::JoinError>) { + match res { + Ok(Ok(())) => {} + Ok(Err(e)) => error!("error in page_service connection task: {:?}", e), + Err(e) => error!("page_service connection task panicked: {:?}", e), + } + } +} /// /// Main loop of the page service. /// /// Listens for connections, and launches a new handler task for each. /// +/// Returns Ok(()) upon cancellation via `cancel`, returning the set of +/// open connections. 
+/// pub async fn libpq_listener_main( tenant_manager: Arc, - broker_client: storage_broker::BrokerClientChannel, auth: Option>, - listener: TcpListener, + listener: tokio::net::TcpListener, auth_type: AuthType, listener_ctx: RequestContext, - cancel: CancellationToken, -) -> anyhow::Result<()> { - listener.set_nonblocking(true)?; - let tokio_listener = tokio::net::TcpListener::from_std(listener)?; + listener_cancel: CancellationToken, +) -> Connections { + let connections_cancel = CancellationToken::new(); + let mut connection_handler_tasks = tokio::task::JoinSet::default(); - // Wait for a new connection to arrive, or for server shutdown. - while let Some(res) = tokio::select! { - biased; + loop { + let accepted = tokio::select! { + biased; + _ = listener_cancel.cancelled() => break, + next = connection_handler_tasks.join_next(), if !connection_handler_tasks.is_empty() => { + let res = next.expect("we dont poll while empty"); + Connections::handle_connection_completion(res); + continue; + } + accepted = listener.accept() => accepted, + }; - _ = cancel.cancelled() => { - // We were requested to shut down. - None - } - - res = tokio_listener.accept() => { - Some(res) - } - } { - match res { + match accepted { Ok((socket, peer_addr)) => { // Connection established. Spawn a new task to handle it. debug!("accepted connection from {}", peer_addr); let local_auth = auth.clone(); - let connection_ctx = listener_ctx .detached_child(TaskKind::PageRequestHandler, DownloadBehavior::Download); - - // PageRequestHandler tasks are not associated with any particular - // timeline in the task manager. In practice most connections will - // only deal with a particular timeline, but we don't know which one - // yet. 
- task_mgr::spawn( - &tokio::runtime::Handle::current(), - TaskKind::PageRequestHandler, - None, - None, - "serving compute connection task", - false, - page_service_conn_main( - tenant_manager.clone(), - broker_client.clone(), - local_auth, - socket, - auth_type, - connection_ctx, - ), - ); + connection_handler_tasks.spawn(page_service_conn_main( + tenant_manager.clone(), + local_auth, + socket, + auth_type, + connection_ctx, + connections_cancel.child_token(), + )); } Err(err) => { // accept() failed. Log the error, and loop back to retry on next connection. @@ -201,28 +193,28 @@ pub async fn libpq_listener_main( } } - debug!("page_service loop terminated"); + debug!("page_service listener loop terminated"); - Ok(()) + Connections { + cancel: connections_cancel, + tasks: connection_handler_tasks, + } } +type ConnectionHandlerResult = anyhow::Result<()>; + #[instrument(skip_all, fields(peer_addr))] async fn page_service_conn_main( tenant_manager: Arc, - broker_client: storage_broker::BrokerClientChannel, auth: Option>, socket: tokio::net::TcpStream, auth_type: AuthType, connection_ctx: RequestContext, -) -> anyhow::Result<()> { - // Immediately increment the gauge, then create a job to decrement it on task exit. - // One of the pros of `defer!` is that this will *most probably* - // get called, even in presence of panics. - let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["page_service"]); - gauge.inc(); - scopeguard::defer! { - gauge.dec(); - } + cancel: CancellationToken, +) -> ConnectionHandlerResult { + let _guard = LIVE_CONNECTIONS + .with_label_values(&["page_service"]) + .guard(); socket .set_nodelay(true) @@ -268,13 +260,10 @@ async fn page_service_conn_main( // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler // and create the per-query context in process_query ourselves. 
let mut conn_handler = - PageServerHandler::new(tenant_manager, broker_client, auth, connection_ctx); + PageServerHandler::new(tenant_manager, auth, connection_ctx, cancel.clone()); let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; - match pgbackend - .run(&mut conn_handler, task_mgr::shutdown_watcher) - .await - { + match pgbackend.run(&mut conn_handler, &cancel).await { Ok(()) => { // we've been requested to shut down Ok(()) @@ -291,33 +280,154 @@ async fn page_service_conn_main( } } -/// While a handler holds a reference to a Timeline, it also holds a the -/// timeline's Gate open. -struct HandlerTimeline { - timeline: Arc, - _guard: GateGuard, -} - struct PageServerHandler { - broker_client: storage_broker::BrokerClientChannel, auth: Option>, claims: Option, - tenant_manager: Arc, - /// The context created for the lifetime of the connection /// services by this PageServerHandler. /// For each query received over the connection, /// `process_query` creates a child context from this one. connection_ctx: RequestContext, - /// See [`Self::cache_timeline`] for usage. - /// + cancel: CancellationToken, + + timeline_handles: TimelineHandles, +} + +struct TimelineHandles { + wrapper: TenantManagerWrapper, /// Note on size: the typical size of this map is 1. The largest size we expect /// to see is the number of shards divided by the number of pageservers (typically < 2), /// or the ratio used when splitting shards (i.e. how many children created from one) /// parent shard, where a "large" number might be ~8. 
- shard_timelines: HashMap, + handles: timeline::handle::Cache, +} + +impl TimelineHandles { + fn new(tenant_manager: Arc) -> Self { + Self { + wrapper: TenantManagerWrapper { + tenant_manager, + tenant_id: OnceCell::new(), + }, + handles: Default::default(), + } + } + async fn get( + &mut self, + tenant_id: TenantId, + timeline_id: TimelineId, + shard_selector: ShardSelector, + ) -> Result, GetActiveTimelineError> { + if *self.wrapper.tenant_id.get_or_init(|| tenant_id) != tenant_id { + return Err(GetActiveTimelineError::Tenant( + GetActiveTenantError::SwitchedTenant, + )); + } + self.handles + .get(timeline_id, shard_selector, &self.wrapper) + .await + .map_err(|e| match e { + timeline::handle::GetError::TenantManager(e) => e, + timeline::handle::GetError::TimelineGateClosed => { + trace!("timeline gate closed"); + GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown) + } + timeline::handle::GetError::PerTimelineStateShutDown => { + trace!("per-timeline state shut down"); + GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown) + } + }) + } +} + +pub(crate) struct TenantManagerWrapper { + tenant_manager: Arc, + // We do not support switching tenant_id on a connection at this point. + // We can can add support for this later if needed without changing + // the protocol. 
+ tenant_id: once_cell::sync::OnceCell, +} + +#[derive(Debug)] +pub(crate) struct TenantManagerTypes; + +impl timeline::handle::Types for TenantManagerTypes { + type TenantManagerError = GetActiveTimelineError; + type TenantManager = TenantManagerWrapper; + type Timeline = Arc; +} + +impl timeline::handle::ArcTimeline for Arc { + fn gate(&self) -> &utils::sync::gate::Gate { + &self.gate + } + + fn shard_timeline_id(&self) -> timeline::handle::ShardTimelineId { + Timeline::shard_timeline_id(self) + } + + fn per_timeline_state(&self) -> &timeline::handle::PerTimelineState { + &self.handles + } + + fn get_shard_identity(&self) -> &pageserver_api::shard::ShardIdentity { + Timeline::get_shard_identity(self) + } +} + +impl timeline::handle::TenantManager for TenantManagerWrapper { + async fn resolve( + &self, + timeline_id: TimelineId, + shard_selector: ShardSelector, + ) -> Result, GetActiveTimelineError> { + let tenant_id = self.tenant_id.get().expect("we set this in get()"); + let timeout = ACTIVE_TENANT_TIMEOUT; + let wait_start = Instant::now(); + let deadline = wait_start + timeout; + let tenant_shard = loop { + let resolved = self + .tenant_manager + .resolve_attached_shard(tenant_id, shard_selector); + match resolved { + ShardResolveResult::Found(tenant_shard) => break tenant_shard, + ShardResolveResult::NotFound => { + return Err(GetActiveTimelineError::Tenant( + GetActiveTenantError::NotFound(GetTenantError::NotFound(*tenant_id)), + )); + } + ShardResolveResult::InProgress(barrier) => { + // We can't authoritatively answer right now: wait for InProgress state + // to end, then try again + tokio::select! 
{ + _ = barrier.wait() => { + // The barrier completed: proceed around the loop to try looking up again + }, + _ = tokio::time::sleep(deadline.duration_since(Instant::now())) => { + return Err(GetActiveTimelineError::Tenant(GetActiveTenantError::WaitForActiveTimeout { + latest_state: None, + wait_time: timeout, + })); + } + } + } + }; + }; + + tracing::debug!("Waiting for tenant to enter active state..."); + tenant_shard + .wait_to_become_active(deadline.duration_since(Instant::now())) + .await + .map_err(GetActiveTimelineError::Tenant)?; + + let timeline = tenant_shard + .get_timeline(timeline_id, true) + .map_err(GetActiveTimelineError::Timeline)?; + set_tracing_field_shard_id(&timeline); + Ok(timeline) + } } #[derive(thiserror::Error, Debug)] @@ -361,7 +471,11 @@ impl From for PageStreamError { impl From for PageStreamError { fn from(value: GetActiveTimelineError) -> Self { match value { - GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) => Self::Shutdown, + GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) + | GetActiveTimelineError::Tenant(GetActiveTenantError::WillNotBecomeActive( + TenantState::Stopping { .. }, + )) + | GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown) => Self::Shutdown, GetActiveTimelineError::Tenant(e) => Self::NotFound(format!("{e}").into()), GetActiveTimelineError::Timeline(e) => Self::NotFound(format!("{e}").into()), } @@ -391,68 +505,19 @@ impl From for QueryError { impl PageServerHandler { pub fn new( tenant_manager: Arc, - broker_client: storage_broker::BrokerClientChannel, auth: Option>, connection_ctx: RequestContext, + cancel: CancellationToken, ) -> Self { PageServerHandler { - tenant_manager, - broker_client, auth, claims: None, connection_ctx, - shard_timelines: HashMap::new(), + timeline_handles: TimelineHandles::new(tenant_manager), + cancel, } } - /// Future that completes when we need to shut down the connection. 
- /// - /// We currently need to shut down when any of the following happens: - /// 1. any of the timelines we hold GateGuards for in `shard_timelines` is cancelled - /// 2. task_mgr requests shutdown of the connection - /// - /// NB on (1): the connection's lifecycle is not actually tied to any of the - /// `shard_timelines`s' lifecycles. But it's _necessary_ in the current - /// implementation to be responsive to timeline cancellation because - /// the connection holds their `GateGuards` open (sored in `shard_timelines`). - /// We currently do the easy thing and terminate the connection if any of the - /// shard_timelines gets cancelled. But really, we cuold spend more effort - /// and simply remove the cancelled timeline from the `shard_timelines`, thereby - /// dropping the guard. - /// - /// NB: keep in sync with [`Self::is_connection_cancelled`] - async fn await_connection_cancelled(&self) { - // A short wait before we expend the cycles to walk our timeline map. This avoids incurring - // that cost every time we check for cancellation. - tokio::time::sleep(Duration::from_millis(10)).await; - - // This function is never called concurrently with code that adds timelines to shard_timelines, - // which is enforced by the borrow checker (the future returned by this function carries the - // immutable &self). So it's fine to evaluate shard_timelines after the sleep, we don't risk - // missing any inserts to the map. - - let mut cancellation_sources = Vec::with_capacity(1 + self.shard_timelines.len()); - use futures::future::Either; - cancellation_sources.push(Either::Left(task_mgr::shutdown_watcher())); - cancellation_sources.extend( - self.shard_timelines - .values() - .map(|ht| Either::Right(ht.timeline.cancel.cancelled())), - ); - FuturesUnordered::from_iter(cancellation_sources) - .next() - .await; - } - - /// Checking variant of [`Self::await_connection_cancelled`]. 
- fn is_connection_cancelled(&self) -> bool { - task_mgr::is_shutdown_requested() - || self - .shard_timelines - .values() - .any(|ht| ht.timeline.cancel.is_cancelled() || ht.timeline.is_stopping()) - } - /// This function always respects cancellation of any timeline in `[Self::shard_timelines]`. Pass in /// a cancellation token at the next scope up (such as a tenant cancellation token) to ensure we respect /// cancellation if there aren't any timelines in the cache. @@ -471,82 +536,21 @@ impl PageServerHandler { flush_r = pgb.flush() => { Ok(flush_r?) }, - _ = self.await_connection_cancelled() => { - Err(QueryError::Shutdown) - } _ = cancel.cancelled() => { Err(QueryError::Shutdown) } ) } - fn copyin_stream<'a, IO>( - &'a self, - pgb: &'a mut PostgresBackend, - cancel: &'a CancellationToken, - ) -> impl Stream> + 'a - where - IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, - { - async_stream::try_stream! { - loop { - let msg = tokio::select! { - biased; - - _ = cancel.cancelled() => { - // We were requested to shut down. 
- let msg = "pageserver is shutting down"; - let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None)); - Err(QueryError::Shutdown) - } - - msg = pgb.read_message() => { msg.map_err(QueryError::from)} - }; - - match msg { - Ok(Some(message)) => { - let copy_data_bytes = match message { - FeMessage::CopyData(bytes) => bytes, - FeMessage::CopyDone => { break }, - FeMessage::Sync => continue, - FeMessage::Terminate => { - let msg = "client terminated connection with Terminate message during COPY"; - let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg))); - // error can't happen here, ErrorResponse serialization should be always ok - pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?; - Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?; - break; - } - m => { - let msg = format!("unexpected message {m:?}"); - // error can't happen here, ErrorResponse serialization should be always ok - pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)).map_err(|e| e.into_io_error())?; - Err(io::Error::new(io::ErrorKind::Other, msg))?; - break; - } - }; - - yield copy_data_bytes; - } - Ok(None) => { - let msg = "client closed connection during COPY"; - let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg))); - // error can't happen here, ErrorResponse serialization should be always ok - pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?; - self.flush_cancellable(pgb, cancel).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?; - Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?; - } - Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => { - Err(io_error)?; - } - Err(other) => { - Err(io::Error::new(io::ErrorKind::Other, other.to_string()))?; - } - 
}; - } - } - } - + /// Pagestream sub-protocol handler. + /// + /// It is a simple request-response protocol inside a COPYBOTH session. + /// + /// # Coding Discipline + /// + /// Coding discipline within this function: all interaction with the `pgb` connection + /// needs to be sensitive to connection shutdown, currently signalled via [`Self::cancel`]. + /// This is so that we can shutdown page_service quickly. #[instrument(skip_all)] async fn handle_pagerequests( &mut self, @@ -561,39 +565,27 @@ impl PageServerHandler { { debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); - let tenant = self - .get_active_tenant_with_timeout(tenant_id, ShardSelector::First, ACTIVE_TENANT_TIMEOUT) - .await?; - - // Make request tracer if needed - let mut tracer = if tenant.get_trace_read_requests() { - let connection_id = ConnectionId::generate(); - let path = - tenant - .conf - .trace_path(&tenant.tenant_shard_id(), &timeline_id, &connection_id); - Some(Tracer::new(path)) - } else { - None - }; - // switch client to COPYBOTH pgb.write_message_noflush(&BeMessage::CopyBothResponse)?; - self.flush_cancellable(pgb, &tenant.cancel).await?; + tokio::select! { + biased; + _ = self.cancel.cancelled() => { + return Err(QueryError::Shutdown) + } + res = pgb.flush() => { + res?; + } + } loop { + // read request bytes (it's exactly 1 PagestreamFeMessage per CopyData) let msg = tokio::select! { biased; - - _ = self.await_connection_cancelled() => { - // We were requested to shut down. - info!("shutdown request received in page handler"); + _ = self.cancel.cancelled() => { return Err(QueryError::Shutdown) } - msg = pgb.read_message() => { msg } }; - let copy_data_bytes = match msg? 
{ Some(FeMessage::CopyData(bytes)) => bytes, Some(FeMessage::Terminate) => break, @@ -608,18 +600,12 @@ impl PageServerHandler { trace!("query: {copy_data_bytes:?}"); fail::fail_point!("ps::handle-pagerequest-message"); - // Trace request if needed - if let Some(t) = tracer.as_mut() { - t.trace(©_data_bytes) - } - + // parse request let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?; - // TODO: We could create a new per-request context here, with unique ID. - // Currently we use the same per-timeline context for all requests - - let (response, span) = match neon_fe_msg { + // invoke handler function + let (handler_result, span) = match neon_fe_msg { PagestreamFeMessage::Exists(req) => { fail::fail_point!("ps::handle-pagerequest-message::exists"); let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn); @@ -673,31 +659,26 @@ impl PageServerHandler { } }; - match response { - Err(PageStreamError::Shutdown) => { - // If we fail to fulfil a request during shutdown, which may be _because_ of - // shutdown, then do not send the error to the client. Instead just drop the - // connection. - span.in_scope(|| info!("dropping connection due to shutdown")); - return Err(QueryError::Shutdown); - } - Err(PageStreamError::Reconnect(reason)) => { - span.in_scope(|| info!("handler requested reconnect: {reason}")); - return Err(QueryError::Reconnect); - } - Err(e) if self.is_connection_cancelled() => { - // This branch accomodates code within request handlers that returns an anyhow::Error instead of a clean - // shutdown error, this may be buried inside a PageReconstructError::Other for example. 
- // - // Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet, - // because wait_lsn etc will drop out - // is_stopping(): [`Timeline::flush_and_shutdown`] has entered - // is_canceled(): [`Timeline::shutdown`]` has entered - span.in_scope(|| info!("dropped error response during shutdown: {e:#}")); - return Err(QueryError::Shutdown); - } - r => { - let response_msg = r.unwrap_or_else(|e| { + // Map handler result to protocol behavior. + // Some handler errors cause exit from pagestream protocol. + // Other handler errors are sent back as an error message and we stay in pagestream protocol. + let response_msg = match handler_result { + Err(e) => match &e { + PageStreamError::Shutdown => { + // If we fail to fulfil a request during shutdown, which may be _because_ of + // shutdown, then do not send the error to the client. Instead just drop the + // connection. + span.in_scope(|| info!("dropping connection due to shutdown")); + return Err(QueryError::Shutdown); + } + PageStreamError::Reconnect(reason) => { + span.in_scope(|| info!("handler requested reconnect: {reason}")); + return Err(QueryError::Reconnect); + } + PageStreamError::Read(_) + | PageStreamError::LsnTimeout(_) + | PageStreamError::NotFound(_) + | PageStreamError::BadRequest(_) => { // print the all details to the log with {:#}, but for the client the // error message is enough. Do not log if shutting down, as the anyhow::Error // here includes cancellation which is not an error. @@ -708,138 +689,28 @@ impl PageServerHandler { PagestreamBeMessage::Error(PagestreamErrorResponse { message: e.to_string(), }) - }); + } + }, + Ok(response_msg) => response_msg, + }; - pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?; - self.flush_cancellable(pgb, &tenant.cancel).await?; + // marshal & transmit response message + pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?; + tokio::select! 
{ + biased; + _ = self.cancel.cancelled() => { + // We were requested to shut down. + info!("shutdown request received in page handler"); + return Err(QueryError::Shutdown) + } + res = pgb.flush() => { + res?; } } } Ok(()) } - #[allow(clippy::too_many_arguments)] - #[instrument(skip_all, fields(%base_lsn, end_lsn=%_end_lsn, %pg_version))] - async fn handle_import_basebackup( - &self, - pgb: &mut PostgresBackend, - tenant_id: TenantId, - timeline_id: TimelineId, - base_lsn: Lsn, - _end_lsn: Lsn, - pg_version: u32, - ctx: RequestContext, - ) -> Result<(), QueryError> - where - IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, - { - debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); - - // Create empty timeline - info!("creating new timeline"); - let tenant = self - .get_active_tenant_with_timeout(tenant_id, ShardSelector::Zero, ACTIVE_TENANT_TIMEOUT) - .await?; - let timeline = tenant - .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx) - .await?; - - // TODO mark timeline as not ready until it reaches end_lsn. - // We might have some wal to import as well, and we should prevent compute - // from connecting before that and writing conflicting wal. - // - // This is not relevant for pageserver->pageserver migrations, since there's - // no wal to import. But should be fixed if we want to import from postgres. - - // TODO leave clean state on error. For now you can use detach to clean - // up broken state from a failed import. - - // Import basebackup provided via CopyData - info!("importing basebackup"); - pgb.write_message_noflush(&BeMessage::CopyInResponse)?; - self.flush_cancellable(pgb, &tenant.cancel).await?; - - let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel))); - timeline - .import_basebackup_from_tar( - tenant.clone(), - &mut copyin_reader, - base_lsn, - self.broker_client.clone(), - &ctx, - ) - .await?; - - // Read the end of the tar archive. 
- read_tar_eof(copyin_reader).await?; - - // TODO check checksum - // Meanwhile you can verify client-side by taking fullbackup - // and checking that it matches in size with what was imported. - // It wouldn't work if base came from vanilla postgres though, - // since we discard some log files. - - info!("done"); - Ok(()) - } - - #[instrument(skip_all, fields(shard_id, %start_lsn, %end_lsn))] - async fn handle_import_wal( - &self, - pgb: &mut PostgresBackend, - tenant_id: TenantId, - timeline_id: TimelineId, - start_lsn: Lsn, - end_lsn: Lsn, - ctx: RequestContext, - ) -> Result<(), QueryError> - where - IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, - { - let timeline = self - .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) - .await?; - let last_record_lsn = timeline.get_last_record_lsn(); - if last_record_lsn != start_lsn { - return Err(QueryError::Other( - anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}")) - ); - } - - // TODO leave clean state on error. For now you can use detach to clean - // up broken state from a failed import. - - // Import wal provided via CopyData - info!("importing wal"); - pgb.write_message_noflush(&BeMessage::CopyInResponse)?; - self.flush_cancellable(pgb, &timeline.cancel).await?; - let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &timeline.cancel))); - import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?; - info!("wal import complete"); - - // Read the end of the tar archive. - read_tar_eof(copyin_reader).await?; - - // TODO Does it make sense to overshoot? - if timeline.get_last_record_lsn() < end_lsn { - return Err(QueryError::Other( - anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}")) - ); - } - - // Flush data to disk, then upload to s3. No need for a forced checkpoint. 
- // We only want to persist the data, and it doesn't matter if it's in the - // shape of deltas or images. - info!("flushing layers"); - timeline.freeze_and_flush().await.map_err(|e| match e { - FlushLayerError::Cancelled => QueryError::Shutdown, - other => QueryError::Other(other.into()), - })?; - - info!("done"); - Ok(()) - } - /// Helper function to handle the LSN from client request. /// /// Each GetPage (and Exists and Nblocks) request includes information about @@ -921,7 +792,7 @@ impl PageServerHandler { #[instrument(skip_all, fields(shard_id, %lsn))] async fn handle_make_lsn_lease( - &self, + &mut self, pgb: &mut PostgresBackend, tenant_shard_id: TenantShardId, timeline_id: TimelineId, @@ -931,10 +802,16 @@ impl PageServerHandler { where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { - let shard_selector = ShardSelector::Known(tenant_shard_id.to_index()); let timeline = self - .get_active_tenant_timeline(tenant_shard_id.tenant_id, timeline_id, shard_selector) + .timeline_handles + .get( + tenant_shard_id.tenant_id, + timeline_id, + ShardSelector::Known(tenant_shard_id.to_index()), + ) .await?; + set_tracing_field_shard_id(&timeline); + let lease = timeline.make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)?; let valid_until = lease .valid_until @@ -960,14 +837,17 @@ impl PageServerHandler { req: &PagestreamExistsRequest, ctx: &RequestContext, ) -> Result { - let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?; + let timeline = self + .timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .await?; let _timer = timeline .query_metrics .start_timer(metrics::SmgrQueryType::GetRelExists, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( - timeline, + &timeline, req.request_lsn, req.not_modified_since, &latest_gc_cutoff_lsn, @@ -992,7 +872,10 @@ impl PageServerHandler { req: &PagestreamNblocksRequest, ctx: &RequestContext, ) -> Result { - let timeline = 
self.get_timeline_shard_zero(tenant_id, timeline_id).await?; + let timeline = self + .timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .await?; let _timer = timeline .query_metrics @@ -1000,7 +883,7 @@ impl PageServerHandler { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( - timeline, + &timeline, req.request_lsn, req.not_modified_since, &latest_gc_cutoff_lsn, @@ -1025,7 +908,10 @@ impl PageServerHandler { req: &PagestreamDbSizeRequest, ctx: &RequestContext, ) -> Result { - let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?; + let timeline = self + .timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .await?; let _timer = timeline .query_metrics @@ -1033,7 +919,7 @@ impl PageServerHandler { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( - timeline, + &timeline, req.request_lsn, req.not_modified_since, &latest_gc_cutoff_lsn, @@ -1051,122 +937,6 @@ impl PageServerHandler { })) } - /// For most getpage requests, we will already have a Timeline to serve the request: this function - /// looks up such a Timeline synchronously and without touching any global state. - fn get_cached_timeline_for_page( - &mut self, - req: &PagestreamGetPageRequest, - ) -> Result<&Arc, Key> { - let key = if let Some((first_idx, first_timeline)) = self.shard_timelines.iter().next() { - // Fastest path: single sharded case - if first_idx.shard_count.count() == 1 { - return Ok(&first_timeline.timeline); - } - - let key = rel_block_to_key(req.rel, req.blkno); - let shard_num = first_timeline - .timeline - .get_shard_identity() - .get_shard_number(&key); - - // Fast path: matched the first timeline in our local handler map. This case is common if - // only one shard per tenant is attached to this pageserver. 
- if first_timeline.timeline.get_shard_identity().number == shard_num { - return Ok(&first_timeline.timeline); - } - - let shard_index = ShardIndex { - shard_number: shard_num, - shard_count: first_timeline.timeline.get_shard_identity().count, - }; - - // Fast-ish path: timeline is in the connection handler's local cache - if let Some(found) = self.shard_timelines.get(&shard_index) { - return Ok(&found.timeline); - } - - key - } else { - rel_block_to_key(req.rel, req.blkno) - }; - - Err(key) - } - - /// Having looked up the [`Timeline`] instance for a particular shard, cache it to enable - /// use in future requests without having to traverse [`crate::tenant::mgr::TenantManager`] - /// again. - /// - /// Note that all the Timelines in this cache are for the same timeline_id: they're differ - /// in which shard they belong to. When we serve a getpage@lsn request, we choose a shard - /// based on key. - /// - /// The typical size of this cache is 1, as we generally create shards to distribute work - /// across pageservers, so don't tend to have multiple shards for the same tenant on the - /// same pageserver. - fn cache_timeline( - &mut self, - timeline: Arc, - ) -> Result<&Arc, GetActiveTimelineError> { - let gate_guard = timeline - .gate - .enter() - .map_err(|_| GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled))?; - - let shard_index = timeline.tenant_shard_id.to_index(); - let entry = self - .shard_timelines - .entry(shard_index) - .or_insert(HandlerTimeline { - timeline, - _guard: gate_guard, - }); - - Ok(&entry.timeline) - } - - /// If [`Self::get_cached_timeline_for_page`] missed, then this function is used to populate the cache with - /// a Timeline to serve requests for this key, if such a Timeline is present on this pageserver. If no such - /// Timeline is found, then we will return an error (this indicates that the client is talking to the wrong node). 
- async fn load_timeline_for_page( - &mut self, - tenant_id: TenantId, - timeline_id: TimelineId, - key: Key, - ) -> anyhow::Result<&Arc, GetActiveTimelineError> { - // Slow path: we must call out to the TenantManager to find the timeline for this Key - let timeline = self - .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Page(key)) - .await?; - - self.cache_timeline(timeline) - } - - async fn get_timeline_shard_zero( - &mut self, - tenant_id: TenantId, - timeline_id: TimelineId, - ) -> anyhow::Result<&Arc, GetActiveTimelineError> { - // This is a borrow-checker workaround: we can't return from inside of the `if let Some` because - // that would be an immutable-borrow-self return, whereas later in the function we will use a mutable - // ref to salf. So instead, we first build a bool, and then return while not borrowing self. - let have_cached = if let Some((idx, _tl)) = self.shard_timelines.iter().next() { - idx.shard_number == ShardNumber(0) - } else { - false - }; - - if have_cached { - let entry = self.shard_timelines.iter().next().unwrap(); - Ok(&entry.1.timeline) - } else { - let timeline = self - .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) - .await?; - Ok(self.cache_timeline(timeline)?) - } - } - #[instrument(skip_all, fields(shard_id))] async fn handle_get_page_at_lsn_request( &mut self, @@ -1175,33 +945,30 @@ impl PageServerHandler { req: &PagestreamGetPageRequest, ctx: &RequestContext, ) -> Result { - let timeline = match self.get_cached_timeline_for_page(req) { - Ok(tl) => { - set_tracing_field_shard_id(tl); - tl - } - Err(key) => { - match self - .load_timeline_for_page(tenant_id, timeline_id, key) - .await - { - Ok(t) => t, - Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => { - // We already know this tenant exists in general, because we resolved it at - // start of connection. 
Getting a NotFound here indicates that the shard containing - // the requested page is not present on this node: the client's knowledge of shard->pageserver - // mapping is out of date. - // - // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via - // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration - // and talk to a different pageserver. - return Err(PageStreamError::Reconnect( - "getpage@lsn request routed to wrong shard".into(), - )); - } - Err(e) => return Err(e.into()), - } + let timeline = match self + .timeline_handles + .get( + tenant_id, + timeline_id, + ShardSelector::Page(rel_block_to_key(req.rel, req.blkno)), + ) + .await + { + Ok(tl) => tl, + Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => { + // We already know this tenant exists in general, because we resolved it at + // start of connection. Getting a NotFound here indicates that the shard containing + // the requested page is not present on this node: the client's knowledge of shard->pageserver + // mapping is out of date. + // + // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via + // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration + // and talk to a different pageserver. 
+ return Err(PageStreamError::Reconnect( + "getpage@lsn request routed to wrong shard".into(), + )); } + Err(e) => return Err(e.into()), }; let _timer = timeline @@ -1210,7 +977,7 @@ impl PageServerHandler { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( - timeline, + &timeline, req.request_lsn, req.not_modified_since, &latest_gc_cutoff_lsn, @@ -1235,7 +1002,10 @@ impl PageServerHandler { req: &PagestreamGetSlruSegmentRequest, ctx: &RequestContext, ) -> Result { - let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?; + let timeline = self + .timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .await?; let _timer = timeline .query_metrics @@ -1243,7 +1013,7 @@ impl PageServerHandler { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( - timeline, + &timeline, req.request_lsn, req.not_modified_since, &latest_gc_cutoff_lsn, @@ -1264,6 +1034,15 @@ impl PageServerHandler { /// Full basebackups should only be used for debugging purposes. /// Originally, it was introduced to enable breaking storage format changes, /// but that is not applicable anymore. + /// + /// # Coding Discipline + /// + /// Coding discipline within this function: all interaction with the `pgb` connection + /// needs to be sensitive to connection shutdown, currently signalled via [`Self::cancel`]. + /// This is so that we can shutdown page_service quickly. + /// + /// TODO: wrap the pgb that we pass to the basebackup handler so that it's sensitive + /// to connection cancellation. 
#[allow(clippy::too_many_arguments)] #[instrument(skip_all, fields(shard_id, ?lsn, ?prev_lsn, %full_backup))] async fn handle_basebackup_request( @@ -1289,10 +1068,11 @@ impl PageServerHandler { let started = std::time::Instant::now(); - // check that the timeline exists let timeline = self - .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) + .timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) .await?; + let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); if let Some(lsn) = lsn { // Backup was requested at a particular LSN. Wait for it to arrive. @@ -1314,7 +1094,7 @@ impl PageServerHandler { // switch client to COPYOUT pgb.write_message_noflush(&BeMessage::CopyOutResponse) .map_err(QueryError::Disconnected)?; - self.flush_cancellable(pgb, &timeline.cancel).await?; + self.flush_cancellable(pgb, &self.cancel).await?; // Send a tarball of the latest layer on the timeline. Compress if not // fullbackup. TODO Compress in that case too (tests need to be updated) @@ -1405,77 +1185,6 @@ impl PageServerHandler { .expect("claims presence already checked"); check_permission(claims, tenant_id).map_err(|e| QueryError::Unauthorized(e.0)) } - - /// Shorthand for getting a reference to a Timeline of an Active tenant. - async fn get_active_tenant_timeline( - &self, - tenant_id: TenantId, - timeline_id: TimelineId, - selector: ShardSelector, - ) -> Result, GetActiveTimelineError> { - let tenant = self - .get_active_tenant_with_timeout(tenant_id, selector, ACTIVE_TENANT_TIMEOUT) - .await - .map_err(GetActiveTimelineError::Tenant)?; - let timeline = tenant.get_timeline(timeline_id, true)?; - set_tracing_field_shard_id(&timeline); - Ok(timeline) - } - - /// Get a shard's [`Tenant`] in its active state, if present. If we don't find the shard and some - /// slots for this tenant are `InProgress` then we will wait. - /// If we find the [`Tenant`] and it's not yet in state [`TenantState::Active`], we will wait. 
- /// - /// `timeout` is used as a total timeout for the whole wait operation. - async fn get_active_tenant_with_timeout( - &self, - tenant_id: TenantId, - shard_selector: ShardSelector, - timeout: Duration, - ) -> Result, GetActiveTenantError> { - let wait_start = Instant::now(); - let deadline = wait_start + timeout; - - // Resolve TenantId to TenantShardId. This is usually a quick one-shot thing, the loop is - // for handling the rare case that the slot we're accessing is InProgress. - let tenant_shard = loop { - let resolved = self - .tenant_manager - .resolve_attached_shard(&tenant_id, shard_selector); - match resolved { - ShardResolveResult::Found(tenant_shard) => break tenant_shard, - ShardResolveResult::NotFound => { - return Err(GetActiveTenantError::NotFound(GetTenantError::NotFound( - tenant_id, - ))); - } - ShardResolveResult::InProgress(barrier) => { - // We can't authoritatively answer right now: wait for InProgress state - // to end, then try again - tokio::select! { - _ = self.await_connection_cancelled() => { - return Err(GetActiveTenantError::Cancelled) - }, - _ = barrier.wait() => { - // The barrier completed: proceed around the loop to try looking up again - }, - _ = tokio::time::sleep(deadline.duration_since(Instant::now())) => { - return Err(GetActiveTenantError::WaitForActiveTimeout { - latest_state: None, - wait_time: timeout, - }); - } - } - } - }; - }; - - tracing::debug!("Waiting for tenant to enter active state..."); - tenant_shard - .wait_to_become_active(deadline.duration_since(Instant::now())) - .await?; - Ok(tenant_shard) - } } #[async_trait::async_trait] @@ -1554,6 +1263,10 @@ where self.check_permission(Some(tenant_id))?; + COMPUTE_COMMANDS_COUNTERS + .for_command(ComputeCommandKind::PageStreamV2) + .inc(); + self.handle_pagerequests( pgb, tenant_id, @@ -1579,6 +1292,10 @@ where self.check_permission(Some(tenant_id))?; + COMPUTE_COMMANDS_COUNTERS + .for_command(ComputeCommandKind::PageStream) + .inc(); + self.handle_pagerequests( 
pgb, tenant_id, @@ -1605,6 +1322,10 @@ where self.check_permission(Some(tenant_id))?; + COMPUTE_COMMANDS_COUNTERS + .for_command(ComputeCommandKind::Basebackup) + .inc(); + let lsn = if let Some(lsn_str) = params.get(2) { Some( Lsn::from_str(lsn_str) @@ -1644,48 +1365,6 @@ where metric_recording.observe(&res); res?; } - // return pair of prev_lsn and last_lsn - else if let Some(params) = parts.strip_prefix(&["get_last_record_rlsn"]) { - if params.len() != 2 { - return Err(QueryError::Other(anyhow::anyhow!( - "invalid param number for get_last_record_rlsn command" - ))); - } - - let tenant_id = TenantId::from_str(params[0]) - .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; - let timeline_id = TimelineId::from_str(params[1]) - .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; - - tracing::Span::current() - .record("tenant_id", field::display(tenant_id)) - .record("timeline_id", field::display(timeline_id)); - - self.check_permission(Some(tenant_id))?; - async { - let timeline = self - .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) - .await?; - - let end_of_timeline = timeline.get_last_record_rlsn(); - - pgb.write_message_noflush(&BeMessage::RowDescription(&[ - RowDescriptor::text_col(b"prev_lsn"), - RowDescriptor::text_col(b"last_lsn"), - ]))? - .write_message_noflush(&BeMessage::DataRow(&[ - Some(end_of_timeline.prev.to_string().as_bytes()), - Some(end_of_timeline.last.to_string().as_bytes()), - ]))? 
- .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; - anyhow::Ok(()) - } - .instrument(info_span!( - "handle_get_last_record_lsn", - shard_id = tracing::field::Empty - )) - .await?; - } // same as basebackup, but result includes relational data as well else if let Some(params) = parts.strip_prefix(&["fullbackup"]) { if params.len() < 2 { @@ -1723,6 +1402,10 @@ where self.check_permission(Some(tenant_id))?; + COMPUTE_COMMANDS_COUNTERS + .for_command(ComputeCommandKind::Fullbackup) + .inc(); + // Check that the timeline exists self.handle_basebackup_request( pgb, @@ -1736,101 +1419,6 @@ where ) .await?; pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; - } else if query_string.starts_with("import basebackup ") { - // Import the `base` section (everything but the wal) of a basebackup. - // Assumes the tenant already exists on this pageserver. - // - // Files are scheduled to be persisted to remote storage, and the - // caller should poll the http api to check when that is done. - // - // Example import command: - // 1. Get start/end LSN from backup_manifest file - // 2. 
Run: - // cat my_backup/base.tar | psql -h $PAGESERVER \ - // -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION" - let params = &parts[2..]; - if params.len() != 5 { - return Err(QueryError::Other(anyhow::anyhow!( - "invalid param number for import basebackup command" - ))); - } - let tenant_id = TenantId::from_str(params[0]) - .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; - let timeline_id = TimelineId::from_str(params[1]) - .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; - let base_lsn = Lsn::from_str(params[2]) - .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?; - let end_lsn = Lsn::from_str(params[3]) - .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?; - let pg_version = u32::from_str(params[4]) - .with_context(|| format!("Failed to parse pg_version from {}", params[4]))?; - - tracing::Span::current() - .record("tenant_id", field::display(tenant_id)) - .record("timeline_id", field::display(timeline_id)); - - self.check_permission(Some(tenant_id))?; - - match self - .handle_import_basebackup( - pgb, - tenant_id, - timeline_id, - base_lsn, - end_lsn, - pg_version, - ctx, - ) - .await - { - Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?, - Err(e) => { - error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}"); - pgb.write_message_noflush(&BeMessage::ErrorResponse( - &e.to_string(), - Some(e.pg_error_code()), - ))? - } - }; - } else if query_string.starts_with("import wal ") { - // Import the `pg_wal` section of a basebackup. - // - // Files are scheduled to be persisted to remote storage, and the - // caller should poll the http api to check when that is done. 
- let params = &parts[2..]; - if params.len() != 4 { - return Err(QueryError::Other(anyhow::anyhow!( - "invalid param number for import wal command" - ))); - } - let tenant_id = TenantId::from_str(params[0]) - .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; - let timeline_id = TimelineId::from_str(params[1]) - .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; - let start_lsn = Lsn::from_str(params[2]) - .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?; - let end_lsn = Lsn::from_str(params[3]) - .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?; - - tracing::Span::current() - .record("tenant_id", field::display(tenant_id)) - .record("timeline_id", field::display(timeline_id)); - - self.check_permission(Some(tenant_id))?; - - match self - .handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn, ctx) - .await - { - Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?, - Err(e) => { - error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}"); - pgb.write_message_noflush(&BeMessage::ErrorResponse( - &e.to_string(), - Some(e.pg_error_code()), - ))? - } - }; } else if query_string.to_ascii_lowercase().starts_with("set ") { // important because psycopg2 executes "SET datestyle TO 'ISO'" // on connect @@ -1855,6 +1443,10 @@ where self.check_permission(Some(tenant_shard_id.tenant_id))?; + COMPUTE_COMMANDS_COUNTERS + .for_command(ComputeCommandKind::LeaseLsn) + .inc(); + // The caller is responsible for providing correct lsn. let lsn = Lsn::from_str(params[2]) .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?; @@ -1872,62 +1464,6 @@ where ))? 
} }; - } else if let Some(params) = parts.strip_prefix(&["show"]) { - // show - if params.len() != 1 { - return Err(QueryError::Other(anyhow::anyhow!( - "invalid param number for config command" - ))); - } - let tenant_id = TenantId::from_str(params[0]) - .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; - - tracing::Span::current().record("tenant_id", field::display(tenant_id)); - - self.check_permission(Some(tenant_id))?; - - let tenant = self - .get_active_tenant_with_timeout( - tenant_id, - ShardSelector::Zero, - ACTIVE_TENANT_TIMEOUT, - ) - .await?; - pgb.write_message_noflush(&BeMessage::RowDescription(&[ - RowDescriptor::int8_col(b"checkpoint_distance"), - RowDescriptor::int8_col(b"checkpoint_timeout"), - RowDescriptor::int8_col(b"compaction_target_size"), - RowDescriptor::int8_col(b"compaction_period"), - RowDescriptor::int8_col(b"compaction_threshold"), - RowDescriptor::int8_col(b"gc_horizon"), - RowDescriptor::int8_col(b"gc_period"), - RowDescriptor::int8_col(b"image_creation_threshold"), - RowDescriptor::int8_col(b"pitr_interval"), - ]))? - .write_message_noflush(&BeMessage::DataRow(&[ - Some(tenant.get_checkpoint_distance().to_string().as_bytes()), - Some( - tenant - .get_checkpoint_timeout() - .as_secs() - .to_string() - .as_bytes(), - ), - Some(tenant.get_compaction_target_size().to_string().as_bytes()), - Some( - tenant - .get_compaction_period() - .as_secs() - .to_string() - .as_bytes(), - ), - Some(tenant.get_compaction_threshold().to_string().as_bytes()), - Some(tenant.get_gc_horizon().to_string().as_bytes()), - Some(tenant.get_gc_period().as_secs().to_string().as_bytes()), - Some(tenant.get_image_creation_threshold().to_string().as_bytes()), - Some(tenant.get_pitr_interval().as_secs().to_string().as_bytes()), - ]))? 
- .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; } else { return Err(QueryError::Other(anyhow::anyhow!( "unknown command {query_string}" @@ -1955,7 +1491,7 @@ impl From for QueryError { } #[derive(Debug, thiserror::Error)] -enum GetActiveTimelineError { +pub(crate) enum GetActiveTimelineError { #[error(transparent)] Tenant(GetActiveTenantError), #[error(transparent)] diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 25d00d6dfd..d6e0b82e1d 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -15,12 +15,11 @@ use crate::{aux_file, repository::*}; use anyhow::{ensure, Context}; use bytes::{Buf, Bytes, BytesMut}; use enum_map::Enum; -use itertools::Itertools; use pageserver_api::key::{ dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key, relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key, slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range, - AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, + CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, }; use pageserver_api::keyspace::SparseKeySpace; use pageserver_api::models::AuxFilePolicy; @@ -37,7 +36,6 @@ use tokio_util::sync::CancellationToken; use tracing::{debug, info, trace, warn}; use utils::bin_ser::DeserializeError; use utils::pausable_failpoint; -use utils::vec_map::{VecMap, VecMapOrdering}; use utils::{bin_ser::BeSer, lsn::Lsn}; /// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached. 
@@ -174,6 +172,7 @@ impl Timeline { pending_deletions: Vec::new(), pending_nblocks: 0, pending_directory_entries: Vec::new(), + pending_bytes: 0, lsn, } } @@ -284,17 +283,19 @@ impl Timeline { if let Some(_nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) { return Ok(true); } + // then check if the database was already initialized. + // get_rel_exists can be called before dbdir is created. + let buf = version.get(self, DBDIR_KEY, ctx).await?; + let dbdirs = DbDirectory::des(&buf)?.dbdirs; + if !dbdirs.contains_key(&(tag.spcnode, tag.dbnode)) { + return Ok(false); + } // fetch directory listing let key = rel_dir_to_key(tag.spcnode, tag.dbnode); let buf = version.get(self, key, ctx).await?; - match RelDirectory::des(&buf).context("deserialization failure") { - Ok(dir) => { - let exists = dir.rels.contains(&(tag.relnode, tag.forknum)); - Ok(exists) - } - Err(e) => Err(PageReconstructError::from(e)), - } + let dir = RelDirectory::des(&buf)?; + Ok(dir.rels.contains(&(tag.relnode, tag.forknum))) } /// Get a list of all existing relations in given tablespace and database. 
@@ -313,20 +314,16 @@ impl Timeline { let key = rel_dir_to_key(spcnode, dbnode); let buf = version.get(self, key, ctx).await?; - match RelDirectory::des(&buf).context("deserialization failure") { - Ok(dir) => { - let rels: HashSet = - HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag { - spcnode, - dbnode, - relnode: *relnode, - forknum: *forknum, - })); + let dir = RelDirectory::des(&buf)?; + let rels: HashSet = + HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag { + spcnode, + dbnode, + relnode: *relnode, + forknum: *forknum, + })); - Ok(rels) - } - Err(e) => Err(PageReconstructError::from(e)), - } + Ok(rels) } /// Get the whole SLRU segment @@ -388,13 +385,8 @@ impl Timeline { let key = slru_dir_to_key(kind); let buf = version.get(self, key, ctx).await?; - match SlruSegmentDirectory::des(&buf).context("deserialization failure") { - Ok(dir) => { - let exists = dir.segments.contains(&segno); - Ok(exists) - } - Err(e) => Err(PageReconstructError::from(e)), - } + let dir = SlruSegmentDirectory::des(&buf)?; + Ok(dir.segments.contains(&segno)) } /// Locate LSN, such that all transactions that committed before @@ -522,7 +514,7 @@ impl Timeline { ctx: &RequestContext, ) -> Result, PageReconstructError> { let mut max: Option = None; - self.map_all_timestamps(probe_lsn, ctx, |timestamp| { + self.map_all_timestamps::<()>(probe_lsn, ctx, |timestamp| { if let Some(max_prev) = max { max = Some(max_prev.max(timestamp)); } else { @@ -610,10 +602,7 @@ impl Timeline { let key = slru_dir_to_key(kind); let buf = version.get(self, key, ctx).await?; - match SlruSegmentDirectory::des(&buf).context("deserialization failure") { - Ok(dir) => Ok(dir.segments), - Err(e) => Err(PageReconstructError::from(e)), - } + Ok(SlruSegmentDirectory::des(&buf)?.segments) } pub(crate) async fn get_relmap_file( @@ -637,10 +626,7 @@ impl Timeline { // fetch directory entry let buf = self.get(DBDIR_KEY, lsn, ctx).await?; - match 
DbDirectory::des(&buf).context("deserialization failure") { - Ok(dir) => Ok(dir.dbdirs), - Err(e) => Err(PageReconstructError::from(e)), - } + Ok(DbDirectory::des(&buf)?.dbdirs) } pub(crate) async fn get_twophase_file( @@ -662,10 +648,7 @@ impl Timeline { // fetch directory entry let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?; - match TwoPhaseDirectory::des(&buf).context("deserialization failure") { - Ok(dir) => Ok(dir.xids), - Err(e) => Err(PageReconstructError::from(e)), - } + Ok(TwoPhaseDirectory::des(&buf)?.xids) } pub(crate) async fn get_control_file( @@ -690,10 +673,7 @@ impl Timeline { ctx: &RequestContext, ) -> Result, PageReconstructError> { match self.get(AUX_FILES_KEY, lsn, ctx).await { - Ok(buf) => match AuxFilesDirectory::des(&buf).context("deserialization failure") { - Ok(dir) => Ok(dir.files), - Err(e) => Err(PageReconstructError::from(e)), - }, + Ok(buf) => Ok(AuxFilesDirectory::des(&buf)?.files), Err(e) => { // This is expected: historical databases do not have the key. 
debug!("Failed to get info about AUX files: {}", e); @@ -709,13 +689,14 @@ impl Timeline { ) -> Result, PageReconstructError> { let kv = self .scan(KeySpace::single(Key::metadata_aux_key_range()), lsn, ctx) - .await - .context("scan")?; + .await?; let mut result = HashMap::new(); let mut sz = 0; for (_, v) in kv { - let v = v.context("get value")?; - let v = aux_file::decode_file_value_bytes(&v).context("value decode")?; + let v = v?; + let v = aux_file::decode_file_value_bytes(&v) + .context("value decode") + .map_err(PageReconstructError::Other)?; for (fname, content) in v { sz += fname.len(); sz += content.len(); @@ -783,11 +764,10 @@ impl Timeline { ) -> Result, PageReconstructError> { let kv = self .scan(KeySpace::single(repl_origin_key_range()), lsn, ctx) - .await - .context("scan")?; + .await?; let mut result = HashMap::new(); for (k, v) in kv { - let v = v.context("get value")?; + let v = v?; let origin_id = k.field6 as RepOriginId; let origin_lsn = Lsn::des(&v).unwrap(); if origin_lsn != Lsn::INVALID { @@ -854,13 +834,14 @@ impl Timeline { result.add_key(DBDIR_KEY); // Fetch list of database dirs and iterate them - let buf = self.get(DBDIR_KEY, lsn, ctx).await?; - let dbdir = DbDirectory::des(&buf)?; + let dbdir = self.list_dbdirs(lsn, ctx).await?; + let mut dbs: Vec<((Oid, Oid), bool)> = dbdir.into_iter().collect(); - let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect(); - dbs.sort_unstable(); - for (spcnode, dbnode) in dbs { - result.add_key(relmap_file_key(spcnode, dbnode)); + dbs.sort_unstable_by(|(k_a, _), (k_b, _)| k_a.cmp(k_b)); + for ((spcnode, dbnode), has_relmap_file) in dbs { + if has_relmap_file { + result.add_key(relmap_file_key(spcnode, dbnode)); + } result.add_key(rel_dir_to_key(spcnode, dbnode)); let mut rels: Vec = self @@ -919,6 +900,9 @@ impl Timeline { result.add_key(AUX_FILES_KEY); } + // Add extra keyspaces in the test cases. Some test cases write keys into the storage without + // creating directory keys. 
These test cases will add such keyspaces into `extra_test_dense_keyspace` + // and the keys will not be garbage-collected. #[cfg(test)] { let guard = self.extra_test_dense_keyspace.load(); @@ -927,13 +911,48 @@ impl Timeline { } } - Ok(( - result.to_keyspace(), - /* AUX sparse key space */ - SparseKeySpace(KeySpace { - ranges: vec![repl_origin_key_range(), Key::metadata_aux_key_range()], - }), - )) + let dense_keyspace = result.to_keyspace(); + let sparse_keyspace = SparseKeySpace(KeySpace { + ranges: vec![Key::metadata_aux_key_range(), repl_origin_key_range()], + }); + + if cfg!(debug_assertions) { + // Verify if the sparse keyspaces are ordered and non-overlapping. + + // We do not use KeySpaceAccum for sparse_keyspace because we want to ensure each + // category of sparse keys are split into their own image/delta files. If there + // are overlapping keyspaces, they will be automatically merged by keyspace accum, + // and we want the developer to keep the keyspaces separated. + + let ranges = &sparse_keyspace.0.ranges; + + // TODO: use a single overlaps_with across the codebase + fn overlaps_with(a: &Range, b: &Range) -> bool { + !(a.end <= b.start || b.end <= a.start) + } + for i in 0..ranges.len() { + for j in 0..i { + if overlaps_with(&ranges[i], &ranges[j]) { + panic!( + "overlapping sparse keyspace: {}..{} and {}..{}", + ranges[i].start, ranges[i].end, ranges[j].start, ranges[j].end + ); + } + } + } + for i in 1..ranges.len() { + assert!( + ranges[i - 1].end <= ranges[i].start, + "unordered sparse keyspace: {}..{} and {}..{}", + ranges[i - 1].start, + ranges[i - 1].end, + ranges[i].start, + ranges[i].end + ); + } + } + + Ok((dense_keyspace, sparse_keyspace)) } /// Get cached size of relation if it not updated after specified LSN @@ -1002,21 +1021,33 @@ pub struct DatadirModification<'a> { // The put-functions add the modifications here, and they are flushed to the // underlying key-value store by the 'finish' function.
pending_lsns: Vec, - pending_updates: HashMap>, + pending_updates: HashMap>, pending_deletions: Vec<(Range, Lsn)>, pending_nblocks: i64, /// For special "directory" keys that store key-value maps, track the size of the map /// if it was updated in this modification. pending_directory_entries: Vec<(DirectoryKind, usize)>, + + /// An **approximation** of how large our EphemeralFile write will be when committed. + pending_bytes: usize, } impl<'a> DatadirModification<'a> { + // When a DatadirModification is committed, we do a monolithic serialization of all its contents. WAL records can + // contain multiple pages, so the pageserver's record-based batch size isn't sufficient to bound this allocation: we + // additionally specify a limit on how much payload a DatadirModification may contain before it should be committed. + pub(crate) const MAX_PENDING_BYTES: usize = 8 * 1024 * 1024; + /// Get the current lsn pub(crate) fn get_lsn(&self) -> Lsn { self.lsn } + pub(crate) fn approx_pending_bytes(&self) -> usize { + self.pending_bytes + } + /// Set the current lsn pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> { ensure!( @@ -1684,12 +1715,17 @@ impl<'a> DatadirModification<'a> { // the original code assumes all other errors are missing keys. Therefore, we keep the code path // the same for now, though in theory, we should only match the `MissingKey` variant. Err( - PageReconstructError::Other(_) + e @ (PageReconstructError::Other(_) | PageReconstructError::WalRedo(_) - | PageReconstructError::MissingKey { .. }, + | PageReconstructError::MissingKey(_)), ) => { // Key is missing, we must insert an image as the basis for subsequent deltas. 
+ if !matches!(e, PageReconstructError::MissingKey(_)) { + let e = utils::error::report_compact_sources(&e); + tracing::warn!("treating error as if it was a missing key: {}", e); + } + let mut dir = AuxFilesDirectory { files: HashMap::new(), }; @@ -1744,21 +1780,25 @@ impl<'a> DatadirModification<'a> { // Flush relation and SLRU data blocks, keep metadata. let mut retained_pending_updates = HashMap::<_, Vec<_>>::new(); for (key, values) in self.pending_updates.drain() { - for (lsn, value) in values { + let mut write_batch = Vec::new(); + for (lsn, value_ser_size, value) in values { if key.is_rel_block_key() || key.is_slru_block_key() { // This bails out on first error without modifying pending_updates. // That's Ok, cf this function's doc comment. - writer.put(key, lsn, &value, ctx).await?; + write_batch.push((key.to_compact(), lsn, value_ser_size, value)); } else { - retained_pending_updates - .entry(key) - .or_default() - .push((lsn, value)); + retained_pending_updates.entry(key).or_default().push(( + lsn, + value_ser_size, + value, + )); } } + writer.put_batch(write_batch, ctx).await?; } self.pending_updates = retained_pending_updates; + self.pending_bytes = 0; if pending_nblocks != 0 { writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ)); @@ -1784,17 +1824,20 @@ impl<'a> DatadirModification<'a> { self.pending_nblocks = 0; if !self.pending_updates.is_empty() { - // The put_batch call below expects expects the inputs to be sorted by Lsn, - // so we do that first. - let lsn_ordered_batch: VecMap = VecMap::from_iter( - self.pending_updates - .drain() - .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (lsn, (key, val)))) - .kmerge_by(|lhs, rhs| lhs.0 < rhs.0), - VecMapOrdering::GreaterOrEqual, - ); + // Ordering: the items in this batch do not need to be in any global order, but values for + // a particular Key must be in Lsn order relative to one another. InMemoryLayer relies on + // this to do efficient updates to its index. 
+ let batch: Vec<(CompactKey, Lsn, usize, Value)> = self + .pending_updates + .drain() + .flat_map(|(key, values)| { + values.into_iter().map(move |(lsn, val_ser_size, value)| { + (key.to_compact(), lsn, val_ser_size, value) + }) + }) + .collect::>(); - writer.put_batch(lsn_ordered_batch, ctx).await?; + writer.put_batch(batch, ctx).await?; } if !self.pending_deletions.is_empty() { @@ -1819,6 +1862,8 @@ impl<'a> DatadirModification<'a> { writer.update_directory_entries_count(kind, count as u64); } + self.pending_bytes = 0; + Ok(()) } @@ -1835,7 +1880,7 @@ impl<'a> DatadirModification<'a> { // Note: we don't check pending_deletions. It is an error to request a // value that has been removed, deletion only avoids leaking storage. if let Some(values) = self.pending_updates.get(&key) { - if let Some((_, value)) = values.last() { + if let Some((_, _, value)) = values.last() { return if let Value::Image(img) = value { Ok(img.clone()) } else { @@ -1844,7 +1889,7 @@ impl<'a> DatadirModification<'a> { // work directly with Images, and we never need to read actual // data pages. We could handle this if we had to, by calling // the walredo manager, but let's keep it simple for now. 
- Err(PageReconstructError::from(anyhow::anyhow!( + Err(PageReconstructError::Other(anyhow::anyhow!( "unexpected pending WAL record" ))) }; @@ -1863,13 +1908,17 @@ impl<'a> DatadirModification<'a> { fn put(&mut self, key: Key, val: Value) { let values = self.pending_updates.entry(key).or_default(); // Replace the previous value if it exists at the same lsn - if let Some((last_lsn, last_value)) = values.last_mut() { + if let Some((last_lsn, last_value_ser_size, last_value)) = values.last_mut() { if *last_lsn == self.lsn { + *last_value_ser_size = val.serialized_size().unwrap() as usize; *last_value = val; return; } } - values.push((self.lsn, val)); + + let val_serialized_size = val.serialized_size().unwrap() as usize; + self.pending_bytes += val_serialized_size; + values.push((self.lsn, val_serialized_size, val)); } fn delete(&mut self, key_range: Range) { @@ -1992,7 +2041,7 @@ mod tests { #[tokio::test] async fn aux_files_round_trip() -> anyhow::Result<()> { let name = "aux_files_round_trip"; - let harness = TenantHarness::create(name)?; + let harness = TenantHarness::create(name).await?; pub const TIMELINE_ID: TimelineId = TimelineId::from_array(hex!("11223344556677881122334455667788")); diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 5a334d0290..e4ebafd927 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -8,8 +8,7 @@ use std::time::Duration; pub use pageserver_api::key::{Key, KEY_SIZE}; /// A 'value' stored for a one Key. 
-#[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(test, derive(PartialEq))] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub enum Value { /// An Image value contains a full copy of the value Image(Bytes), diff --git a/pageserver/src/statvfs.rs b/pageserver/src/statvfs.rs index 45a516566f..ede1791afa 100644 --- a/pageserver/src/statvfs.rs +++ b/pageserver/src/statvfs.rs @@ -56,7 +56,6 @@ impl Statvfs { } pub mod mock { - use anyhow::Context; use camino::Utf8Path; use regex::Regex; use tracing::log::info; @@ -135,14 +134,30 @@ pub mod mock { { continue; } - total += entry - .metadata() - .with_context(|| format!("get metadata of {:?}", entry.path()))? - .len(); + let m = match entry.metadata() { + Ok(m) => m, + Err(e) if is_not_found(&e) => { + // some temp file which got removed right as we are walking + continue; + } + Err(e) => { + return Err(anyhow::Error::new(e) + .context(format!("get metadata of {:?}", entry.path()))) + } + }; + total += m.len(); } Ok(total) } + fn is_not_found(e: &walkdir::Error) -> bool { + let Some(io_error) = e.io_error() else { + return false; + }; + let kind = io_error.kind(); + matches!(kind, std::io::ErrorKind::NotFound) + } + pub struct Statvfs { pub blocks: u64, pub blocks_available: u64, diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 5f46ce3d69..ed9e001fd2 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -393,7 +393,7 @@ struct PageServerTask { /// Tasks may optionally be launched for a particular tenant/timeline, enabling /// later cancelling tasks for that tenant/timeline in [`shutdown_tasks`] - tenant_shard_id: Option, + tenant_shard_id: TenantShardId, timeline_id: Option, mutable: Mutex, @@ -405,10 +405,9 @@ struct PageServerTask { pub fn spawn( runtime: &tokio::runtime::Handle, kind: TaskKind, - tenant_shard_id: Option, + tenant_shard_id: TenantShardId, timeline_id: Option, name: &str, - shutdown_process_on_error: bool, future: F, ) -> 
PageserverTaskId where @@ -437,7 +436,6 @@ where task_id, task_cloned, cancel, - shutdown_process_on_error, future, )); task_mut.join_handle = Some(join_handle); @@ -454,82 +452,78 @@ async fn task_wrapper( task_id: u64, task: Arc, shutdown_token: CancellationToken, - shutdown_process_on_error: bool, future: F, ) where F: Future> + Send + 'static, { debug!("Starting task '{}'", task_name); - let result = SHUTDOWN_TOKEN - .scope( - shutdown_token, - CURRENT_TASK.scope(task, { - // We use AssertUnwindSafe here so that the payload function - // doesn't need to be UnwindSafe. We don't do anything after the - // unwinding that would expose us to unwind-unsafe behavior. - AssertUnwindSafe(future).catch_unwind() - }), - ) - .await; - task_finish(result, task_name, task_id, shutdown_process_on_error).await; -} - -async fn task_finish( - result: std::result::Result< - anyhow::Result<()>, - std::boxed::Box, - >, - task_name: String, - task_id: u64, - shutdown_process_on_error: bool, -) { - // Remove our entry from the global hashmap. - let task = TASKS - .lock() - .unwrap() - .remove(&task_id) - .expect("no task in registry"); - - let mut shutdown_process = false; - { + // wrap the future so we log panics and errors + let tenant_shard_id = task.tenant_shard_id; + let timeline_id = task.timeline_id; + let fut = async move { + // We use AssertUnwindSafe here so that the payload function + // doesn't need to be UnwindSafe. We don't do anything after the + // unwinding that would expose us to unwind-unsafe behavior. 
+ let result = AssertUnwindSafe(future).catch_unwind().await; match result { Ok(Ok(())) => { debug!("Task '{}' exited normally", task_name); } Ok(Err(err)) => { - if shutdown_process_on_error { - error!( - "Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}", - task_name, task.tenant_shard_id, task.timeline_id, err - ); - shutdown_process = true; - } else { - error!( - "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}", - task_name, task.tenant_shard_id, task.timeline_id, err - ); - } + error!( + "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}", + task_name, tenant_shard_id, timeline_id, err + ); } Err(err) => { - if shutdown_process_on_error { - error!( - "Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}", - task_name, task.tenant_shard_id, task.timeline_id, err - ); - shutdown_process = true; - } else { - error!( - "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}", - task_name, task.tenant_shard_id, task.timeline_id, err - ); - } + error!( + "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}", + task_name, tenant_shard_id, timeline_id, err + ); } } - } + }; - if shutdown_process { - std::process::exit(1); + // add the task-locals + let fut = CURRENT_TASK.scope(task, fut); + let fut = SHUTDOWN_TOKEN.scope(shutdown_token, fut); + + // poll future to completion + fut.await; + + // Remove our entry from the global hashmap. + TASKS + .lock() + .unwrap() + .remove(&task_id) + .expect("no task in registry"); +} + +pub async fn exit_on_panic_or_error( + task_name: &'static str, + future: impl Future>, +) -> T +where + E: std::fmt::Debug, +{ + // We use AssertUnwindSafe here so that the payload function + // doesn't need to be UnwindSafe. We don't do anything after the + // unwinding that would expose us to unwind-unsafe behavior. 
+ let result = AssertUnwindSafe(future).catch_unwind().await; + match result { + Ok(Ok(val)) => val, + Ok(Err(err)) => { + error!( + task_name, + "Task exited with error, exiting process: {err:?}" + ); + std::process::exit(1); + } + Err(panic_obj) => { + error!(task_name, "Task panicked, exiting process: {panic_obj:?}"); + std::process::exit(1); + } } } @@ -556,7 +550,7 @@ pub async fn shutdown_tasks( let tasks = TASKS.lock().unwrap(); for task in tasks.values() { if (kind.is_none() || Some(task.kind) == kind) - && (tenant_shard_id.is_none() || task.tenant_shard_id == tenant_shard_id) + && (tenant_shard_id.is_none() || Some(task.tenant_shard_id) == tenant_shard_id) && (timeline_id.is_none() || task.timeline_id == timeline_id) { task.cancel.cancel(); @@ -579,13 +573,8 @@ pub async fn shutdown_tasks( }; if let Some(mut join_handle) = join_handle { if log_all { - if tenant_shard_id.is_none() { - // there are quite few of these - info!(name = task.name, kind = ?task_kind, "stopping global task"); - } else { - // warn to catch these in tests; there shouldn't be any - warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over"); - } + // warn to catch these in tests; there shouldn't be any + warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over"); } if tokio::time::timeout(std::time::Duration::from_secs(1), &mut join_handle) .await diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index ca5765c99b..65a7504b74 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -21,6 +21,7 @@ use futures::FutureExt; use futures::StreamExt; use pageserver_api::models; use pageserver_api::models::AuxFilePolicy; +use pageserver_api::models::TimelineArchivalState; use pageserver_api::models::TimelineState; use pageserver_api::models::TopTenantShardItem; use pageserver_api::models::WalRedoManagerStatus; @@ -30,7 +31,9 @@ 
use pageserver_api::shard::TenantShardId; use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; use remote_storage::TimeoutOrCancel; +use std::collections::BTreeMap; use std::fmt; +use std::sync::Weak; use std::time::SystemTime; use storage_broker::BrokerClientChannel; use tokio::io::BufReader; @@ -38,7 +41,9 @@ use tokio::sync::watch; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::*; +use upload_queue::NotInitialized; use utils::backoff; +use utils::circuit_breaker::CircuitBreaker; use utils::completion; use utils::crashsafe::path_with_suffix_extension; use utils::failpoint_support; @@ -55,11 +60,9 @@ use self::config::AttachedLocationConfig; use self::config::AttachmentMode; use self::config::LocationConf; use self::config::TenantConf; -use self::delete::DeleteTenantFlow; use self::metadata::TimelineMetadata; use self::mgr::GetActiveTenantError; use self::mgr::GetTenantError; -use self::mgr::TenantsMap; use self::remote_timeline_client::upload::upload_index_part; use self::remote_timeline_client::RemoteTimelineClient; use self::timeline::uninit::TimelineCreateGuard; @@ -75,9 +78,11 @@ use crate::deletion_queue::DeletionQueueClient; use crate::deletion_queue::DeletionQueueError; use crate::import_datadir; use crate::is_uninit_mark; +use crate::l0_flush::L0FlushGlobalState; use crate::metrics::TENANT; use crate::metrics::{ - remove_tenant_metrics, BROKEN_TENANTS_SET, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, + remove_tenant_metrics, BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, + TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, }; use crate::repository::GcResult; use crate::task_mgr; @@ -90,18 +95,16 @@ use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart; use crate::tenant::remote_timeline_client::INITDB_PATH; use crate::tenant::storage_layer::DeltaLayer; use crate::tenant::storage_layer::ImageLayer; +use crate::walredo; use crate::InitializationOrder; use 
std::collections::hash_map::Entry; -use std::collections::BTreeSet; use std::collections::HashMap; use std::collections::HashSet; use std::fmt::Debug; use std::fmt::Display; use std::fs; use std::fs::File; -use std::ops::Bound::Included; -use std::sync::atomic::AtomicU64; -use std::sync::atomic::Ordering; +use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::Arc; use std::sync::Mutex; use std::time::{Duration, Instant}; @@ -137,7 +140,6 @@ pub mod remote_timeline_client; pub mod storage_layer; pub mod config; -pub mod delete; pub mod mgr; pub mod secondary; pub mod tasks; @@ -147,6 +149,7 @@ pub(crate) mod timeline; pub mod size; +mod gc_block; pub(crate) mod throttle; pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; @@ -161,8 +164,6 @@ pub const TENANTS_SEGMENT_NAME: &str = "tenants"; /// Parts of the `.neon/tenants//timelines/` directory prefix. pub const TIMELINES_SEGMENT_NAME: &str = "timelines"; -pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted"; - /// References to shared objects that are passed into each tenant, such /// as the shared remote storage client and process initialization state. #[derive(Clone)] @@ -170,6 +171,7 @@ pub struct TenantSharedResources { pub broker_client: storage_broker::BrokerClientChannel, pub remote_storage: GenericRemoteStorage, pub deletion_queue_client: DeletionQueueClient, + pub l0_flush_global_state: L0FlushGlobalState, } /// A [`Tenant`] is really an _attached_ tenant. 
The configuration @@ -207,7 +209,6 @@ struct TimelinePreload { } pub(crate) struct TenantPreload { - deleting: bool, timelines: HashMap, } @@ -218,8 +219,6 @@ pub(crate) enum SpawnMode { Eager, /// Lazy activation in the background, with the option to skip the queue if the need comes up Lazy, - /// Tenant has been created during the lifetime of this process - Create, } /// @@ -281,13 +280,15 @@ pub struct Tenant { eviction_task_tenant_state: tokio::sync::Mutex, + /// Track repeated failures to compact, so that we can back off. + /// Overhead of mutex is acceptable because compaction is done with a multi-second period. + compaction_circuit_breaker: std::sync::Mutex, + /// If the tenant is in Activating state, notify this to encourage it /// to proceed to Active as soon as possible, rather than waiting for lazy /// background warmup. pub(crate) activate_now_sem: tokio::sync::Semaphore, - pub(crate) delete_progress: Arc>, - // Cancellation token fires when we have entered shutdown(). This is a parent of // Timelines' cancellation token. pub(crate) cancel: CancellationToken, @@ -301,8 +302,20 @@ pub struct Tenant { pub(crate) timeline_get_throttle: Arc>, - /// An ongoing timeline detach must be checked during attempts to GC or compact a timeline. + /// An ongoing timeline detach concurrency limiter. + /// + /// As a tenant will likely be restarted as part of timeline detach ancestor it makes no sense + /// to have two running at the same time. A different one can be started if an earlier one + /// has failed for whatever reason. ongoing_timeline_detach: std::sync::Mutex>, + + /// `index_part.json` based gc blocking reason tracking. + /// + /// New gc iterations must start a new iteration by acquiring `GcBlock::start` before + /// proceeding. 
+ pub(crate) gc_block: gc_block::GcBlock, + + l0_flush_global_state: L0FlushGlobalState, } impl std::fmt::Debug for Tenant { @@ -312,14 +325,66 @@ impl std::fmt::Debug for Tenant { } pub(crate) enum WalRedoManager { - Prod(PostgresRedoManager), + Prod(WalredoManagerId, PostgresRedoManager), #[cfg(test)] Test(harness::TestRedoManager), } -impl From for WalRedoManager { - fn from(mgr: PostgresRedoManager) -> Self { - Self::Prod(mgr) +#[derive(thiserror::Error, Debug)] +#[error("pageserver is shutting down")] +pub(crate) struct GlobalShutDown; + +impl WalRedoManager { + pub(crate) fn new(mgr: PostgresRedoManager) -> Result, GlobalShutDown> { + let id = WalredoManagerId::next(); + let arc = Arc::new(Self::Prod(id, mgr)); + let mut guard = WALREDO_MANAGERS.lock().unwrap(); + match &mut *guard { + Some(map) => { + map.insert(id, Arc::downgrade(&arc)); + Ok(arc) + } + None => Err(GlobalShutDown), + } + } +} + +impl Drop for WalRedoManager { + fn drop(&mut self) { + match self { + Self::Prod(id, _) => { + let mut guard = WALREDO_MANAGERS.lock().unwrap(); + if let Some(map) = &mut *guard { + map.remove(id).expect("new() registers, drop() unregisters"); + } + } + #[cfg(test)] + Self::Test(_) => { + // Not applicable to test redo manager + } + } + } +} + +/// Global registry of all walredo managers so that [`crate::shutdown_pageserver`] can shut down +/// the walredo processes outside of the regular order. 
+/// +/// This is necessary to work around a systemd bug where it freezes if there are +/// walredo processes left => +#[allow(clippy::type_complexity)] +pub(crate) static WALREDO_MANAGERS: once_cell::sync::Lazy< + Mutex>>>, +> = once_cell::sync::Lazy::new(|| Mutex::new(Some(HashMap::new()))); +#[derive(PartialEq, Eq, Hash, Clone, Copy, Debug)] +pub(crate) struct WalredoManagerId(u64); +impl WalredoManagerId { + pub fn next() -> Self { + static NEXT: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1); + let id = NEXT.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + if id == 0 { + panic!("WalredoManagerId::new() returned 0, indicating wraparound, risking it's no longer unique"); + } + Self(id) } } @@ -331,9 +396,20 @@ impl From for WalRedoManager { } impl WalRedoManager { + pub(crate) async fn shutdown(&self) -> bool { + match self { + Self::Prod(_, mgr) => mgr.shutdown().await, + #[cfg(test)] + Self::Test(_) => { + // Not applicable to test redo manager + true + } + } + } + pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) { match self { - Self::Prod(mgr) => mgr.maybe_quiesce(idle_timeout), + Self::Prod(_, mgr) => mgr.maybe_quiesce(idle_timeout), #[cfg(test)] Self::Test(_) => { // Not applicable to test redo manager @@ -351,9 +427,9 @@ impl WalRedoManager { base_img: Option<(Lsn, bytes::Bytes)>, records: Vec<(Lsn, crate::walrecord::NeonWalRecord)>, pg_version: u32, - ) -> anyhow::Result { + ) -> Result { match self { - Self::Prod(mgr) => { + Self::Prod(_, mgr) => { mgr.request_redo(key, lsn, base_img, records, pg_version) .await } @@ -367,7 +443,7 @@ impl WalRedoManager { pub(crate) fn status(&self) -> Option { match self { - WalRedoManager::Prod(m) => Some(m.status()), + WalRedoManager::Prod(_, m) => Some(m.status()), #[cfg(test)] WalRedoManager::Test(_) => None, } @@ -376,6 +452,8 @@ impl WalRedoManager { #[derive(Debug, thiserror::Error, PartialEq, Eq)] pub enum GetTimelineError { + #[error("Timeline is shutting down")] + 
ShuttingDown, #[error("Timeline {tenant_id}/{timeline_id} is not active, state: {state:?}")] NotActive { tenant_id: TenantShardId, @@ -528,6 +606,30 @@ impl From for GcError { } } +impl From for GcError { + fn from(value: NotInitialized) -> Self { + match value { + NotInitialized::Uninitialized => GcError::Remote(value.into()), + NotInitialized::Stopped | NotInitialized::ShuttingDown => GcError::TimelineCancelled, + } + } +} + +impl From for GcError { + fn from(_: timeline::layer_manager::Shutdown) -> Self { + GcError::TimelineCancelled + } +} + +#[derive(thiserror::Error, Debug)] +pub(crate) enum LoadConfigError { + #[error("TOML deserialization error: '{0}'")] + DeserializeToml(#[from] toml_edit::de::Error), + + #[error("Config not found at {0}")] + NotFound(Utf8PathBuf), +} + impl Tenant { /// Yet another helper for timeline initialization. /// @@ -628,6 +730,7 @@ impl Tenant { .read() .await .layer_map() + .expect("currently loading, layer manager cannot be shutdown already") .iter_historic_layers() .next() .is_some(), @@ -654,19 +757,17 @@ impl Tenant { attached_conf: AttachedTenantConf, shard_identity: ShardIdentity, init_order: Option, - tenants: &'static std::sync::RwLock, mode: SpawnMode, ctx: &RequestContext, - ) -> anyhow::Result> { - let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new( - conf, - tenant_shard_id, - ))); + ) -> Result, GlobalShutDown> { + let wal_redo_manager = + WalRedoManager::new(PostgresRedoManager::new(conf, tenant_shard_id))?; let TenantSharedResources { broker_client, remote_storage, deletion_queue_client, + l0_flush_global_state, } = resources; let attach_mode = attached_conf.location.attach_mode; @@ -681,6 +782,7 @@ impl Tenant { tenant_shard_id, remote_storage.clone(), deletion_queue_client, + l0_flush_global_state, )); // The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if @@ -696,10 +798,9 @@ impl Tenant { task_mgr::spawn( &tokio::runtime::Handle::current(), 
TaskKind::Attach, - Some(tenant_shard_id), + tenant_shard_id, None, "attach tenant", - false, async move { info!( @@ -736,9 +837,9 @@ impl Tenant { // The Stopping case is for when we have passed control on to DeleteTenantFlow: // if it errors, we will call make_broken when tenant is already in Stopping. assert!( - matches!(*state, TenantState::Attaching | TenantState::Stopping { .. }), - "the attach task owns the tenant state until activation is complete" - ); + matches!(*state, TenantState::Attaching | TenantState::Stopping { .. }), + "the attach task owns the tenant state until activation is complete" + ); *state = TenantState::broken_from_reason(err.to_string()); }); @@ -806,9 +907,6 @@ impl Tenant { }; let preload = match &mode { - SpawnMode::Create => { - None - }, SpawnMode::Eager | SpawnMode::Lazy => { let _preload_timer = TENANT.preload.start_timer(); let res = tenant_clone @@ -828,59 +926,10 @@ impl Tenant { // Remote preload is complete. drop(remote_load_completion); - let pending_deletion = { - match DeleteTenantFlow::should_resume_deletion( - conf, - preload.as_ref().map(|p| p.deleting).unwrap_or(false), - &tenant_clone, - ) - .await - { - Ok(should_resume_deletion) => should_resume_deletion, - Err(err) => { - make_broken(&tenant_clone, anyhow::anyhow!(err), BrokenVerbosity::Error); - return Ok(()); - } - } - }; - - info!("pending_deletion {}", pending_deletion.is_some()); - - if let Some(deletion) = pending_deletion { - // as we are no longer loading, signal completion by dropping - // the completion while we resume deletion - drop(_completion); - let background_jobs_can_start = - init_order.as_ref().map(|x| &x.background_jobs_can_start); - if let Some(background) = background_jobs_can_start { - info!("waiting for backgound jobs barrier"); - background.clone().wait().await; - info!("ready for backgound jobs barrier"); - } - - let deleted = DeleteTenantFlow::resume_from_attach( - deletion, - &tenant_clone, - preload, - tenants, - &ctx, - ) - .await; - 
- if let Err(e) = deleted { - make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error); - } - - return Ok(()); - } - // We will time the duration of the attach phase unless this is a creation (attach will do no work) let attached = { - let _attach_timer = match mode { - SpawnMode::Create => None, - SpawnMode::Eager | SpawnMode::Lazy => Some(TENANT.attach.start_timer()), - }; - tenant_clone.attach(preload, mode, &ctx).await + let _attach_timer = Some(TENANT.attach.start_timer()); + tenant_clone.attach(preload, &ctx).await }; match attached { @@ -931,21 +980,13 @@ impl Tenant { ) .await?; - let deleting = other_keys.contains(TENANT_DELETED_MARKER_FILE_NAME); - info!( - "found {} timelines, deleting={}", - remote_timeline_ids.len(), - deleting - ); + info!("found {} timelines", remote_timeline_ids.len(),); for k in other_keys { - if k != TENANT_DELETED_MARKER_FILE_NAME { - warn!("Unexpected non timeline key {k}"); - } + warn!("Unexpected non timeline key {k}"); } Ok(TenantPreload { - deleting, timelines: Self::load_timeline_metadata( self, remote_timeline_ids, @@ -964,22 +1005,14 @@ impl Tenant { async fn attach( self: &Arc, preload: Option, - mode: SpawnMode, ctx: &RequestContext, ) -> anyhow::Result<()> { span::debug_assert_current_span_has_tenant_id(); failpoint_support::sleep_millis_async!("before-attaching-tenant"); - let preload = match (preload, mode) { - (Some(p), _) => p, - (None, SpawnMode::Create) => TenantPreload { - deleting: false, - timelines: HashMap::new(), - }, - (None, _) => { - anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624"); - } + let Some(preload) = preload else { + anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624"); }; let mut timelines_to_resume_deletions = vec![]; @@ -1031,6 +1064,8 @@ impl Tenant { } } + let mut gc_blocks = HashMap::new(); + // For every timeline, download the metadata file, scan the local 
directory, // and build a layer map that contains an entry for each remote and local // layer file. @@ -1040,6 +1075,16 @@ impl Tenant { .remove(&timeline_id) .expect("just put it in above"); + if let Some(blocking) = index_part.gc_blocking.as_ref() { + // could just filter these away, but it helps while testing + anyhow::ensure!( + !blocking.reasons.is_empty(), + "index_part for {timeline_id} is malformed: it should not have gc blocking with zero reasons" + ); + let prev = gc_blocks.insert(timeline_id, blocking.reasons); + assert!(prev.is_none()); + } + // TODO again handle early failure self.load_remote_timeline( timeline_id, @@ -1048,6 +1093,7 @@ impl Tenant { TimelineResources { remote_client, timeline_get_throttle: self.timeline_get_throttle.clone(), + l0_flush_global_state: self.l0_flush_global_state.clone(), }, ctx, ) @@ -1083,6 +1129,8 @@ impl Tenant { // IndexPart is the source of truth. self.clean_up_timelines(&existent_timelines)?; + self.gc_block.set_scanned(gc_blocks); + fail::fail_point!("attach-before-activate", |_| { anyhow::bail!("attach-before-activate"); }); @@ -1211,30 +1259,6 @@ impl Tenant { .await } - /// Create a placeholder Tenant object for a broken tenant - pub fn create_broken_tenant( - conf: &'static PageServerConf, - tenant_shard_id: TenantShardId, - remote_storage: GenericRemoteStorage, - reason: String, - ) -> Arc { - Arc::new(Tenant::new( - TenantState::Broken { - reason, - backtrace: String::new(), - }, - conf, - AttachedTenantConf::try_from(LocationConf::default()).unwrap(), - // Shard identity isn't meaningful for a broken tenant: it's just a placeholder - // to occupy the slot for this TenantShardId. 
- ShardIdentity::broken(tenant_shard_id.shard_number, tenant_shard_id.shard_count), - None, - tenant_shard_id, - remote_storage, - DeletionQueueClient::broken(), - )) - } - async fn load_timeline_metadata( self: &Arc, timeline_ids: HashSet, @@ -1298,6 +1322,32 @@ impl Tenant { Ok(timeline_preloads) } + pub(crate) async fn apply_timeline_archival_config( + &self, + timeline_id: TimelineId, + state: TimelineArchivalState, + ) -> anyhow::Result<()> { + let timeline = self + .get_timeline(timeline_id, false) + .context("Cannot apply timeline archival config to inexistent timeline")?; + + let upload_needed = timeline + .remote_client + .schedule_index_upload_for_timeline_archival_state(state)?; + + if upload_needed { + const MAX_WAIT: Duration = Duration::from_secs(10); + let Ok(v) = + tokio::time::timeout(MAX_WAIT, timeline.remote_client.wait_completion()).await + else { + tracing::warn!("reached timeout for waiting on upload queue"); + bail!("reached timeout for upload queue flush"); + }; + v?; + } + Ok(()) + } + pub(crate) fn tenant_shard_id(&self) -> TenantShardId { self.tenant_shard_id } @@ -1441,7 +1491,7 @@ impl Tenant { initdb_lsn: Lsn, pg_version: u32, ctx: &RequestContext, - delta_layer_desc: Vec>, + delta_layer_desc: Vec, image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>, end_lsn: Lsn, ) -> anyhow::Result> { @@ -1626,7 +1676,7 @@ impl Tenant { self: Arc, timeline_id: TimelineId, ) -> Result<(), DeleteTimelineError> { - DeleteTimelineFlow::run(&self, timeline_id, false).await?; + DeleteTimelineFlow::run(&self, timeline_id).await?; Ok(()) } @@ -1671,6 +1721,14 @@ impl Tenant { } } + let _guard = match self.gc_block.start().await { + Ok(guard) => guard, + Err(reasons) => { + info!("Skipping GC: {reasons}"); + return Ok(GcResult::default()); + } + }; + self.gc_iteration_internal(target_timeline_id, horizon, pitr, cancel, ctx) .await } @@ -1679,21 +1737,23 @@ impl Tenant { /// This function is periodically called by compactor task. 
/// Also it can be explicitly requested per timeline through page server /// api's 'compact' command. + /// + /// Returns whether we have pending compaction task. async fn compaction_iteration( &self, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result<(), timeline::CompactionError> { + ) -> Result { // Don't start doing work during shutdown, or when broken, we do not need those in the logs if !self.is_active() { - return Ok(()); + return Ok(false); } { let conf = self.tenant_conf.load(); if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() { info!("Skipping compaction in location state {:?}", conf.location); - return Ok(()); + return Ok(false); } } @@ -1717,14 +1777,36 @@ impl Tenant { timelines_to_compact }; - for (timeline_id, timeline) in &timelines_to_compact { - timeline - .compact(cancel, EnumSet::empty(), ctx) - .instrument(info_span!("compact_timeline", %timeline_id)) - .await?; + // Before doing any I/O work, check our circuit breaker + if self.compaction_circuit_breaker.lock().unwrap().is_broken() { + info!("Skipping compaction due to previous failures"); + return Ok(false); } - Ok(()) + let mut has_pending_task = false; + + for (timeline_id, timeline) in &timelines_to_compact { + has_pending_task |= timeline + .compact(cancel, EnumSet::empty(), ctx) + .instrument(info_span!("compact_timeline", %timeline_id)) + .await + .inspect_err(|e| match e { + timeline::CompactionError::ShuttingDown => (), + timeline::CompactionError::Other(e) => { + self.compaction_circuit_breaker + .lock() + .unwrap() + .fail(&CIRCUIT_BREAKERS_BROKEN, e); + } + })?; + } + + self.compaction_circuit_breaker + .lock() + .unwrap() + .success(&CIRCUIT_BREAKERS_UNBROKEN); + + Ok(has_pending_task) } // Call through to all timelines to freeze ephemeral layers if needed. 
Usually @@ -1809,6 +1891,9 @@ impl Tenant { .values() .filter(|timeline| !(timeline.is_broken() || timeline.is_stopping())); + // Before activation, populate each Timeline's GcInfo with information about its children + self.initialize_gc_info(&timelines_accessor); + // Spawn gc and compaction loops. The loops will shut themselves // down when they notice that the tenant is inactive. tasks::start_background_loops(self, background_jobs_can_start); @@ -1892,9 +1977,15 @@ impl Tenant { // If we're still attaching, fire the cancellation token early to drop out: this // will prevent us flushing, but ensures timely shutdown if some I/O during attach // is very slow. - if matches!(self.current_state(), TenantState::Attaching) { + let shutdown_mode = if matches!(self.current_state(), TenantState::Attaching) { self.cancel.cancel(); - } + + // Having fired our cancellation token, do not try and flush timelines: their cancellation tokens + // are children of ours, so their flush loops will have shut down already + timeline::ShutdownMode::Hard + } else { + shutdown_mode + }; match self.set_stopping(shutdown_progress, false, false).await { Ok(()) => {} @@ -1941,6 +2032,10 @@ impl Tenant { tracing::debug!("Waiting for tasks..."); task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), None).await; + if let Some(walredo_mgr) = self.walredo_mgr.as_ref() { + walredo_mgr.shutdown().await; + } + // Wait for any in-flight operations to complete self.gate.close().await; @@ -2215,6 +2310,7 @@ impl Tenant { // Upload an index from the parent: this is partly to provide freshness for the // child tenants that will copy it, and partly for general ease-of-debugging: there will // always be a parent shard index in the same generation as we wrote the child shard index. 
+ tracing::info!(timeline_id=%timeline.timeline_id, "Uploading index"); timeline .remote_client .schedule_index_upload_for_file_changes()?; @@ -2222,12 +2318,14 @@ impl Tenant { // Shut down the timeline's remote client: this means that the indices we write // for child shards will not be invalidated by the parent shard deleting layers. + tracing::info!(timeline_id=%timeline.timeline_id, "Shutting down remote storage client"); timeline.remote_client.shutdown().await; // Download methods can still be used after shutdown, as they don't flow through the remote client's // queue. In principal the RemoteTimelineClient could provide this without downloading it, but this // operation is rare, so it's simpler to just download it (and robustly guarantees that the index // we use here really is the remotely persistent one). + tracing::info!(timeline_id=%timeline.timeline_id, "Downloading index_part from parent"); let result = timeline.remote_client .download_index_file(&self.cancel) .instrument(info_span!("download_index_file", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id)) @@ -2240,6 +2338,7 @@ impl Tenant { }; for child_shard in child_shards { + tracing::info!(timeline_id=%timeline.timeline_id, "Uploading index_part for child {}", child_shard.to_index()); upload_index_part( &self.remote_storage, child_shard, @@ -2403,13 +2502,6 @@ impl Tenant { .unwrap_or(self.conf.default_tenant_conf.pitr_interval) } - pub fn get_trace_read_requests(&self) -> bool { - let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); - tenant_conf - .trace_read_requests - .unwrap_or(self.conf.default_tenant_conf.trace_read_requests) - } - pub fn get_min_resident_size_override(&self) -> Option { let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf @@ -2553,7 +2645,12 @@ impl Tenant { tenant_shard_id: TenantShardId, remote_storage: GenericRemoteStorage, deletion_queue_client: DeletionQueueClient, 
+ l0_flush_global_state: L0FlushGlobalState, ) -> Tenant { + debug_assert!( + !attached_conf.location.generation.is_none() || conf.control_plane_api.is_none() + ); + let (state, mut rx) = watch::channel(state); tokio::spawn(async move { @@ -2627,8 +2724,15 @@ impl Tenant { cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()), cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)), eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()), + compaction_circuit_breaker: std::sync::Mutex::new(CircuitBreaker::new( + format!("compaction-{tenant_shard_id}"), + 5, + // Compaction can be a very expensive operation, and might leak disk space. It also ought + // to be infallible, as long as remote storage is available. So if it repeatedly fails, + // use an extremely long backoff. + Some(Duration::from_secs(3600 * 24)), + )), activate_now_sem: tokio::sync::Semaphore::new(0), - delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())), cancel: CancellationToken::default(), gate: Gate::default(), timeline_get_throttle: Arc::new(throttle::Throttle::new( @@ -2637,6 +2741,8 @@ impl Tenant { )), tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)), ongoing_timeline_detach: std::sync::Mutex::default(), + gc_block: Default::default(), + l0_flush_global_state, } } @@ -2644,59 +2750,35 @@ impl Tenant { pub(super) fn load_tenant_config( conf: &'static PageServerConf, tenant_shard_id: &TenantShardId, - ) -> anyhow::Result { - let legacy_config_path = conf.tenant_config_path(tenant_shard_id); + ) -> Result { let config_path = conf.tenant_location_config_path(tenant_shard_id); - if config_path.exists() { - // New-style config takes precedence - let deserialized = Self::read_config(&config_path)?; - Ok(toml_edit::de::from_document::(deserialized)?) 
- } else if legacy_config_path.exists() { - // Upgrade path: found an old-style configuration only - let deserialized = Self::read_config(&legacy_config_path)?; - - let mut tenant_conf = TenantConfOpt::default(); - for (key, item) in deserialized.iter() { - match key { - "tenant_config" => { - tenant_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("Failed to parse config from file '{legacy_config_path}' as pageserver config"))?; - } - _ => bail!( - "config file {legacy_config_path} has unrecognized pageserver option '{key}'" - ), - } - } - - // Legacy configs are implicitly in attached state, and do not support sharding - Ok(LocationConf::attached_single( - tenant_conf, - Generation::none(), - &models::ShardParameters::default(), - )) - } else { - // FIXME If the config file is not found, assume that we're attaching - // a detached tenant and config is passed via attach command. - // https://github.com/neondatabase/neon/issues/1555 - // OR: we're loading after incomplete deletion that managed to remove config. - info!( - "tenant config not found in {} or {}", - config_path, legacy_config_path - ); - Ok(LocationConf::default()) - } - } - - fn read_config(path: &Utf8Path) -> anyhow::Result { - info!("loading tenant configuration from {path}"); + info!("loading tenant configuration from {config_path}"); // load and parse file - let config = fs::read_to_string(path) - .with_context(|| format!("Failed to load config from path '{path}'"))?; + let config = fs::read_to_string(&config_path).map_err(|e| { + match e.kind() { + std::io::ErrorKind::NotFound => { + // The config should almost always exist for a tenant directory: + // - When attaching a tenant, the config is the first thing we write + // - When detaching a tenant, we atomically move the directory to a tmp location + // before deleting contents. + // + // The very rare edge case that can result in a missing config is if we crash during attach + // between creating directory and writing config. 
Callers should handle that as if the + // directory didn't exist. - config - .parse::() - .with_context(|| format!("Failed to parse config from file '{path}' as toml file")) + LoadConfigError::NotFound(config_path) + } + _ => { + // No IO errors except NotFound are acceptable here: other kinds of error indicate local storage or permissions issues + // that we cannot cleanly recover + crate::virtual_file::on_fatal_io_error(&e, "Reading tenant config file") + } + } + })?; + + Ok(toml_edit::de::from_str::(&config)?) } #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))] @@ -2704,48 +2786,18 @@ impl Tenant { conf: &'static PageServerConf, tenant_shard_id: &TenantShardId, location_conf: &LocationConf, - ) -> anyhow::Result<()> { - let legacy_config_path = conf.tenant_config_path(tenant_shard_id); + ) -> std::io::Result<()> { let config_path = conf.tenant_location_config_path(tenant_shard_id); - Self::persist_tenant_config_at( - tenant_shard_id, - &config_path, - &legacy_config_path, - location_conf, - ) - .await + Self::persist_tenant_config_at(tenant_shard_id, &config_path, location_conf).await } #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))] pub(super) async fn persist_tenant_config_at( tenant_shard_id: &TenantShardId, config_path: &Utf8Path, - legacy_config_path: &Utf8Path, location_conf: &LocationConf, - ) -> anyhow::Result<()> { - if let LocationMode::Attached(attach_conf) = &location_conf.mode { - // The modern-style LocationConf config file requires a generation to be set. In case someone - // is running a pageserver without the infrastructure to set generations, write out the legacy-style - // config file that only contains TenantConf. 
- // - // This will eventually be removed in https://github.com/neondatabase/neon/issues/5388 - - if attach_conf.generation.is_none() { - tracing::info!( - "Running without generations, writing legacy-style tenant config file" - ); - Self::persist_tenant_config_legacy( - tenant_shard_id, - legacy_config_path, - &location_conf.tenant_conf, - ) - .await?; - - return Ok(()); - } - } - + ) -> std::io::Result<()> { debug!("persisting tenantconf to {config_path}"); let mut conf_content = r#"# This file contains a specific per-tenant's config. @@ -2754,53 +2806,20 @@ impl Tenant { .to_string(); fail::fail_point!("tenant-config-before-write", |_| { - anyhow::bail!("tenant-config-before-write"); + Err(std::io::Error::new( + std::io::ErrorKind::Other, + "tenant-config-before-write", + )) }); // Convert the config to a toml file. - conf_content += &toml_edit::ser::to_string_pretty(&location_conf)?; + conf_content += + &toml_edit::ser::to_string_pretty(&location_conf).expect("Config serialization failed"); let temp_path = path_with_suffix_extension(config_path, TEMP_FILE_SUFFIX); - let tenant_shard_id = *tenant_shard_id; - let config_path = config_path.to_owned(); let conf_content = conf_content.into_bytes(); - VirtualFile::crashsafe_overwrite(config_path.clone(), temp_path, conf_content) - .await - .with_context(|| format!("write tenant {tenant_shard_id} config to {config_path}"))?; - - Ok(()) - } - - #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))] - async fn persist_tenant_config_legacy( - tenant_shard_id: &TenantShardId, - target_config_path: &Utf8Path, - tenant_conf: &TenantConfOpt, - ) -> anyhow::Result<()> { - debug!("persisting tenantconf to {target_config_path}"); - - let mut conf_content = r#"# This file contains a specific per-tenant's config. -# It is read in case of pageserver restart. - -[tenant_config] -"# - .to_string(); - - // Convert the config to a toml file. 
- conf_content += &toml_edit::ser::to_string(&tenant_conf)?; - - let temp_path = path_with_suffix_extension(target_config_path, TEMP_FILE_SUFFIX); - - let tenant_shard_id = *tenant_shard_id; - let target_config_path = target_config_path.to_owned(); - let conf_content = conf_content.into_bytes(); - VirtualFile::crashsafe_overwrite(target_config_path.clone(), temp_path, conf_content) - .await - .with_context(|| { - format!("write tenant {tenant_shard_id} config to {target_config_path}") - })?; - Ok(()) + VirtualFile::crashsafe_overwrite(config_path.to_owned(), temp_path, conf_content).await } // @@ -2909,6 +2928,55 @@ impl Tenant { .await } + /// Populate all Timelines' `GcInfo` with information about their children. We do not set the + /// PITR cutoffs here, because that requires I/O: this is done later, before GC, by [`Self::refresh_gc_info_internal`] + /// + /// Subsequently, parent-child relationships are updated incrementally during timeline creation/deletion. + fn initialize_gc_info( + &self, + timelines: &std::sync::MutexGuard>>, + ) { + // This function must be called before activation: after activation timeline create/delete operations + // might happen, and this function is not safe to run concurrently with those. + assert!(!self.is_active()); + + // Scan all timelines. For each timeline, remember the timeline ID and + // the branch point where it was created. 
+ let mut all_branchpoints: BTreeMap> = BTreeMap::new(); + timelines.iter().for_each(|(timeline_id, timeline_entry)| { + if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() { + let ancestor_children = all_branchpoints.entry(*ancestor_timeline_id).or_default(); + ancestor_children.push((timeline_entry.get_ancestor_lsn(), *timeline_id)); + } + }); + + // The number of bytes we always keep, irrespective of PITR: this is a constant across timelines + let horizon = self.get_gc_horizon(); + + // Populate each timeline's GcInfo with information about its child branches + for timeline in timelines.values() { + let mut branchpoints: Vec<(Lsn, TimelineId)> = all_branchpoints + .remove(&timeline.timeline_id) + .unwrap_or_default(); + + branchpoints.sort_by_key(|b| b.0); + + let mut target = timeline.gc_info.write().unwrap(); + + target.retain_lsns = branchpoints; + + let space_cutoff = timeline + .get_last_record_lsn() + .checked_sub(horizon) + .unwrap_or(Lsn(0)); + + target.cutoffs = GcCutoffs { + space: space_cutoff, + time: Lsn::INVALID, + }; + } + } + async fn refresh_gc_info_internal( &self, target_timeline_id: Option, @@ -2931,6 +2999,11 @@ impl Tenant { .cloned() .collect::>(); + if target_timeline_id.is_some() && timelines.is_empty() { + // We were to act on a particular timeline and it wasn't found + return Err(GcError::TimelineNotFound); + } + let mut gc_cutoffs: HashMap = HashMap::with_capacity(timelines.len()); @@ -2953,88 +3026,58 @@ impl Tenant { // because that will stall branch creation. let gc_cs = self.gc_cs.lock().await; - // Scan all timelines. For each timeline, remember the timeline ID and - // the branch point where it was created. 
- let (all_branchpoints, timelines): (BTreeSet<(TimelineId, Lsn)>, _) = { - let timelines = self.timelines.lock().unwrap(); - let mut all_branchpoints = BTreeSet::new(); - let timelines = { - if let Some(target_timeline_id) = target_timeline_id.as_ref() { - if timelines.get(target_timeline_id).is_none() { - return Err(GcError::TimelineNotFound); - } - }; - - timelines - .iter() - .map(|(_timeline_id, timeline_entry)| { - if let Some(ancestor_timeline_id) = - &timeline_entry.get_ancestor_timeline_id() - { - // If target_timeline is specified, we only need to know branchpoints of its children - if let Some(timeline_id) = target_timeline_id { - if ancestor_timeline_id == &timeline_id { - all_branchpoints.insert(( - *ancestor_timeline_id, - timeline_entry.get_ancestor_lsn(), - )); - } - } - // Collect branchpoints for all timelines - else { - all_branchpoints.insert(( - *ancestor_timeline_id, - timeline_entry.get_ancestor_lsn(), - )); - } - } - - timeline_entry.clone() - }) - .collect::>() - }; - (all_branchpoints, timelines) - }; - // Ok, we now know all the branch points. // Update the GC information for each timeline. 
let mut gc_timelines = Vec::with_capacity(timelines.len()); for timeline in timelines { - // If target_timeline is specified, ignore all other timelines + // We filtered the timeline list above if let Some(target_timeline_id) = target_timeline_id { - if timeline.timeline_id != target_timeline_id { - continue; - } + assert_eq!(target_timeline_id, timeline.timeline_id); } - let branchpoints: Vec = all_branchpoints - .range(( - Included((timeline.timeline_id, Lsn(0))), - Included((timeline.timeline_id, Lsn(u64::MAX))), - )) - .map(|&x| x.1) - .collect(); - { let mut target = timeline.gc_info.write().unwrap(); + // Cull any expired leases let now = SystemTime::now(); target.leases.retain(|_, lease| !lease.is_expired(&now)); - match gc_cutoffs.remove(&timeline.timeline_id) { - Some(cutoffs) => { - target.retain_lsns = branchpoints; - target.cutoffs = cutoffs; + timeline + .metrics + .valid_lsn_lease_count_gauge + .set(target.leases.len() as u64); + + // Look up parent's PITR cutoff to update the child's knowledge of whether it is within parent's PITR + if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() { + if let Some(ancestor_gc_cutoffs) = gc_cutoffs.get(&ancestor_id) { + target.within_ancestor_pitr = + timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.time; } - None => { - // reasons for this being unavailable: - // - this timeline was created while we were finding cutoffs - // - lsn for timestamp search fails for this timeline repeatedly - // - // in both cases, refreshing the branchpoints is correct. - target.retain_lsns = branchpoints; - } - }; + } + + // Update metrics that depend on GC state + timeline + .metrics + .archival_size + .set(if target.within_ancestor_pitr { + timeline.metrics.current_logical_size_gauge.get() + } else { + 0 + }); + timeline.metrics.pitr_history_size.set( + timeline + .get_last_record_lsn() + .checked_sub(target.cutoffs.time) + .unwrap_or(Lsn(0)) + .0, + ); + + // Apply the cutoffs we found to the Timeline's GcInfo. 
Why might we _not_ have cutoffs for a timeline? + // - this timeline was created while we were finding cutoffs + // - lsn for timestamp search fails for this timeline repeatedly + if let Some(cutoffs) = gc_cutoffs.get(&timeline.timeline_id) { + target.cutoffs = cutoffs.clone(); + } } gc_timelines.push(timeline); @@ -3072,7 +3115,7 @@ impl Tenant { dst_id: TimelineId, ancestor_lsn: Option, ctx: &RequestContext, - delta_layer_desc: Vec>, + delta_layer_desc: Vec, image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>, end_lsn: Lsn, ) -> anyhow::Result> { @@ -3456,6 +3499,7 @@ impl Tenant { TimelineResources { remote_client, timeline_get_throttle: self.timeline_get_throttle.clone(), + l0_flush_global_state: self.l0_flush_global_state.clone(), } } @@ -3692,6 +3736,19 @@ impl Tenant { pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt { self.tenant_conf.load().tenant_conf.clone() } + + /// How much local storage would this tenant like to have? It can cope with + /// less than this (via eviction and on-demand downloads), but this function enables + /// the Tenant to advertise how much storage it would prefer to have to provide fast I/O + /// by keeping important things on local disk. 
+ pub(crate) fn local_storage_wanted(&self) -> u64 { + let mut wanted = 0; + let timelines = self.timelines.lock().unwrap(); + for timeline in timelines.values() { + wanted += timeline.metrics.visible_physical_size_gauge.get(); + } + wanted + } } /// Create the cluster temporarily in 'initdbpath' directory inside the repository @@ -3792,6 +3849,7 @@ pub(crate) mod harness { use utils::logging; use crate::deletion_queue::mock::MockDeletionQueue; + use crate::l0_flush::L0FlushConfig; use crate::walredo::apply_neon; use crate::{repository::Key, walrecord::NeonWalRecord}; @@ -3829,7 +3887,6 @@ pub(crate) mod harness { walreceiver_connect_timeout: Some(tenant_conf.walreceiver_connect_timeout), lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout), max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag), - trace_read_requests: Some(tenant_conf.trace_read_requests), eviction_policy: Some(tenant_conf.eviction_policy), min_resident_size_override: tenant_conf.min_resident_size_override, evictions_low_residence_duration_metric_threshold: Some( @@ -3875,7 +3932,7 @@ pub(crate) mod harness { } impl TenantHarness { - pub fn create_custom( + pub async fn create_custom( test_name: &'static str, tenant_conf: TenantConf, tenant_id: TenantId, @@ -3906,10 +3963,12 @@ pub(crate) mod harness { let remote_fs_dir = conf.workdir.join("localfs"); std::fs::create_dir_all(&remote_fs_dir).unwrap(); let config = RemoteStorageConfig { - storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), + storage: RemoteStorageKind::LocalFs { + local_path: remote_fs_dir.clone(), + }, timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }; - let remote_storage = GenericRemoteStorage::from_config(&config).unwrap(); + let remote_storage = GenericRemoteStorage::from_config(&config).await.unwrap(); let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone())); Ok(Self { @@ -3924,7 +3983,7 @@ pub(crate) mod harness { }) } - pub fn create(test_name: &'static str) -> anyhow::Result { + pub async fn 
create(test_name: &'static str) -> anyhow::Result { // Disable automatic GC and compaction to make the unit tests more deterministic. // The tests perform them manually if needed. let tenant_conf = TenantConf { @@ -3941,6 +4000,7 @@ pub(crate) mod harness { shard, Generation::new(0xdeadbeef), ) + .await } pub fn span(&self) -> tracing::Span { @@ -3979,12 +4039,14 @@ pub(crate) mod harness { self.tenant_shard_id, self.remote_storage.clone(), self.deletion_queue.new_client(), + // TODO: ideally we should run all unit tests with both configs + L0FlushGlobalState::new(L0FlushConfig::default()), )); let preload = tenant .preload(&self.remote_storage, CancellationToken::new()) .await?; - tenant.attach(Some(preload), SpawnMode::Eager, ctx).await?; + tenant.attach(Some(preload), ctx).await?; tenant.state.send_replace(TenantState::Active); for timeline in tenant.timelines.lock().unwrap().values() { @@ -4012,7 +4074,7 @@ pub(crate) mod harness { base_img: Option<(Lsn, Bytes)>, records: Vec<(Lsn, NeonWalRecord)>, _pg_version: u32, - ) -> anyhow::Result { + ) -> Result { let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1)); if records_neon { // For Neon wal records, we can decode without spawning postgres, so do so. 
@@ -4046,7 +4108,7 @@ pub(crate) mod harness { #[cfg(test)] mod tests { - use std::collections::BTreeMap; + use std::collections::{BTreeMap, BTreeSet}; use super::*; use crate::keyspace::KeySpaceAccum; @@ -4066,6 +4128,8 @@ mod tests { use storage_layer::PersistentLayerKey; use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; + use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn}; + use timeline::{DeltaLayerTestDesc, GcInfo}; use utils::bin_ser::BeSer; use utils::id::TenantId; @@ -4074,7 +4138,7 @@ mod tests { #[tokio::test] async fn test_basic() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_basic")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_basic").await?.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; @@ -4121,7 +4185,8 @@ mod tests { #[tokio::test] async fn no_duplicate_timelines() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("no_duplicate_timelines")? + let (tenant, ctx) = TenantHarness::create("no_duplicate_timelines") + .await? .load() .await; let _ = tenant @@ -4153,7 +4218,7 @@ mod tests { async fn test_branch() -> anyhow::Result<()> { use std::str::from_utf8; - let (tenant, ctx) = TenantHarness::create("test_branch")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_branch").await?.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; @@ -4275,7 +4340,8 @@ mod tests { #[tokio::test] async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> { let (tenant, ctx) = - TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")? + TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data") + .await? 
.load() .await; let tline = tenant @@ -4322,7 +4388,8 @@ mod tests { #[tokio::test] async fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> anyhow::Result<()> { let (tenant, ctx) = - TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")? + TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn") + .await? .load() .await; @@ -4344,7 +4411,7 @@ mod tests { .source() .unwrap() .to_string() - .contains("is earlier than latest GC horizon")); + .contains("is earlier than latest GC cutoff")); } } @@ -4377,7 +4444,8 @@ mod tests { #[tokio::test] async fn test_get_branchpoints_from_an_inactive_timeline() -> anyhow::Result<()> { let (tenant, ctx) = - TenantHarness::create("test_get_branchpoints_from_an_inactive_timeline")? + TenantHarness::create("test_get_branchpoints_from_an_inactive_timeline") + .await? .load() .await; let tline = tenant @@ -4411,7 +4479,7 @@ mod tests { { let branchpoints = &tline.gc_info.read().unwrap().retain_lsns; assert_eq!(branchpoints.len(), 1); - assert_eq!(branchpoints[0], Lsn(0x40)); + assert_eq!(branchpoints[0], (Lsn(0x40), NEW_TIMELINE_ID)); } // You can read the key from the child branch even though the parent is @@ -4423,10 +4491,13 @@ mod tests { // This needs to traverse to the parent, and fails. let err = newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await.unwrap_err(); - assert!(err.to_string().starts_with(&format!( - "Bad state on timeline {}: Broken", - tline.timeline_id - ))); + assert!( + err.to_string().starts_with(&format!( + "bad state on timeline {}: Broken", + tline.timeline_id + )), + "{err}" + ); Ok(()) } @@ -4434,7 +4505,8 @@ mod tests { #[tokio::test] async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> { let (tenant, ctx) = - TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")? + TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child") + .await? 
.load() .await; let tline = tenant @@ -4464,10 +4536,10 @@ mod tests { } #[tokio::test] async fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> { - let (tenant, ctx) = - TenantHarness::create("test_parent_keeps_data_forever_after_branching")? - .load() - .await; + let (tenant, ctx) = TenantHarness::create("test_parent_keeps_data_forever_after_branching") + .await? + .load() + .await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; @@ -4505,7 +4577,7 @@ mod tests { #[tokio::test] async fn timeline_load() -> anyhow::Result<()> { const TEST_NAME: &str = "timeline_load"; - let harness = TenantHarness::create(TEST_NAME)?; + let harness = TenantHarness::create(TEST_NAME).await?; { let (tenant, ctx) = harness.load().await; let tline = tenant @@ -4532,7 +4604,7 @@ mod tests { #[tokio::test] async fn timeline_load_with_ancestor() -> anyhow::Result<()> { const TEST_NAME: &str = "timeline_load_with_ancestor"; - let harness = TenantHarness::create(TEST_NAME)?; + let harness = TenantHarness::create(TEST_NAME).await?; // create two timelines { let (tenant, ctx) = harness.load().await; @@ -4580,7 +4652,10 @@ mod tests { #[tokio::test] async fn delta_layer_dumping() -> anyhow::Result<()> { use storage_layer::AsLayerDesc; - let (tenant, ctx) = TenantHarness::create("test_layer_dumping")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_layer_dumping") + .await? + .load() + .await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; @@ -4588,10 +4663,10 @@ mod tests { let layer_map = tline.layers.read().await; let level0_deltas = layer_map - .layer_map() - .get_level0_deltas()? - .into_iter() - .map(|desc| layer_map.get_from_desc(&desc)) + .layer_map()? 
+ .level0_deltas() + .iter() + .map(|desc| layer_map.get_from_desc(desc)) .collect::>(); assert!(!level0_deltas.is_empty()); @@ -4607,7 +4682,7 @@ mod tests { #[tokio::test] async fn test_images() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_images")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_images").await?.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; @@ -4711,7 +4786,7 @@ mod tests { lsn: Lsn, repeat: usize, key_count: usize, - ) -> anyhow::Result<()> { + ) -> anyhow::Result>> { let compact = true; bulk_insert_maybe_compact_gc(tenant, timeline, ctx, lsn, repeat, key_count, compact).await } @@ -4724,7 +4799,9 @@ mod tests { repeat: usize, key_count: usize, compact: bool, - ) -> anyhow::Result<()> { + ) -> anyhow::Result>> { + let mut inserted: HashMap> = Default::default(); + let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let mut blknum = 0; @@ -4745,6 +4822,7 @@ mod tests { ctx, ) .await?; + inserted.entry(test_key).or_default().insert(lsn); writer.finish_write(lsn); drop(writer); @@ -4769,7 +4847,7 @@ mod tests { assert_eq!(res.layers_removed, 0, "this never removes anything"); } - Ok(()) + Ok(inserted) } // @@ -4778,7 +4856,7 @@ mod tests { // #[tokio::test] async fn test_bulk_insert() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_bulk_insert")?; + let harness = TenantHarness::create("test_bulk_insert").await?; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) @@ -4809,21 +4887,23 @@ mod tests { // so the search can stop at the first delta layer and doesn't traverse any deeper. 
#[tokio::test] async fn test_get_vectored() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_get_vectored")?; + let harness = TenantHarness::create("test_get_vectored").await?; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; let lsn = Lsn(0x10); - bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?; + let inserted = bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?; let guard = tline.layers.read().await; - guard.layer_map().dump(true, &ctx).await?; + let lm = guard.layer_map()?; + + lm.dump(true, &ctx).await?; let mut reads = Vec::new(); let mut prev = None; - guard.layer_map().iter_historic_layers().for_each(|desc| { + lm.iter_historic_layers().for_each(|desc| { if !desc.is_delta() { prev = Some(desc.clone()); return; @@ -4877,9 +4957,39 @@ mod tests { &ctx, ) .await; - tline - .validate_get_vectored_impl(&vectored_res, read, reads_lsn, &ctx) - .await; + + let mut expected_lsns: HashMap = Default::default(); + let mut expect_missing = false; + let mut key = read.start().unwrap(); + while key != read.end().unwrap() { + if let Some(lsns) = inserted.get(&key) { + let expected_lsn = lsns.iter().rfind(|lsn| **lsn <= reads_lsn); + match expected_lsn { + Some(lsn) => { + expected_lsns.insert(key, *lsn); + } + None => { + expect_missing = true; + break; + } + } + } else { + expect_missing = true; + break; + } + + key = key.next(); + } + + if expect_missing { + assert!(matches!(vectored_res, Err(GetVectoredError::MissingKey(_)))); + } else { + for (key, image) in vectored_res? 
{ + let expected_lsn = expected_lsns.get(&key).expect("determined above"); + let expected_image = test_img(&format!("{} at {}", key.field6, expected_lsn)); + assert_eq!(image?, expected_image); + } + } } Ok(()) @@ -4887,7 +4997,7 @@ mod tests { #[tokio::test] async fn test_get_vectored_aux_files() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_get_vectored_aux_files")?; + let harness = TenantHarness::create("test_get_vectored_aux_files").await?; let (tenant, ctx) = harness.load().await; let tline = tenant @@ -4929,10 +5039,6 @@ mod tests { ) .await; - child_timeline - .validate_get_vectored_impl(&vectored_res, aux_keyspace, read_lsn, &ctx) - .await; - let images = vectored_res?; assert!(images.is_empty()); Ok(()) @@ -4973,7 +5079,8 @@ mod tests { TenantId::generate(), ShardIdentity::unsharded(), Generation::new(0xdeadbeef), - )?; + ) + .await?; let (tenant, ctx) = harness.load().await; let mut current_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); @@ -5116,7 +5223,7 @@ mod tests { // ``` #[tokio::test] async fn test_get_vectored_ancestor_descent() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_get_vectored_on_lsn_axis")?; + let harness = TenantHarness::create("test_get_vectored_on_lsn_axis").await?; let (tenant, ctx) = harness.load().await; let start_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); @@ -5265,7 +5372,7 @@ mod tests { name: &'static str, compaction_algorithm: CompactionAlgorithm, ) -> anyhow::Result<()> { - let mut harness = TenantHarness::create(name)?; + let mut harness = TenantHarness::create(name).await?; harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings { kind: compaction_algorithm, }; @@ -5349,7 +5456,8 @@ mod tests { #[tokio::test] async fn test_traverse_branches() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_traverse_branches")? + let (tenant, ctx) = TenantHarness::create("test_traverse_branches") + .await? 
.load() .await; let mut tline = tenant @@ -5439,7 +5547,8 @@ mod tests { #[tokio::test] async fn test_traverse_ancestors() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_traverse_ancestors")? + let (tenant, ctx) = TenantHarness::create("test_traverse_ancestors") + .await? .load() .await; let mut tline = tenant @@ -5505,7 +5614,8 @@ mod tests { #[tokio::test] async fn test_write_at_initdb_lsn_takes_optimization_code_path() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_empty_test_timeline_is_usable")? + let (tenant, ctx) = TenantHarness::create("test_empty_test_timeline_is_usable") + .await? .load() .await; @@ -5574,7 +5684,7 @@ mod tests { #[tokio::test] async fn test_create_guard_crash() -> anyhow::Result<()> { let name = "test_create_guard_crash"; - let harness = TenantHarness::create(name)?; + let harness = TenantHarness::create(name).await?; { let (tenant, ctx) = harness.load().await; let tline = tenant @@ -5627,7 +5737,7 @@ mod tests { name: &'static str, compaction_algorithm: CompactionAlgorithm, ) -> anyhow::Result<()> { - let mut harness = TenantHarness::create(name)?; + let mut harness = TenantHarness::create(name).await?; harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings { kind: compaction_algorithm, }; @@ -5651,7 +5761,7 @@ mod tests { #[tokio::test] async fn test_metadata_scan() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_metadata_scan")?; + let harness = TenantHarness::create("test_metadata_scan").await?; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) @@ -5770,7 +5880,7 @@ mod tests { #[tokio::test] async fn test_metadata_compaction_trigger() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_metadata_compaction_trigger")?; + let harness = TenantHarness::create("test_metadata_compaction_trigger").await?; let (tenant, ctx) = harness.load().await; let 
tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) @@ -5799,23 +5909,12 @@ mod tests { tline.freeze_and_flush().await?; // force create a delta layer } - let before_num_l0_delta_files = tline - .layers - .read() - .await - .layer_map() - .get_level0_deltas()? - .len(); + let before_num_l0_delta_files = + tline.layers.read().await.layer_map()?.level0_deltas().len(); tline.compact(&cancel, EnumSet::empty(), &ctx).await?; - let after_num_l0_delta_files = tline - .layers - .read() - .await - .layer_map() - .get_level0_deltas()? - .len(); + let after_num_l0_delta_files = tline.layers.read().await.layer_map()?.level0_deltas().len(); assert!(after_num_l0_delta_files < before_num_l0_delta_files, "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}"); @@ -5829,7 +5928,9 @@ mod tests { #[tokio::test] async fn test_branch_copies_dirty_aux_file_flag() { - let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag").unwrap(); + let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag") + .await + .unwrap(); // the default aux file policy to switch is v1 if not set by the admins assert_eq!( @@ -5931,7 +6032,9 @@ mod tests { #[tokio::test] async fn aux_file_policy_switch() { - let mut harness = TenantHarness::create("aux_file_policy_switch").unwrap(); + let mut harness = TenantHarness::create("aux_file_policy_switch") + .await + .unwrap(); harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::CrossValidation; // set to cross-validation mode let (tenant, ctx) = harness.load().await; @@ -6105,7 +6208,9 @@ mod tests { #[tokio::test] async fn aux_file_policy_force_switch() { - let mut harness = TenantHarness::create("aux_file_policy_force_switch").unwrap(); + let mut harness = TenantHarness::create("aux_file_policy_force_switch") + .await + .unwrap(); harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V1; let (tenant, ctx) = 
harness.load().await; @@ -6166,7 +6271,9 @@ mod tests { #[tokio::test] async fn aux_file_policy_auto_detect() { - let mut harness = TenantHarness::create("aux_file_policy_auto_detect").unwrap(); + let mut harness = TenantHarness::create("aux_file_policy_auto_detect") + .await + .unwrap(); harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V2; // set to cross-validation mode let (tenant, ctx) = harness.load().await; @@ -6229,7 +6336,7 @@ mod tests { #[tokio::test] async fn test_metadata_image_creation() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_metadata_image_creation")?; + let harness = TenantHarness::create("test_metadata_image_creation").await?; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) @@ -6328,7 +6435,7 @@ mod tests { #[tokio::test] async fn test_vectored_missing_data_key_reads() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?; + let harness = TenantHarness::create("test_vectored_missing_data_key_reads").await?; let (tenant, ctx) = harness.load().await; let base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); @@ -6361,27 +6468,6 @@ mod tests { .await .unwrap(); - async fn get_vectored_impl_wrapper( - tline: &Arc, - key: Key, - lsn: Lsn, - ctx: &RequestContext, - ) -> Result, GetVectoredError> { - let mut reconstruct_state = ValuesReconstructState::new(); - let mut res = tline - .get_vectored_impl( - KeySpace::single(key..key.next()), - lsn, - &mut reconstruct_state, - ctx, - ) - .await?; - Ok(res.pop_last().map(|(k, v)| { - assert_eq!(k, key); - v.unwrap() - })) - } - let lsn = Lsn(0x30); // test vectored get on parent timeline @@ -6421,7 +6507,7 @@ mod tests { #[tokio::test] async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?; + let harness = 
TenantHarness::create("test_vectored_missing_metadata_key_reads").await?; let (tenant, ctx) = harness.load().await; let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap(); @@ -6457,27 +6543,6 @@ mod tests { .await .unwrap(); - async fn get_vectored_impl_wrapper( - tline: &Arc, - key: Key, - lsn: Lsn, - ctx: &RequestContext, - ) -> Result, GetVectoredError> { - let mut reconstruct_state = ValuesReconstructState::new(); - let mut res = tline - .get_vectored_impl( - KeySpace::single(key..key.next()), - lsn, - &mut reconstruct_state, - ctx, - ) - .await?; - Ok(res.pop_last().map(|(k, v)| { - assert_eq!(k, key); - v.unwrap() - })) - } - let lsn = Lsn(0x30); // test vectored get on parent timeline @@ -6534,7 +6599,7 @@ mod tests { #[tokio::test] async fn test_metadata_tombstone_reads() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_metadata_tombstone_reads")?; + let harness = TenantHarness::create("test_metadata_tombstone_reads").await?; let (tenant, ctx) = harness.load().await; let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap(); let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap(); @@ -6553,9 +6618,18 @@ mod tests { &ctx, // delta layers vec![ - vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], - vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], - vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x20), + vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + ), ], // image layers vec![ @@ -6605,7 +6679,9 @@ mod tests { #[tokio::test] async fn test_metadata_tombstone_image_creation() { - let harness = 
TenantHarness::create("test_metadata_tombstone_image_creation").unwrap(); + let harness = TenantHarness::create("test_metadata_tombstone_image_creation") + .await + .unwrap(); let (tenant, ctx) = harness.load().await; let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap(); @@ -6621,17 +6697,29 @@ mod tests { &ctx, // delta layers vec![ - vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], - vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], - vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], - vec![ - (key0, Lsn(0x30), Value::Image(test_img("metadata key 0"))), - (key3, Lsn(0x30), Value::Image(test_img("metadata key 3"))), - ], + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x20), + vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x30)..Lsn(0x40), + vec![ + (key0, Lsn(0x30), Value::Image(test_img("metadata key 0"))), + (key3, Lsn(0x30), Value::Image(test_img("metadata key 3"))), + ], + ), ], // image layers vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])], - Lsn(0x30), + Lsn(0x40), ) .await .unwrap(); @@ -6654,7 +6742,7 @@ mod tests { // Image layers are created at last_record_lsn let images = tline - .inspect_image_layers(Lsn(0x30), &ctx) + .inspect_image_layers(Lsn(0x40), &ctx) .await .unwrap() .into_iter() @@ -6665,8 +6753,9 @@ mod tests { #[tokio::test] async fn test_metadata_tombstone_empty_image_creation() { - let harness = - TenantHarness::create("test_metadata_tombstone_empty_image_creation").unwrap(); + let harness = TenantHarness::create("test_metadata_tombstone_empty_image_creation") + .await + .unwrap(); let (tenant, ctx) = 
harness.load().await; let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap(); @@ -6680,9 +6769,18 @@ mod tests { &ctx, // delta layers vec![ - vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], - vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], - vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x20), + vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + ), ], // image layers vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])], @@ -6720,7 +6818,7 @@ mod tests { #[tokio::test] async fn test_simple_bottom_most_compaction_images() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_simple_bottom_most_compaction_images")?; + let harness = TenantHarness::create("test_simple_bottom_most_compaction_images").await?; let (tenant, ctx) = harness.load().await; fn get_key(id: u32) -> Key { @@ -6730,62 +6828,67 @@ mod tests { key } - // We create one bottom-most image layer, a delta layer D1 crossing the GC horizon, D2 below the horizon, and D3 above the horizon. + // We create + // - one bottom-most image layer, + // - a delta layer D1 crossing the GC horizon with data below and above the horizon, + // - a delta layer D2 crossing the GC horizon with data only below the horizon, + // - a delta layer D3 above the horizon. 
// - // | D1 | | D3 | + // | D3 | + // | D1 | // -| |-- gc horizon ----------------- // | | | D2 | // --------- img layer ------------------ // // What we should expact from this compaction is: - // | Part of D1 | | D3 | + // | D3 | + // | Part of D1 | // --------- img layer with D1+D2 at GC horizon------------------ // img layer at 0x10 let img_layer = (0..10) - .map(|id| (get_key(id), test_img(&format!("value {id}@0x10")))) + .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) .collect_vec(); let delta1 = vec![ - // TODO: we should test a real delta record here, which requires us to add a variant of NeonWalRecord for testing purpose. ( get_key(1), Lsn(0x20), - Value::Image(test_img("value 1@0x20")), + Value::Image(Bytes::from("value 1@0x20")), ), ( get_key(2), Lsn(0x30), - Value::Image(test_img("value 2@0x30")), + Value::Image(Bytes::from("value 2@0x30")), ), ( get_key(3), Lsn(0x40), - Value::Image(test_img("value 3@0x40")), + Value::Image(Bytes::from("value 3@0x40")), ), ]; let delta2 = vec![ ( get_key(5), Lsn(0x20), - Value::Image(test_img("value 5@0x20")), + Value::Image(Bytes::from("value 5@0x20")), ), ( get_key(6), Lsn(0x20), - Value::Image(test_img("value 6@0x20")), + Value::Image(Bytes::from("value 6@0x20")), ), ]; let delta3 = vec![ ( get_key(8), - Lsn(0x40), - Value::Image(test_img("value 8@0x40")), + Lsn(0x48), + Value::Image(Bytes::from("value 8@0x48")), ), ( get_key(9), - Lsn(0x40), - Value::Image(test_img("value 9@0x40")), + Lsn(0x48), + Value::Image(Bytes::from("value 9@0x48")), ), ]; @@ -6795,7 +6898,11 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, - vec![delta1, delta2, delta3], // delta layers + vec![ + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3), + ], // delta layers vec![(Lsn(0x10), img_layer)], // image layers Lsn(0x50), ) @@ 
-6803,12 +6910,48 @@ mod tests { { // Update GC info let mut guard = tline.gc_info.write().unwrap(); - guard.cutoffs.pitr = Lsn(0x30); - guard.cutoffs.horizon = Lsn(0x30); + guard.cutoffs.time = Lsn(0x30); + guard.cutoffs.space = Lsn(0x30); + } + + let expected_result = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x20"), + Bytes::from_static(b"value 2@0x30"), + Bytes::from_static(b"value 3@0x40"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x20"), + Bytes::from_static(b"value 6@0x20"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x48"), + Bytes::from_static(b"value 9@0x48"), + ]; + + for (idx, expected) in expected_result.iter().enumerate() { + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x50), &ctx) + .await + .unwrap(), + expected + ); } let cancel = CancellationToken::new(); - tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + + for (idx, expected) in expected_result.iter().enumerate() { + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x50), &ctx) + .await + .unwrap(), + expected + ); + } // Check if the image layer at the GC horizon contains exactly what we want let image_at_gc_horizon = tline @@ -6820,14 +6963,22 @@ mod tests { .collect::>(); assert_eq!(image_at_gc_horizon.len(), 10); - let expected_lsn = [0x10, 0x20, 0x30, 0x10, 0x10, 0x20, 0x20, 0x10, 0x10, 0x10]; + let expected_result = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x20"), + Bytes::from_static(b"value 2@0x30"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x20"), + Bytes::from_static(b"value 6@0x20"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; for idx in 0..10 { assert_eq!( image_at_gc_horizon[idx], - ( - get_key(idx as u32), - 
test_img(&format!("value {idx}@{:#x}", expected_lsn[idx])) - ) + (get_key(idx as u32), expected_result[idx].clone()) ); } @@ -6854,31 +7005,47 @@ mod tests { vec![ // Image layer at GC horizon PersistentLayerKey { - key_range: Key::MIN..get_key(10), + key_range: { + let mut key = Key::MAX; + key.field6 -= 1; + Key::MIN..key + }, lsn_range: Lsn(0x30)..Lsn(0x31), is_delta: false }, // The delta layer that is cut in the middle PersistentLayerKey { - key_range: Key::MIN..get_key(9), + key_range: get_key(3)..get_key(4), lsn_range: Lsn(0x30)..Lsn(0x41), is_delta: true }, - // The delta layer we created and should not be picked for the compaction + // The delta3 layer that should not be picked for the compaction PersistentLayerKey { key_range: get_key(8)..get_key(10), - lsn_range: Lsn(0x40)..Lsn(0x41), + lsn_range: Lsn(0x48)..Lsn(0x50), is_delta: true } ] ); + // increase GC horizon and compact again + { + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + guard.cutoffs.time = Lsn(0x40); + guard.cutoffs.space = Lsn(0x40); + } + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + Ok(()) } #[tokio::test] async fn test_neon_test_record() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_neon_test_record")?; + let harness = TenantHarness::create("test_neon_test_record").await?; let (tenant, ctx) = harness.load().await; fn get_key(id: u32) -> Key { @@ -6931,7 +7098,10 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, - vec![delta1], // delta layers + vec![DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x40), + delta1, + )], // delta layers vec![(Lsn(0x10), image1)], // image layers Lsn(0x50), ) @@ -6945,6 +7115,9 @@ mod tests { tline.get(get_key(2), Lsn(0x50), &ctx).await?, Bytes::from_static(b"0x10,0x20,0x30") ); + + // Need to remove the limit of "Neon WAL redo requires base image". 
+ // assert_eq!(tline.get(get_key(3), Lsn(0x50), &ctx).await?, Bytes::new()); // assert_eq!(tline.get(get_key(4), Lsn(0x50), &ctx).await?, Bytes::new()); @@ -6953,7 +7126,7 @@ mod tests { #[tokio::test] async fn test_lsn_lease() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_lsn_lease")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_lsn_lease").await?.load().await; let key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let end_lsn = Lsn(0x100); @@ -7039,4 +7212,972 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn test_simple_bottom_most_compaction_deltas() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas").await?; + let (tenant, ctx) = harness.load().await; + + fn get_key(id: u32) -> Key { + // using aux key here b/c they are guaranteed to be inside `collect_keyspace`. + let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + // We create + // - one bottom-most image layer, + // - a delta layer D1 crossing the GC horizon with data below and above the horizon, + // - a delta layer D2 crossing the GC horizon with data only below the horizon, + // - a delta layer D3 above the horizon. 
+ // + // | D3 | + // | D1 | + // -| |-- gc horizon ----------------- + // | | | D2 | + // --------- img layer ------------------ + // + // What we should expact from this compaction is: + // | D3 | + // | Part of D1 | + // --------- img layer with D1+D2 at GC horizon------------------ + + // img layer at 0x10 + let img_layer = (0..10) + .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) + .collect_vec(); + + let delta1 = vec![ + ( + get_key(1), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ( + get_key(2), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append("@0x30")), + ), + ( + get_key(3), + Lsn(0x28), + Value::WalRecord(NeonWalRecord::wal_append("@0x28")), + ), + ( + get_key(3), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append("@0x30")), + ), + ( + get_key(3), + Lsn(0x40), + Value::WalRecord(NeonWalRecord::wal_append("@0x40")), + ), + ]; + let delta2 = vec![ + ( + get_key(5), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ( + get_key(6), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ]; + let delta3 = vec![ + ( + get_key(8), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ( + get_key(9), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ]; + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![ + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3), + ], // delta layers + vec![(Lsn(0x10), img_layer)], // image layers + Lsn(0x50), + ) + .await?; + { + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + *guard = GcInfo { + retain_lsns: vec![], + cutoffs: GcCutoffs { + time: Lsn(0x30), + space: Lsn(0x30), + }, + leases: 
Default::default(), + within_ancestor_pitr: false, + }; + } + + let expected_result = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20"), + Bytes::from_static(b"value 2@0x10@0x30"), + Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10@0x20"), + Bytes::from_static(b"value 6@0x10@0x20"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10@0x48"), + Bytes::from_static(b"value 9@0x10@0x48"), + ]; + + let expected_result_at_gc_horizon = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20"), + Bytes::from_static(b"value 2@0x10@0x30"), + Bytes::from_static(b"value 3@0x10@0x28@0x30"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10@0x20"), + Bytes::from_static(b"value 6@0x10@0x20"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + for idx in 0..10 { + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x50), &ctx) + .await + .unwrap(), + &expected_result[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x30), &ctx) + .await + .unwrap(), + &expected_result_at_gc_horizon[idx] + ); + } + + let cancel = CancellationToken::new(); + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + + for idx in 0..10 { + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x50), &ctx) + .await + .unwrap(), + &expected_result[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x30), &ctx) + .await + .unwrap(), + &expected_result_at_gc_horizon[idx] + ); + } + + // increase GC horizon and compact again + { + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + guard.cutoffs.time = Lsn(0x40); + guard.cutoffs.space = Lsn(0x40); + } + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + + Ok(()) + } + + #[tokio::test] + async fn 
test_generate_key_retention() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_generate_key_retention").await?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + tline.force_advance_lsn(Lsn(0x70)); + let key = Key::from_hex("010000000033333333444444445500000000").unwrap(); + let history = vec![ + ( + key, + Lsn(0x10), + Value::Image(Bytes::copy_from_slice(b"0x10")), + ), + ( + key, + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append(";0x20")), + ), + ( + key, + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append(";0x30")), + ), + ( + key, + Lsn(0x40), + Value::WalRecord(NeonWalRecord::wal_append(";0x40")), + ), + ( + key, + Lsn(0x50), + Value::WalRecord(NeonWalRecord::wal_append(";0x50")), + ), + ( + key, + Lsn(0x60), + Value::WalRecord(NeonWalRecord::wal_append(";0x60")), + ), + ( + key, + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + ), + ( + key, + Lsn(0x80), + Value::Image(Bytes::copy_from_slice( + b"0x10;0x20;0x30;0x40;0x50;0x60;0x70;0x80", + )), + ), + ( + key, + Lsn(0x90), + Value::WalRecord(NeonWalRecord::wal_append(";0x90")), + ), + ]; + let res = tline + .generate_key_retention( + key, + &history, + Lsn(0x60), + &[Lsn(0x20), Lsn(0x40), Lsn(0x50)], + 3, + None, + ) + .await + .unwrap(); + let expected_res = KeyHistoryRetention { + below_horizon: vec![ + ( + Lsn(0x20), + KeyLogAtLsn(vec![( + Lsn(0x20), + Value::Image(Bytes::copy_from_slice(b"0x10;0x20")), + )]), + ), + ( + Lsn(0x40), + KeyLogAtLsn(vec![ + ( + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append(";0x30")), + ), + ( + Lsn(0x40), + Value::WalRecord(NeonWalRecord::wal_append(";0x40")), + ), + ]), + ), + ( + Lsn(0x50), + KeyLogAtLsn(vec![( + Lsn(0x50), + Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40;0x50")), + )]), + ), + ( + Lsn(0x60), + KeyLogAtLsn(vec![( + Lsn(0x60), + Value::WalRecord(NeonWalRecord::wal_append(";0x60")), + 
)]), + ), + ], + above_horizon: KeyLogAtLsn(vec![ + ( + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + ), + ( + Lsn(0x80), + Value::Image(Bytes::copy_from_slice( + b"0x10;0x20;0x30;0x40;0x50;0x60;0x70;0x80", + )), + ), + ( + Lsn(0x90), + Value::WalRecord(NeonWalRecord::wal_append(";0x90")), + ), + ]), + }; + assert_eq!(res, expected_res); + + // We expect GC-compaction to run with the original GC. This would create a situation that + // the original GC algorithm removes some delta layers b/c there are full image coverage, + // therefore causing some keys to have an incomplete history below the lowest retain LSN. + // For example, we have + // ```plain + // init delta @ 0x10, image @ 0x20, delta @ 0x30 (gc_horizon), image @ 0x40. + // ``` + // Now the GC horizon moves up, and we have + // ```plain + // init delta @ 0x10, image @ 0x20, delta @ 0x30, image @ 0x40 (gc_horizon) + // ``` + // The original GC algorithm kicks in, and removes delta @ 0x10, image @ 0x20. + // We will end up with + // ```plain + // delta @ 0x30, image @ 0x40 (gc_horizon) + // ``` + // Now we run the GC-compaction, and this key does not have a full history. + // We should be able to handle this partial history and drop everything before the + // gc_horizon image. 
+ + let history = vec![ + ( + key, + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append(";0x20")), + ), + ( + key, + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append(";0x30")), + ), + ( + key, + Lsn(0x40), + Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40")), + ), + ( + key, + Lsn(0x50), + Value::WalRecord(NeonWalRecord::wal_append(";0x50")), + ), + ( + key, + Lsn(0x60), + Value::WalRecord(NeonWalRecord::wal_append(";0x60")), + ), + ( + key, + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + ), + ( + key, + Lsn(0x80), + Value::Image(Bytes::copy_from_slice( + b"0x10;0x20;0x30;0x40;0x50;0x60;0x70;0x80", + )), + ), + ( + key, + Lsn(0x90), + Value::WalRecord(NeonWalRecord::wal_append(";0x90")), + ), + ]; + let res = tline + .generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3, None) + .await + .unwrap(); + let expected_res = KeyHistoryRetention { + below_horizon: vec![ + ( + Lsn(0x40), + KeyLogAtLsn(vec![( + Lsn(0x40), + Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40")), + )]), + ), + ( + Lsn(0x50), + KeyLogAtLsn(vec![( + Lsn(0x50), + Value::WalRecord(NeonWalRecord::wal_append(";0x50")), + )]), + ), + ( + Lsn(0x60), + KeyLogAtLsn(vec![( + Lsn(0x60), + Value::WalRecord(NeonWalRecord::wal_append(";0x60")), + )]), + ), + ], + above_horizon: KeyLogAtLsn(vec![ + ( + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + ), + ( + Lsn(0x80), + Value::Image(Bytes::copy_from_slice( + b"0x10;0x20;0x30;0x40;0x50;0x60;0x70;0x80", + )), + ), + ( + Lsn(0x90), + Value::WalRecord(NeonWalRecord::wal_append(";0x90")), + ), + ]), + }; + assert_eq!(res, expected_res); + + // In case of branch compaction, the branch itself does not have the full history, and we need to provide + // the ancestor image in the test case. 
+ + let history = vec![ + ( + key, + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append(";0x20")), + ), + ( + key, + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append(";0x30")), + ), + ( + key, + Lsn(0x40), + Value::WalRecord(NeonWalRecord::wal_append(";0x40")), + ), + ( + key, + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + ), + ]; + let res = tline + .generate_key_retention( + key, + &history, + Lsn(0x60), + &[], + 3, + Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))), + ) + .await + .unwrap(); + let expected_res = KeyHistoryRetention { + below_horizon: vec![( + Lsn(0x60), + KeyLogAtLsn(vec![( + Lsn(0x60), + Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40")), // use the ancestor image to reconstruct the page + )]), + )], + above_horizon: KeyLogAtLsn(vec![( + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + )]), + }; + assert_eq!(res, expected_res); + + let history = vec![ + ( + key, + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append(";0x20")), + ), + ( + key, + Lsn(0x40), + Value::WalRecord(NeonWalRecord::wal_append(";0x40")), + ), + ( + key, + Lsn(0x60), + Value::WalRecord(NeonWalRecord::wal_append(";0x60")), + ), + ( + key, + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + ), + ]; + let res = tline + .generate_key_retention( + key, + &history, + Lsn(0x60), + &[Lsn(0x30)], + 3, + Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))), + ) + .await + .unwrap(); + let expected_res = KeyHistoryRetention { + below_horizon: vec![ + ( + Lsn(0x30), + KeyLogAtLsn(vec![( + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append(";0x20")), + )]), + ), + ( + Lsn(0x60), + KeyLogAtLsn(vec![( + Lsn(0x60), + Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x40;0x60")), + )]), + ), + ], + above_horizon: KeyLogAtLsn(vec![( + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + )]), + }; + assert_eq!(res, expected_res); + + Ok(()) + } + + #[tokio::test] + async fn 
test_simple_bottom_most_compaction_with_retain_lsns() -> anyhow::Result<()> { + let harness = + TenantHarness::create("test_simple_bottom_most_compaction_with_retain_lsns").await?; + let (tenant, ctx) = harness.load().await; + + fn get_key(id: u32) -> Key { + // using aux key here b/c they are guaranteed to be inside `collect_keyspace`. + let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + let img_layer = (0..10) + .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) + .collect_vec(); + + let delta1 = vec![ + ( + get_key(1), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ( + get_key(2), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append("@0x30")), + ), + ( + get_key(3), + Lsn(0x28), + Value::WalRecord(NeonWalRecord::wal_append("@0x28")), + ), + ( + get_key(3), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append("@0x30")), + ), + ( + get_key(3), + Lsn(0x40), + Value::WalRecord(NeonWalRecord::wal_append("@0x40")), + ), + ]; + let delta2 = vec![ + ( + get_key(5), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ( + get_key(6), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ]; + let delta3 = vec![ + ( + get_key(8), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ( + get_key(9), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ]; + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![ + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3), + ], // delta layers + vec![(Lsn(0x10), img_layer)], // image layers + Lsn(0x50), + ) + .await?; + { + // Update GC info + let mut guard = 
tline.gc_info.write().unwrap(); + *guard = GcInfo { + retain_lsns: vec![ + (Lsn(0x10), tline.timeline_id), + (Lsn(0x20), tline.timeline_id), + ], + cutoffs: GcCutoffs { + time: Lsn(0x30), + space: Lsn(0x30), + }, + leases: Default::default(), + within_ancestor_pitr: false, + }; + } + + let expected_result = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20"), + Bytes::from_static(b"value 2@0x10@0x30"), + Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10@0x20"), + Bytes::from_static(b"value 6@0x10@0x20"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10@0x48"), + Bytes::from_static(b"value 9@0x10@0x48"), + ]; + + let expected_result_at_gc_horizon = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20"), + Bytes::from_static(b"value 2@0x10@0x30"), + Bytes::from_static(b"value 3@0x10@0x28@0x30"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10@0x20"), + Bytes::from_static(b"value 6@0x10@0x20"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let expected_result_at_lsn_20 = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10@0x20"), + Bytes::from_static(b"value 6@0x10@0x20"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let expected_result_at_lsn_10 = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10"), + Bytes::from_static(b"value 6@0x10"), + 
Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let verify_result = || async { + let gc_horizon = { + let gc_info = tline.gc_info.read().unwrap(); + gc_info.cutoffs.time + }; + for idx in 0..10 { + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x50), &ctx) + .await + .unwrap(), + &expected_result[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), gc_horizon, &ctx) + .await + .unwrap(), + &expected_result_at_gc_horizon[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x20), &ctx) + .await + .unwrap(), + &expected_result_at_lsn_20[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x10), &ctx) + .await + .unwrap(), + &expected_result_at_lsn_10[idx] + ); + } + }; + + verify_result().await; + + let cancel = CancellationToken::new(); + let mut dryrun_flags = EnumSet::new(); + dryrun_flags.insert(CompactFlags::DryRun); + + tline + .compact_with_gc(&cancel, dryrun_flags, &ctx) + .await + .unwrap(); + // We expect layer map to be the same b/c the dry run flag, but we don't know whether there will be other background jobs + // cleaning things up, and therefore, we don't do sanity checks on the layer map during unit tests. 
+ verify_result().await; + + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + verify_result().await; + + // compact again + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + verify_result().await; + + // increase GC horizon and compact again + { + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + guard.cutoffs.time = Lsn(0x38); + guard.cutoffs.space = Lsn(0x38); + } + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + verify_result().await; // no wals between 0x30 and 0x38, so we should obtain the same result + + // not increasing the GC horizon and compact again + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + verify_result().await; + + Ok(()) + } + + #[tokio::test] + async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?; + let (tenant, ctx) = harness.load().await; + + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + let img_layer = (0..10) + .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) + .collect_vec(); + + let delta1 = vec![ + ( + get_key(1), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ( + get_key(2), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append("@0x30")), + ), + ( + get_key(3), + Lsn(0x28), + Value::WalRecord(NeonWalRecord::wal_append("@0x28")), + ), + ( + get_key(3), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append("@0x30")), + ), + ( + get_key(3), + Lsn(0x40), + Value::WalRecord(NeonWalRecord::wal_append("@0x40")), + ), + ]; + let delta2 = vec![ + ( + get_key(5), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ( + get_key(6), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ]; + let 
delta3 = vec![ + ( + get_key(8), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ( + get_key(9), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ]; + + let parent_tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![], // delta layers + vec![(Lsn(0x18), img_layer)], // image layers + Lsn(0x18), + ) + .await?; + + parent_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10))); + + let branch_tline = tenant + .branch_timeline_test_with_layers( + &parent_tline, + NEW_TIMELINE_ID, + Some(Lsn(0x18)), + &ctx, + vec![ + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3), + ], // delta layers + vec![], // image layers + Lsn(0x50), + ) + .await?; + + branch_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10))); + + { + // Update GC info + let mut guard = parent_tline.gc_info.write().unwrap(); + *guard = GcInfo { + retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id)], + cutoffs: GcCutoffs { + time: Lsn(0x10), + space: Lsn(0x10), + }, + leases: Default::default(), + within_ancestor_pitr: false, + }; + } + + { + // Update GC info + let mut guard = branch_tline.gc_info.write().unwrap(); + *guard = GcInfo { + retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id)], + cutoffs: GcCutoffs { + time: Lsn(0x50), + space: Lsn(0x50), + }, + leases: Default::default(), + within_ancestor_pitr: false, + }; + } + + let expected_result_at_gc_horizon = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20"), + Bytes::from_static(b"value 2@0x10@0x30"), + Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10@0x20"), + 
Bytes::from_static(b"value 6@0x10@0x20"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10@0x48"), + Bytes::from_static(b"value 9@0x10@0x48"), + ]; + + let expected_result_at_lsn_40 = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20"), + Bytes::from_static(b"value 2@0x10@0x30"), + Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10@0x20"), + Bytes::from_static(b"value 6@0x10@0x20"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let verify_result = || async { + for idx in 0..10 { + assert_eq!( + branch_tline + .get(get_key(idx as u32), Lsn(0x50), &ctx) + .await + .unwrap(), + &expected_result_at_gc_horizon[idx] + ); + assert_eq!( + branch_tline + .get(get_key(idx as u32), Lsn(0x40), &ctx) + .await + .unwrap(), + &expected_result_at_lsn_40[idx] + ); + } + }; + + verify_result().await; + + let cancel = CancellationToken::new(); + branch_tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + + verify_result().await; + + Ok(()) + } } diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 2be8816cef..a245c99a88 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -6,21 +6,35 @@ //! is written as a one byte. If it's larger than that, the length //! is written as a four-byte integer, in big-endian, with the high //! bit set. This way, we can detect whether it's 1- or 4-byte header -//! by peeking at the first byte. +//! by peeking at the first byte. For blobs larger than 128 bits, +//! we also specify three reserved bits, only one of the three bit +//! patterns is currently in use (0b011) and signifies compression +//! with zstd. //! //! len < 128: 0XXXXXXX -//! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX +//! len >= 128: 1CCCXXXX XXXXXXXX XXXXXXXX XXXXXXXX //! 
+use async_compression::Level; use bytes::{BufMut, BytesMut}; +use pageserver_api::models::ImageCompressionAlgorithm; +use tokio::io::AsyncWriteExt; use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; +use tracing::warn; use crate::context::RequestContext; use crate::page_cache::PAGE_SZ; use crate::tenant::block_io::BlockCursor; +use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; use crate::virtual_file::VirtualFile; use std::cmp::min; use std::io::{Error, ErrorKind}; +#[derive(Copy, Clone, Debug)] +pub struct CompressionInfo { + pub written_compressed: bool, + pub compressed_size: Option, +} + impl<'a> BlockCursor<'a> { /// Read a blob into a new buffer. pub async fn read_blob( @@ -66,12 +80,37 @@ impl<'a> BlockCursor<'a> { len_buf.copy_from_slice(&buf[off..off + 4]); off += 4; } - len_buf[0] &= 0x7f; + let bit_mask = if self.read_compressed { + !LEN_COMPRESSION_BIT_MASK + } else { + 0x7f + }; + len_buf[0] &= bit_mask; u32::from_be_bytes(len_buf) as usize }; + let compression_bits = first_len_byte & LEN_COMPRESSION_BIT_MASK; - dstbuf.clear(); - dstbuf.reserve(len); + let mut tmp_buf = Vec::new(); + let buf_to_write; + let compression = if compression_bits <= BYTE_UNCOMPRESSED || !self.read_compressed { + if compression_bits > BYTE_UNCOMPRESSED { + warn!("reading key above future limit ({len} bytes)"); + } + buf_to_write = dstbuf; + None + } else if compression_bits == BYTE_ZSTD { + buf_to_write = &mut tmp_buf; + Some(dstbuf) + } else { + let error = std::io::Error::new( + std::io::ErrorKind::InvalidData, + format!("invalid compression byte {compression_bits:x}"), + ); + return Err(error); + }; + + buf_to_write.clear(); + buf_to_write.reserve(len); // Read the payload let mut remain = len; @@ -85,14 +124,35 @@ impl<'a> BlockCursor<'a> { page_remain = PAGE_SZ; } let this_blk_len = min(remain, page_remain); - dstbuf.extend_from_slice(&buf[off..off + this_blk_len]); + buf_to_write.extend_from_slice(&buf[off..off + this_blk_len]); remain -= 
this_blk_len; off += this_blk_len; } + + if let Some(dstbuf) = compression { + if compression_bits == BYTE_ZSTD { + let mut decoder = async_compression::tokio::write::ZstdDecoder::new(dstbuf); + decoder.write_all(buf_to_write).await?; + decoder.flush().await?; + } else { + unreachable!("already checked above") + } + } + Ok(()) } } +/// Reserved bits for length and compression +pub(super) const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0; + +/// The maximum size of blobs we support. The highest few bits +/// are reserved for compression and other further uses. +const MAX_SUPPORTED_LEN: usize = 0x0fff_ffff; + +pub(super) const BYTE_UNCOMPRESSED: u8 = 0x80; +pub(super) const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10; + /// A wrapper of `VirtualFile` that allows users to write blobs. /// /// If a `BlobWriter` is dropped, the internal buffer will be @@ -127,11 +187,11 @@ impl BlobWriter { /// You need to make sure that the internal buffer is empty, otherwise /// data will be written in wrong order. #[inline(always)] - async fn write_all_unbuffered, Buf: IoBuf + Send>( + async fn write_all_unbuffered( &mut self, - src_buf: B, + src_buf: FullSlice, ctx: &RequestContext, - ) -> (B::Buf, Result<(), Error>) { + ) -> (FullSlice, Result<(), Error>) { let (src_buf, res) = self.inner.write_all(src_buf, ctx).await; let nbytes = match res { Ok(nbytes) => nbytes, @@ -145,8 +205,9 @@ impl BlobWriter { /// Flushes the internal buffer to the underlying `VirtualFile`. 
pub async fn flush_buffer(&mut self, ctx: &RequestContext) -> Result<(), Error> { let buf = std::mem::take(&mut self.buf); - let (mut buf, res) = self.inner.write_all(buf, ctx).await; + let (slice, res) = self.inner.write_all(buf.slice_len(), ctx).await; res?; + let mut buf = slice.into_raw_slice().into_inner(); buf.clear(); self.buf = buf; Ok(()) @@ -163,19 +224,30 @@ impl BlobWriter { } /// Internal, possibly buffered, write function - async fn write_all, Buf: IoBuf + Send>( + async fn write_all( &mut self, - src_buf: B, + src_buf: FullSlice, ctx: &RequestContext, - ) -> (B::Buf, Result<(), Error>) { + ) -> (FullSlice, Result<(), Error>) { + let src_buf = src_buf.into_raw_slice(); + let src_buf_bounds = src_buf.bounds(); + let restore = move |src_buf_slice: Slice<_>| { + FullSlice::must_new(Slice::from_buf_bounds( + src_buf_slice.into_inner(), + src_buf_bounds, + )) + }; + if !BUFFERED { assert!(self.buf.is_empty()); - return self.write_all_unbuffered(src_buf, ctx).await; + return self + .write_all_unbuffered(FullSlice::must_new(src_buf), ctx) + .await; } let remaining = Self::CAPACITY - self.buf.len(); let src_buf_len = src_buf.bytes_init(); if src_buf_len == 0 { - return (Slice::into_inner(src_buf.slice_full()), Ok(())); + return (restore(src_buf), Ok(())); } let mut src_buf = src_buf.slice(0..src_buf_len); // First try to copy as much as we can into the buffer @@ -186,7 +258,7 @@ impl BlobWriter { // Then, if the buffer is full, flush it out if self.buf.len() == Self::CAPACITY { if let Err(e) = self.flush_buffer(ctx).await { - return (Slice::into_inner(src_buf), Err(e)); + return (restore(src_buf), Err(e)); } } // Finally, write the tail of src_buf: @@ -199,66 +271,118 @@ impl BlobWriter { let copied = self.write_into_buffer(&src_buf); // We just verified above that src_buf fits into our internal buffer. 
assert_eq!(copied, src_buf.len()); - Slice::into_inner(src_buf) + restore(src_buf) } else { - let (src_buf, res) = self.write_all_unbuffered(src_buf, ctx).await; + let (src_buf, res) = self + .write_all_unbuffered(FullSlice::must_new(src_buf), ctx) + .await; if let Err(e) = res { return (src_buf, Err(e)); } src_buf } } else { - Slice::into_inner(src_buf) + restore(src_buf) }; (src_buf, Ok(())) } /// Write a blob of data. Returns the offset that it was written to, /// which can be used to retrieve the data later. - pub async fn write_blob, Buf: IoBuf + Send>( + pub async fn write_blob( &mut self, - srcbuf: B, + srcbuf: FullSlice, ctx: &RequestContext, - ) -> (B::Buf, Result) { - let offset = self.offset; + ) -> (FullSlice, Result) { + let (buf, res) = self + .write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled) + .await; + (buf, res.map(|(off, _compression_info)| off)) + } - let len = srcbuf.bytes_init(); + /// Write a blob of data. Returns the offset that it was written to, + /// which can be used to retrieve the data later. + pub(crate) async fn write_blob_maybe_compressed( + &mut self, + srcbuf: FullSlice, + ctx: &RequestContext, + algorithm: ImageCompressionAlgorithm, + ) -> (FullSlice, Result<(u64, CompressionInfo), Error>) { + let offset = self.offset; + let mut compression_info = CompressionInfo { + written_compressed: false, + compressed_size: None, + }; + + let len = srcbuf.len(); let mut io_buf = self.io_buf.take().expect("we always put it back below"); io_buf.clear(); - let (io_buf, hdr_res) = async { + let mut compressed_buf = None; + let ((io_buf_slice, hdr_res), srcbuf) = async { if len < 128 { // Short blob. 
Write a 1-byte length header io_buf.put_u8(len as u8); - self.write_all(io_buf, ctx).await + (self.write_all(io_buf.slice_len(), ctx).await, srcbuf) } else { // Write a 4-byte length header - if len > 0x7fff_ffff { + if len > MAX_SUPPORTED_LEN { return ( - io_buf, - Err(Error::new( - ErrorKind::Other, - format!("blob too large ({len} bytes)"), - )), + ( + io_buf.slice_len(), + Err(Error::new( + ErrorKind::Other, + format!("blob too large ({len} bytes)"), + )), + ), + srcbuf, ); } - if len > 0x0fff_ffff { - tracing::warn!("writing blob above future limit ({len} bytes)"); - } - let mut len_buf = (len as u32).to_be_bytes(); - len_buf[0] |= 0x80; + let (high_bit_mask, len_written, srcbuf) = match algorithm { + ImageCompressionAlgorithm::Zstd { level } => { + let mut encoder = if let Some(level) = level { + async_compression::tokio::write::ZstdEncoder::with_quality( + Vec::new(), + Level::Precise(level.into()), + ) + } else { + async_compression::tokio::write::ZstdEncoder::new(Vec::new()) + }; + encoder.write_all(&srcbuf[..]).await.unwrap(); + encoder.shutdown().await.unwrap(); + let compressed = encoder.into_inner(); + compression_info.compressed_size = Some(compressed.len()); + if compressed.len() < len { + compression_info.written_compressed = true; + let compressed_len = compressed.len(); + compressed_buf = Some(compressed); + (BYTE_ZSTD, compressed_len, srcbuf) + } else { + (BYTE_UNCOMPRESSED, len, srcbuf) + } + } + ImageCompressionAlgorithm::Disabled => (BYTE_UNCOMPRESSED, len, srcbuf), + }; + let mut len_buf = (len_written as u32).to_be_bytes(); + assert_eq!(len_buf[0] & 0xf0, 0); + len_buf[0] |= high_bit_mask; io_buf.extend_from_slice(&len_buf[..]); - self.write_all(io_buf, ctx).await + (self.write_all(io_buf.slice_len(), ctx).await, srcbuf) } } .await; - self.io_buf = Some(io_buf); + self.io_buf = Some(io_buf_slice.into_raw_slice().into_inner()); match hdr_res { Ok(_) => (), - Err(e) => return (Slice::into_inner(srcbuf.slice(..)), Err(e)), + Err(e) => return 
(srcbuf, Err(e)), } - let (srcbuf, res) = self.write_all(srcbuf, ctx).await; - (srcbuf, res.map(|_| offset)) + let (srcbuf, res) = if let Some(compressed_buf) = compressed_buf { + let (_buf, res) = self.write_all(compressed_buf.slice_len(), ctx).await; + (srcbuf, res) + } else { + self.write_all(srcbuf, ctx).await + }; + (srcbuf, res.map(|_| (offset, compression_info))) } } @@ -289,37 +413,67 @@ impl BlobWriter { } #[cfg(test)] -mod tests { +pub(crate) mod tests { use super::*; use crate::{context::DownloadBehavior, task_mgr::TaskKind, tenant::block_io::BlockReaderRef}; + use camino::Utf8PathBuf; + use camino_tempfile::Utf8TempDir; use rand::{Rng, SeedableRng}; async fn round_trip_test(blobs: &[Vec]) -> Result<(), Error> { + round_trip_test_compressed::(blobs, false).await + } + + pub(crate) async fn write_maybe_compressed( + blobs: &[Vec], + compression: bool, + ctx: &RequestContext, + ) -> Result<(Utf8TempDir, Utf8PathBuf, Vec), Error> { let temp_dir = camino_tempfile::tempdir()?; let pathbuf = temp_dir.path().join("file"); - let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); // Write part (in block to drop the file) let mut offsets = Vec::new(); { - let file = VirtualFile::create(pathbuf.as_path(), &ctx).await?; + let file = VirtualFile::create(pathbuf.as_path(), ctx).await?; let mut wtr = BlobWriter::::new(file, 0); for blob in blobs.iter() { - let (_, res) = wtr.write_blob(blob.clone(), &ctx).await; + let (_, res) = if compression { + let res = wtr + .write_blob_maybe_compressed( + blob.clone().slice_len(), + ctx, + ImageCompressionAlgorithm::Zstd { level: Some(1) }, + ) + .await; + (res.0, res.1.map(|(off, _)| off)) + } else { + wtr.write_blob(blob.clone().slice_len(), ctx).await + }; let offs = res?; offsets.push(offs); } // Write out one page worth of zeros so that we can // read again with read_blk - let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], &ctx).await; + let (_, res) = wtr.write_blob(vec![0; PAGE_SZ].slice_len(), 
ctx).await; let offs = res?; println!("Writing final blob at offs={offs}"); - wtr.flush_buffer(&ctx).await?; + wtr.flush_buffer(ctx).await?; } + Ok((temp_dir, pathbuf, offsets)) + } - let file = VirtualFile::open(pathbuf.as_path(), &ctx).await?; + async fn round_trip_test_compressed( + blobs: &[Vec], + compression: bool, + ) -> Result<(), Error> { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let (_temp_dir, pathbuf, offsets) = + write_maybe_compressed::(blobs, compression, &ctx).await?; + + let file = VirtualFile::open(pathbuf, &ctx).await?; let rdr = BlockReaderRef::VirtualFile(&file); - let rdr = BlockCursor::new(rdr); + let rdr = BlockCursor::new_with_compression(rdr, compression); for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() { let blob_read = rdr.read_blob(*offset, &ctx).await?; assert_eq!( @@ -330,7 +484,7 @@ mod tests { Ok(()) } - fn random_array(len: usize) -> Vec { + pub(crate) fn random_array(len: usize) -> Vec { let mut rng = rand::thread_rng(); (0..len).map(|_| rng.gen()).collect::<_>() } @@ -353,6 +507,8 @@ mod tests { ]; round_trip_test::(blobs).await?; round_trip_test::(blobs).await?; + round_trip_test_compressed::(blobs, true).await?; + round_trip_test_compressed::(blobs, true).await?; Ok(()) } @@ -361,10 +517,15 @@ mod tests { let blobs = &[ b"test".to_vec(), random_array(10 * PAGE_SZ), + b"hello".to_vec(), + random_array(66 * PAGE_SZ), + vec![0xf3; 24 * PAGE_SZ], b"foobar".to_vec(), ]; round_trip_test::(blobs).await?; round_trip_test::(blobs).await?; + round_trip_test_compressed::(blobs, true).await?; + round_trip_test_compressed::(blobs, true).await?; Ok(()) } diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 92928116c1..601b095155 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -37,6 +37,7 @@ where pub enum BlockLease<'a> { PageReadGuard(PageReadGuard<'static>), EphemeralFileMutableTail(&'a [u8; 
PAGE_SZ]), + Slice(&'a [u8; PAGE_SZ]), #[cfg(test)] Arc(std::sync::Arc<[u8; PAGE_SZ]>), #[cfg(test)] @@ -63,6 +64,7 @@ impl<'a> Deref for BlockLease<'a> { match self { BlockLease::PageReadGuard(v) => v.deref(), BlockLease::EphemeralFileMutableTail(v) => v, + BlockLease::Slice(v) => v, #[cfg(test)] BlockLease::Arc(v) => v.deref(), #[cfg(test)] @@ -81,6 +83,7 @@ pub(crate) enum BlockReaderRef<'a> { FileBlockReader(&'a FileBlockReader<'a>), EphemeralFile(&'a EphemeralFile), Adapter(Adapter<&'a DeltaLayerInner>), + Slice(&'a [u8]), #[cfg(test)] TestDisk(&'a super::disk_btree::tests::TestDisk), #[cfg(test)] @@ -99,6 +102,7 @@ impl<'a> BlockReaderRef<'a> { FileBlockReader(r) => r.read_blk(blknum, ctx).await, EphemeralFile(r) => r.read_blk(blknum, ctx).await, Adapter(r) => r.read_blk(blknum, ctx).await, + Slice(s) => Self::read_blk_slice(s, blknum), #[cfg(test)] TestDisk(r) => r.read_blk(blknum), #[cfg(test)] @@ -107,6 +111,24 @@ impl<'a> BlockReaderRef<'a> { } } +impl<'a> BlockReaderRef<'a> { + fn read_blk_slice(slice: &[u8], blknum: u32) -> std::io::Result { + let start = (blknum as usize).checked_mul(PAGE_SZ).unwrap(); + let end = start.checked_add(PAGE_SZ).unwrap(); + if end > slice.len() { + return Err(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + format!("slice too short, len={} end={}", slice.len(), end), + )); + } + let slice = &slice[start..end]; + let page_sized: &[u8; PAGE_SZ] = slice + .try_into() + .expect("we add PAGE_SZ to start, so the slice must have PAGE_SZ"); + Ok(BlockLease::Slice(page_sized)) + } +} + /// /// A "cursor" for efficiently reading multiple pages from a BlockReader /// @@ -127,16 +149,24 @@ impl<'a> BlockReaderRef<'a> { /// ``` /// pub struct BlockCursor<'a> { + pub(super) read_compressed: bool, reader: BlockReaderRef<'a>, } impl<'a> BlockCursor<'a> { pub(crate) fn new(reader: BlockReaderRef<'a>) -> Self { - BlockCursor { reader } + Self::new_with_compression(reader, false) + } + pub(crate) fn new_with_compression(reader: 
BlockReaderRef<'a>, read_compressed: bool) -> Self { + BlockCursor { + read_compressed, + reader, + } } // Needed by cli pub fn new_fileblockreader(reader: &'a FileBlockReader) -> Self { BlockCursor { + read_compressed: false, reader: BlockReaderRef::FileBlockReader(reader), } } @@ -160,16 +190,23 @@ impl<'a> BlockCursor<'a> { /// /// The file is assumed to be immutable. This doesn't provide any functions /// for modifying the file, nor for invalidating the cache if it is modified. +#[derive(Clone)] pub struct FileBlockReader<'a> { pub file: &'a VirtualFile, /// Unique ID of this file, used as key in the page cache. file_id: page_cache::FileId, + + compressed_reads: bool, } impl<'a> FileBlockReader<'a> { pub fn new(file: &'a VirtualFile, file_id: FileId) -> Self { - FileBlockReader { file_id, file } + FileBlockReader { + file_id, + file, + compressed_reads: true, + } } /// Read a page from the underlying file into given buffer. @@ -216,7 +253,10 @@ impl<'a> FileBlockReader<'a> { impl BlockReader for FileBlockReader<'_> { fn block_cursor(&self) -> BlockCursor<'_> { - BlockCursor::new(BlockReaderRef::FileBlockReader(self)) + BlockCursor::new_with_compression( + BlockReaderRef::FileBlockReader(self), + self.compressed_reads, + ) } } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 1b9be12642..48ff17db94 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -281,22 +281,6 @@ impl LocationConf { } } -impl Default for LocationConf { - // TODO: this should be removed once tenant loading can guarantee that we are never - // loading from a directory without a configuration. 
- // => tech debt since https://github.com/neondatabase/neon/issues/1555 - fn default() -> Self { - Self { - mode: LocationMode::Attached(AttachedLocationConfig { - generation: Generation::none(), - attach_mode: AttachmentMode::Single, - }), - tenant_conf: TenantConfOpt::default(), - shard: ShardIdentity::unsharded(), - } - } -} - /// A tenant's calcuated configuration, which is the result of merging a /// tenant's TenantConfOpt with the global TenantConf from PageServerConf. /// @@ -351,7 +335,6 @@ pub struct TenantConf { /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update, /// to avoid eager reconnects. pub max_lsn_wal_lag: NonZeroU64, - pub trace_read_requests: bool, pub eviction_policy: EvictionPolicy, pub min_resident_size_override: Option, // See the corresponding metric's help string. @@ -452,10 +435,6 @@ pub struct TenantConfOpt { #[serde(default)] pub max_lsn_wal_lag: Option, - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(default)] - pub trace_read_requests: Option, - #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub eviction_policy: Option, @@ -535,9 +514,6 @@ impl TenantConfOpt { .lagging_wal_timeout .unwrap_or(global_conf.lagging_wal_timeout), max_lsn_wal_lag: self.max_lsn_wal_lag.unwrap_or(global_conf.max_lsn_wal_lag), - trace_read_requests: self - .trace_read_requests - .unwrap_or(global_conf.trace_read_requests), eviction_policy: self.eviction_policy.unwrap_or(global_conf.eviction_policy), min_resident_size_override: self .min_resident_size_override @@ -597,7 +573,6 @@ impl Default for TenantConf { .expect("cannot parse default walreceiver lagging wal timeout"), max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG) .expect("cannot parse default max walreceiver Lsn wal lag"), - trace_read_requests: false, eviction_policy: EvictionPolicy::NoEviction, min_resident_size_override: None, evictions_low_residence_duration_metric_threshold: 
humantime::parse_duration( @@ -675,7 +650,6 @@ impl From for models::TenantConfig { walreceiver_connect_timeout: value.walreceiver_connect_timeout.map(humantime), lagging_wal_timeout: value.lagging_wal_timeout.map(humantime), max_lsn_wal_lag: value.max_lsn_wal_lag, - trace_read_requests: value.trace_read_requests, eviction_policy: value.eviction_policy, min_resident_size_override: value.min_resident_size_override, evictions_low_residence_duration_metric_threshold: value diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs deleted file mode 100644 index 8b36aa15e5..0000000000 --- a/pageserver/src/tenant/delete.rs +++ /dev/null @@ -1,662 +0,0 @@ -use std::sync::Arc; - -use anyhow::Context; -use camino::{Utf8Path, Utf8PathBuf}; -use pageserver_api::{models::TenantState, shard::TenantShardId}; -use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; -use tokio::sync::OwnedMutexGuard; -use tokio_util::sync::CancellationToken; -use tracing::{error, instrument, Instrument}; - -use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId, pausable_failpoint}; - -use crate::{ - config::PageServerConf, - context::RequestContext, - task_mgr::{self, TaskKind}, - tenant::{ - mgr::{TenantSlot, TenantsMapRemoveResult}, - remote_timeline_client::remote_heatmap_path, - timeline::ShutdownMode, - }, -}; - -use super::{ - mgr::{GetTenantError, TenantSlotError, TenantSlotUpsertError, TenantsMap}, - remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD}, - span, - timeline::delete::DeleteTimelineFlow, - tree_sort_timelines, DeleteTimelineError, Tenant, TenantPreload, -}; - -#[derive(Debug, thiserror::Error)] -pub(crate) enum DeleteTenantError { - #[error("GetTenant {0}")] - Get(#[from] GetTenantError), - - #[error("Tenant not attached")] - NotAttached, - - #[error("Invalid state {0}. 
Expected Active or Broken")] - InvalidState(TenantState), - - #[error("Tenant deletion is already in progress")] - AlreadyInProgress, - - #[error("Tenant map slot error {0}")] - SlotError(#[from] TenantSlotError), - - #[error("Tenant map slot upsert error {0}")] - SlotUpsertError(#[from] TenantSlotUpsertError), - - #[error("Timeline {0}")] - Timeline(#[from] DeleteTimelineError), - - #[error("Cancelled")] - Cancelled, - - #[error(transparent)] - Other(#[from] anyhow::Error), -} - -type DeletionGuard = tokio::sync::OwnedMutexGuard; - -fn remote_tenant_delete_mark_path( - conf: &PageServerConf, - tenant_shard_id: &TenantShardId, -) -> anyhow::Result { - let tenant_remote_path = conf - .tenant_path(tenant_shard_id) - .strip_prefix(&conf.workdir) - .context("Failed to strip workdir prefix") - .and_then(RemotePath::new) - .context("tenant path")?; - Ok(tenant_remote_path.join(Utf8Path::new("timelines/deleted"))) -} - -async fn create_remote_delete_mark( - conf: &PageServerConf, - remote_storage: &GenericRemoteStorage, - tenant_shard_id: &TenantShardId, - cancel: &CancellationToken, -) -> Result<(), DeleteTenantError> { - let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?; - - let data: &[u8] = &[]; - backoff::retry( - || async { - let data = bytes::Bytes::from_static(data); - let stream = futures::stream::once(futures::future::ready(Ok(data))); - remote_storage - .upload(stream, 0, &remote_mark_path, None, cancel) - .await - }, - TimeoutOrCancel::caused_by_cancel, - FAILED_UPLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, - "mark_upload", - cancel, - ) - .await - .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) - .and_then(|x| x) - .context("mark_upload")?; - - Ok(()) -} - -async fn create_local_delete_mark( - conf: &PageServerConf, - tenant_shard_id: &TenantShardId, -) -> Result<(), DeleteTenantError> { - let marker_path = conf.tenant_deleted_mark_file_path(tenant_shard_id); - - // Note: we're ok to replace existing file. 
- let _ = std::fs::OpenOptions::new() - .write(true) - .create(true) - .truncate(true) - .open(&marker_path) - .with_context(|| format!("could not create delete marker file {marker_path:?}"))?; - - crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?; - - Ok(()) -} - -async fn schedule_ordered_timeline_deletions( - tenant: &Arc, -) -> Result>, TimelineId)>, DeleteTenantError> { - // Tenant is stopping at this point. We know it will be deleted. - // No new timelines should be created. - // Tree sort timelines to delete from leafs to the root. - // NOTE: by calling clone we release the mutex which creates a possibility for a race: pending deletion - // can complete and remove timeline from the map in between our call to clone - // and `DeleteTimelineFlow::run`, so `run` wont find timeline in `timelines` map. - // timelines.lock is currently synchronous so we cant hold it across await point. - // So just ignore NotFound error if we get it from `run`. - // Beware: in case it becomes async and we try to hold it here, `run` also locks it, which can create a deadlock. - let timelines = tenant.timelines.lock().unwrap().clone(); - let sorted = - tree_sort_timelines(timelines, |t| t.get_ancestor_timeline_id()).context("tree sort")?; - - let mut already_running_deletions = vec![]; - - for (timeline_id, _) in sorted.into_iter().rev() { - let span = tracing::info_span!("timeline_delete", %timeline_id); - let res = DeleteTimelineFlow::run(tenant, timeline_id, true) - .instrument(span) - .await; - if let Err(e) = res { - match e { - DeleteTimelineError::NotFound => { - // Timeline deletion finished after call to clone above but before call - // to `DeleteTimelineFlow::run` and removed timeline from the map. 
- continue; - } - DeleteTimelineError::AlreadyInProgress(guard) => { - already_running_deletions.push((guard, timeline_id)); - continue; - } - e => return Err(DeleteTenantError::Timeline(e)), - } - } - } - - Ok(already_running_deletions) -} - -async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), DeleteTenantError> { - // Assert timelines dir is empty. - if !fs_ext::is_directory_empty(timelines_path).await? { - // Display first 10 items in directory - let list = fs_ext::list_dir(timelines_path).await.context("list_dir")?; - let list = &list.into_iter().take(10).collect::>(); - return Err(DeleteTenantError::Other(anyhow::anyhow!( - "Timelines directory is not empty after all timelines deletion: {list:?}" - ))); - } - - Ok(()) -} - -async fn remove_tenant_remote_delete_mark( - conf: &PageServerConf, - remote_storage: &GenericRemoteStorage, - tenant_shard_id: &TenantShardId, - cancel: &CancellationToken, -) -> Result<(), DeleteTenantError> { - let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?; - backoff::retry( - || async { remote_storage.delete(&path, cancel).await }, - TimeoutOrCancel::caused_by_cancel, - FAILED_UPLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, - "remove_tenant_remote_delete_mark", - cancel, - ) - .await - .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) - .and_then(|x| x) - .context("remove_tenant_remote_delete_mark")?; - Ok(()) -} - -// Cleanup fs traces: tenant config, timelines dir local delete mark, tenant dir -async fn cleanup_remaining_fs_traces( - conf: &PageServerConf, - tenant_shard_id: &TenantShardId, -) -> Result<(), DeleteTenantError> { - let rm = |p: Utf8PathBuf, is_dir: bool| async move { - if is_dir { - tokio::fs::remove_dir(&p).await - } else { - tokio::fs::remove_file(&p).await - } - .or_else(fs_ext::ignore_not_found) - .with_context(|| format!("failed to delete {p}")) - }; - - rm(conf.tenant_config_path(tenant_shard_id), false).await?; - 
rm(conf.tenant_location_config_path(tenant_shard_id), false).await?; - - fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| { - Err(anyhow::anyhow!( - "failpoint: tenant-delete-before-remove-timelines-dir" - ))? - }); - - rm(conf.timelines_path(tenant_shard_id), true).await?; - - fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| { - Err(anyhow::anyhow!( - "failpoint: tenant-delete-before-remove-deleted-mark" - ))? - }); - - // Make sure previous deletions are ordered before mark removal. - // Otherwise there is no guarantee that they reach the disk before mark deletion. - // So its possible for mark to reach disk first and for other deletions - // to be reordered later and thus missed if a crash occurs. - // Note that we dont need to sync after mark file is removed - // because we can tolerate the case when mark file reappears on startup. - let tenant_path = &conf.tenant_path(tenant_shard_id); - if tenant_path.exists() { - crashsafe::fsync_async(&conf.tenant_path(tenant_shard_id)) - .await - .context("fsync_pre_mark_remove")?; - } - - rm(conf.tenant_deleted_mark_file_path(tenant_shard_id), false).await?; - - rm(conf.tenant_heatmap_path(tenant_shard_id), false).await?; - - fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| { - Err(anyhow::anyhow!( - "failpoint: tenant-delete-before-remove-tenant-dir" - ))? - }); - - rm(conf.tenant_path(tenant_shard_id), true).await?; - - Ok(()) -} - -/// Orchestrates tenant shut down of all tasks, removes its in-memory structures, -/// and deletes its data from both disk and s3. -/// The sequence of steps: -/// 1. Upload remote deletion mark. -/// 2. Create local mark file. -/// 3. Shutdown tasks -/// 4. Run ordered timeline deletions -/// 5. Wait for timeline deletion operations that were scheduled before tenant deletion was requested -/// 6. Remove remote mark -/// 7. 
Cleanup remaining fs traces, tenant dir, config, timelines dir, local delete mark -/// It is resumable from any step in case a crash/restart occurs. -/// There are two entrypoints to the process: -/// 1. [`DeleteTenantFlow::run`] this is the main one called by a management api handler. -/// 2. [`DeleteTenantFlow::resume_from_attach`] is called when deletion is resumed tenant is found to be deleted during attach process. -/// Note the only other place that messes around timeline delete mark is the `Tenant::spawn_load` function. -#[derive(Default)] -pub enum DeleteTenantFlow { - #[default] - NotStarted, - InProgress, - Finished, -} - -impl DeleteTenantFlow { - // These steps are run in the context of management api request handler. - // Long running steps are continued to run in the background. - // NB: If this fails half-way through, and is retried, the retry will go through - // all the same steps again. Make sure the code here is idempotent, and don't - // error out if some of the shutdown tasks have already been completed! - // NOTE: static needed for background part. - // We assume that calling code sets up the span with tenant_id. - #[instrument(skip_all)] - pub(crate) async fn run( - conf: &'static PageServerConf, - remote_storage: GenericRemoteStorage, - tenants: &'static std::sync::RwLock, - tenant: Arc, - cancel: &CancellationToken, - ) -> Result<(), DeleteTenantError> { - span::debug_assert_current_span_has_tenant_id(); - - pausable_failpoint!("tenant-delete-before-run"); - - let mut guard = Self::prepare(&tenant).await?; - - if let Err(e) = Self::run_inner(&mut guard, conf, &remote_storage, &tenant, cancel).await { - tenant.set_broken(format!("{e:#}")).await; - return Err(e); - } - - Self::schedule_background(guard, conf, remote_storage, tenants, tenant); - - Ok(()) - } - - // Helper function needed to be able to match once on returned error and transition tenant into broken state. - // This is needed because tenant.shutwodn is not idempotent. 
If tenant state is set to stopping another call to tenant.shutdown - // will result in an error, but here we need to be able to retry shutdown when tenant deletion is retried. - // So the solution is to set tenant state to broken. - async fn run_inner( - guard: &mut OwnedMutexGuard, - conf: &'static PageServerConf, - remote_storage: &GenericRemoteStorage, - tenant: &Tenant, - cancel: &CancellationToken, - ) -> Result<(), DeleteTenantError> { - guard.mark_in_progress()?; - - fail::fail_point!("tenant-delete-before-create-remote-mark", |_| { - Err(anyhow::anyhow!( - "failpoint: tenant-delete-before-create-remote-mark" - ))? - }); - - create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel) - .await - .context("remote_mark")?; - - fail::fail_point!("tenant-delete-before-create-local-mark", |_| { - Err(anyhow::anyhow!( - "failpoint: tenant-delete-before-create-local-mark" - ))? - }); - - create_local_delete_mark(conf, &tenant.tenant_shard_id) - .await - .context("local delete mark")?; - - fail::fail_point!("tenant-delete-before-background", |_| { - Err(anyhow::anyhow!( - "failpoint: tenant-delete-before-background" - ))? - }); - - Ok(()) - } - - fn mark_in_progress(&mut self) -> anyhow::Result<()> { - match self { - Self::Finished => anyhow::bail!("Bug. Is in finished state"), - Self::InProgress { .. } => { /* We're in a retry */ } - Self::NotStarted => { /* Fresh start */ } - } - - *self = Self::InProgress; - - Ok(()) - } - - pub(crate) async fn should_resume_deletion( - conf: &'static PageServerConf, - remote_mark_exists: bool, - tenant: &Tenant, - ) -> Result, DeleteTenantError> { - let acquire = |t: &Tenant| { - Some( - Arc::clone(&t.delete_progress) - .try_lock_owned() - .expect("we're the only owner during init"), - ) - }; - - if remote_mark_exists { - return Ok(acquire(tenant)); - } - - // Check local mark first, if its there there is no need to go to s3 to check whether remote one exists. 
- if conf - .tenant_deleted_mark_file_path(&tenant.tenant_shard_id) - .exists() - { - Ok(acquire(tenant)) - } else { - Ok(None) - } - } - - pub(crate) async fn resume_from_attach( - guard: DeletionGuard, - tenant: &Arc, - preload: Option, - tenants: &'static std::sync::RwLock, - ctx: &RequestContext, - ) -> Result<(), DeleteTenantError> { - let (_, progress) = completion::channel(); - - tenant - .set_stopping(progress, false, true) - .await - .expect("cant be stopping or broken"); - - tenant - .attach(preload, super::SpawnMode::Eager, ctx) - .await - .context("attach")?; - - Self::background( - guard, - tenant.conf, - tenant.remote_storage.clone(), - tenants, - tenant, - ) - .await - } - - /// Check whether background deletion of this tenant is currently in progress - pub(crate) fn is_in_progress(tenant: &Tenant) -> bool { - tenant.delete_progress.try_lock().is_err() - } - - async fn prepare( - tenant: &Arc, - ) -> Result, DeleteTenantError> { - // FIXME: unsure about active only. Our init jobs may not be cancellable properly, - // so at least for now allow deletions only for active tenants. TODO recheck - // Broken and Stopping is needed for retries. - if !matches!( - tenant.current_state(), - TenantState::Active | TenantState::Broken { .. } - ) { - return Err(DeleteTenantError::InvalidState(tenant.current_state())); - } - - let guard = Arc::clone(&tenant.delete_progress) - .try_lock_owned() - .map_err(|_| DeleteTenantError::AlreadyInProgress)?; - - fail::fail_point!("tenant-delete-before-shutdown", |_| { - Err(anyhow::anyhow!("failpoint: tenant-delete-before-shutdown"))? - }); - - // make pageserver shutdown not to wait for our completion - let (_, progress) = completion::channel(); - - // It would be good to only set stopping here and continue shutdown in the background, but shutdown is not idempotent. - // i e it is an error to do: - // tenant.set_stopping - // tenant.shutdown - // Its also bad that we're holding tenants.read here. 
- // TODO relax set_stopping to be idempotent? - if tenant.shutdown(progress, ShutdownMode::Hard).await.is_err() { - return Err(DeleteTenantError::Other(anyhow::anyhow!( - "tenant shutdown is already in progress" - ))); - } - - Ok(guard) - } - - fn schedule_background( - guard: OwnedMutexGuard, - conf: &'static PageServerConf, - remote_storage: GenericRemoteStorage, - tenants: &'static std::sync::RwLock, - tenant: Arc, - ) { - let tenant_shard_id = tenant.tenant_shard_id; - - task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), - TaskKind::TimelineDeletionWorker, - Some(tenant_shard_id), - None, - "tenant_delete", - false, - async move { - if let Err(err) = - Self::background(guard, conf, remote_storage, tenants, &tenant).await - { - error!("Error: {err:#}"); - tenant.set_broken(format!("{err:#}")).await; - }; - Ok(()) - } - .instrument(tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())), - ); - } - - async fn background( - mut guard: OwnedMutexGuard, - conf: &PageServerConf, - remote_storage: GenericRemoteStorage, - tenants: &'static std::sync::RwLock, - tenant: &Arc, - ) -> Result<(), DeleteTenantError> { - // Tree sort timelines, schedule delete for them. Mention retries from the console side. - // Note that if deletion fails we dont mark timelines as broken, - // the whole tenant will become broken as by `Self::schedule_background` logic - let already_running_timeline_deletions = schedule_ordered_timeline_deletions(tenant) - .await - .context("schedule_ordered_timeline_deletions")?; - - fail::fail_point!("tenant-delete-before-polling-ongoing-deletions", |_| { - Err(anyhow::anyhow!( - "failpoint: tenant-delete-before-polling-ongoing-deletions" - ))? - }); - - // Wait for deletions that were already running at the moment when tenant deletion was requested. - // When we can lock deletion guard it means that corresponding timeline deletion finished. 
- for (guard, timeline_id) in already_running_timeline_deletions { - let flow = guard.lock().await; - if !flow.is_finished() { - return Err(DeleteTenantError::Other(anyhow::anyhow!( - "already running timeline deletion failed: {timeline_id}" - ))); - } - } - - // Remove top-level tenant objects that don't belong to a timeline, such as heatmap - let heatmap_path = remote_heatmap_path(&tenant.tenant_shard_id()); - if let Some(Err(e)) = backoff::retry( - || async { - remote_storage - .delete(&heatmap_path, &task_mgr::shutdown_token()) - .await - }, - TimeoutOrCancel::caused_by_cancel, - FAILED_UPLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, - "remove_remote_tenant_heatmap", - &task_mgr::shutdown_token(), - ) - .await - { - tracing::warn!("Failed to delete heatmap at {heatmap_path}: {e}"); - } - - let timelines_path = conf.timelines_path(&tenant.tenant_shard_id); - // May not exist if we fail in cleanup_remaining_fs_traces after removing it - if timelines_path.exists() { - // sanity check to guard against layout changes - ensure_timelines_dir_empty(&timelines_path) - .await - .context("timelines dir not empty")?; - } - - remove_tenant_remote_delete_mark( - conf, - &remote_storage, - &tenant.tenant_shard_id, - &task_mgr::shutdown_token(), - ) - .await?; - - pausable_failpoint!("tenant-delete-before-cleanup-remaining-fs-traces-pausable"); - fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| { - Err(anyhow::anyhow!( - "failpoint: tenant-delete-before-cleanup-remaining-fs-traces" - ))? - }); - - cleanup_remaining_fs_traces(conf, &tenant.tenant_shard_id) - .await - .context("cleanup_remaining_fs_traces")?; - - { - pausable_failpoint!("tenant-delete-before-map-remove"); - - // This block is simply removing the TenantSlot for this tenant. It requires a loop because - // we might conflict with a TenantSlot::InProgress marker and need to wait for it. 
- // - // This complexity will go away when we simplify how deletion works: - // https://github.com/neondatabase/neon/issues/5080 - loop { - // Under the TenantMap lock, try to remove the tenant. We usually succeed, but if - // we encounter an InProgress marker, yield the barrier it contains and wait on it. - let barrier = { - let mut locked = tenants.write().unwrap(); - let removed = locked.remove(tenant.tenant_shard_id); - - // FIXME: we should not be modifying this from outside of mgr.rs. - // This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080) - - // Update stats - match &removed { - TenantsMapRemoveResult::Occupied(slot) => { - crate::metrics::TENANT_MANAGER.slot_removed(slot); - } - TenantsMapRemoveResult::InProgress(barrier) => { - crate::metrics::TENANT_MANAGER - .slot_removed(&TenantSlot::InProgress(barrier.clone())); - } - TenantsMapRemoveResult::Vacant => { - // Nothing changed in map, no metric update - } - } - - match removed { - TenantsMapRemoveResult::Occupied(TenantSlot::Attached(tenant)) => { - match tenant.current_state() { - TenantState::Stopping { .. } | TenantState::Broken { .. } => { - // Expected: we put the tenant into stopping state before we start deleting it - } - state => { - // Unexpected state - tracing::warn!( - "Tenant in unexpected state {state} after deletion" - ); - } - } - break; - } - TenantsMapRemoveResult::Occupied(TenantSlot::Secondary(_)) => { - // This is unexpected: this secondary tenants should not have been created, and we - // are not in a position to shut it down from here. 
- tracing::warn!("Tenant transitioned to secondary mode while deleting!"); - break; - } - TenantsMapRemoveResult::Occupied(TenantSlot::InProgress(_)) => { - unreachable!("TenantsMap::remove handles InProgress separately, should never return it here"); - } - TenantsMapRemoveResult::Vacant => { - tracing::warn!( - "Tenant removed from TenantsMap before deletion completed" - ); - break; - } - TenantsMapRemoveResult::InProgress(barrier) => { - // An InProgress entry was found, we must wait on its barrier - barrier - } - } - }; - - tracing::info!( - "Waiting for competing operation to complete before deleting state for tenant" - ); - barrier.wait().await; - } - } - - *guard = Self::Finished; - - Ok(()) - } -} diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index 119df3e6c4..0107b0ac7e 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -212,6 +212,7 @@ impl<'a, const L: usize> OnDiskNode<'a, L> { /// /// Public reader object, to search the tree. /// +#[derive(Clone)] pub struct DiskBtreeReader where R: BlockReader, @@ -259,38 +260,55 @@ where Ok(result) } - pub fn iter<'a>( - &'a self, - start_key: &'a [u8; L], - ctx: &'a RequestContext, - ) -> DiskBtreeIterator<'a> { + pub fn iter<'a>(self, start_key: &'a [u8; L], ctx: &'a RequestContext) -> DiskBtreeIterator<'a> + where + R: 'a + Send, + { DiskBtreeIterator { - stream: Box::pin(self.get_stream_from(start_key, ctx)), + stream: Box::pin(self.into_stream(start_key, ctx)), } } /// Return a stream which yields all key, value pairs from the index /// starting from the first key greater or equal to `start_key`. /// - /// Note that this is a copy of [`Self::visit`]. + /// Note 1: that this is a copy of [`Self::visit`]. /// TODO: Once the sequential read path is removed this will become /// the only index traversal method. - pub fn get_stream_from<'a>( - &'a self, + /// + /// Note 2: this function used to take `&self` but it now consumes `self`. 
This is due to + /// the lifetime constraints of the reader and the stream / iterator it creates. Using `&self` + /// requires the reader to be present when the stream is used, and this creates a lifetime + /// dependency between the reader and the stream. Now if we want to create an iterator that + /// holds the stream, someone will need to keep a reference to the reader, which is inconvenient + /// to use from the image/delta layer APIs. + /// + /// Feel free to add the `&self` variant back if it's necessary. + pub fn into_stream<'a>( + self, start_key: &'a [u8; L], ctx: &'a RequestContext, - ) -> impl Stream, u64), DiskBtreeError>> + 'a { + ) -> impl Stream, u64), DiskBtreeError>> + 'a + where + R: 'a, + { try_stream! { let mut stack = Vec::new(); stack.push((self.root_blk, None)); let block_cursor = self.reader.block_cursor(); + let mut node_buf = [0_u8; PAGE_SZ]; while let Some((node_blknum, opt_iter)) = stack.pop() { - // Locate the node. - let node_buf = block_cursor + // Read the node, through the PS PageCache, into local variable `node_buf`. + // We could keep the page cache read guard alive, but, at the time of writing, + // we run quite small PS PageCache s => can't risk running out of + // PageCache space because this stream isn't consumed fast enough. + let page_read_guard = block_cursor .read_blk(self.start_blk + node_blknum, ctx) .await?; + node_buf.copy_from_slice(page_read_guard.as_ref()); + drop(page_read_guard); // drop page cache read guard early - let node = OnDiskNode::deparse(node_buf.as_ref())?; + let node = OnDiskNode::deparse(&node_buf)?; let prefix_len = node.prefix_len as usize; let suffix_len = node.suffix_len as usize; @@ -333,6 +351,7 @@ where Either::Left(idx..node.num_children.into()) }; + // idx points to the first match now. 
Keep going from there while let Some(idx) = iter.next() { let key_off = idx * suffix_len; @@ -509,7 +528,7 @@ where pub struct DiskBtreeIterator<'a> { #[allow(clippy::type_complexity)] stream: std::pin::Pin< - Box, u64), DiskBtreeError>> + 'a>, + Box, u64), DiskBtreeError>> + 'a + Send>, >, } @@ -538,10 +557,10 @@ where /// We maintain the length of the stack to be always greater than zero. /// Two exceptions are: /// 1. `Self::flush_node`. The method will push the new node if it extracted the last one. - /// So because other methods cannot see the intermediate state invariant still holds. + /// So because other methods cannot see the intermediate state invariant still holds. /// 2. `Self::finish`. It consumes self and does not return it back, - /// which means that this is where the structure is destroyed. - /// Thus stack of zero length cannot be observed by other methods. + /// which means that this is where the structure is destroyed. + /// Thus stack of zero length cannot be observed by other methods. stack: Vec>, /// Last key that was appended to the tree. Used to sanity check that append diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 79cc7bf153..44f0fc7ab1 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -28,6 +28,7 @@ impl EphemeralFile { conf: &PageServerConf, tenant_shard_id: TenantShardId, timeline_id: TimelineId, + gate_guard: utils::sync::gate::GateGuard, ctx: &RequestContext, ) -> Result { static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1); @@ -53,7 +54,7 @@ impl EphemeralFile { Ok(EphemeralFile { _tenant_shard_id: tenant_shard_id, _timeline_id: timeline_id, - rw: page_caching::RW::new(file), + rw: page_caching::RW::new(file, gate_guard), }) } @@ -65,6 +66,11 @@ impl EphemeralFile { self.rw.page_cache_file_id() } + /// See [`self::page_caching::RW::load_to_vec`]. 
+ pub(crate) async fn load_to_vec(&self, ctx: &RequestContext) -> Result, io::Error> { + self.rw.load_to_vec(ctx).await + } + pub(crate) async fn read_blk( &self, blknum: u32, @@ -73,6 +79,8 @@ impl EphemeralFile { self.rw.read_blk(blknum, ctx).await } + #[cfg(test)] + // This is a test helper: outside of tests, we are always written to via a pre-serialized batch. pub(crate) async fn write_blob( &mut self, srcbuf: &[u8], @@ -80,17 +88,30 @@ impl EphemeralFile { ) -> Result { let pos = self.rw.bytes_written(); - // Write the length field - if srcbuf.len() < 0x80 { - // short one-byte length header - let len_buf = [srcbuf.len() as u8]; + let mut len_bytes = std::io::Cursor::new(Vec::new()); + crate::tenant::storage_layer::inmemory_layer::SerializedBatch::write_blob_length( + srcbuf.len(), + &mut len_bytes, + ); + let len_bytes = len_bytes.into_inner(); - self.rw.write_all_borrowed(&len_buf, ctx).await?; - } else { - let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32); - len_buf[0] |= 0x80; - self.rw.write_all_borrowed(&len_buf, ctx).await?; - } + // Write the length field + self.rw.write_all_borrowed(&len_bytes, ctx).await?; + + // Write the payload + self.rw.write_all_borrowed(srcbuf, ctx).await?; + + Ok(pos) + } + + /// Returns the offset at which the first byte of the input was written, for use + /// in constructing indices over the written value. 
+ pub(crate) async fn write_raw( + &mut self, + srcbuf: &[u8], + ctx: &RequestContext, + ) -> Result { + let pos = self.rw.bytes_written(); // Write the payload self.rw.write_all_borrowed(srcbuf, ctx).await?; @@ -155,7 +176,11 @@ mod tests { async fn test_ephemeral_blobs() -> Result<(), io::Error> { let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?; - let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &ctx).await?; + let gate = utils::sync::gate::Gate::default(); + + let entered = gate.enter().unwrap(); + + let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, entered, &ctx).await?; let pos_foo = file.write_blob(b"foo", &ctx).await?; assert_eq!( @@ -209,4 +234,38 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn ephemeral_file_holds_gate_open() { + const FOREVER: std::time::Duration = std::time::Duration::from_secs(5); + + let (conf, tenant_id, timeline_id, ctx) = + harness("ephemeral_file_holds_gate_open").unwrap(); + + let gate = utils::sync::gate::Gate::default(); + + let file = EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx) + .await + .unwrap(); + + let mut closing = tokio::task::spawn(async move { + gate.close().await; + }); + + // gate is entered until the ephemeral file is dropped + // do not start paused tokio-epoll-uring has a sleep loop + tokio::time::pause(); + tokio::time::timeout(FOREVER, &mut closing) + .await + .expect_err("closing cannot complete before dropping"); + + // this is a requirement of the reset_tenant functionality: we have to be able to restart a + // tenant fast, and for that, we need all tenant_dir operations be guarded by entering a gate + drop(file); + + tokio::time::timeout(FOREVER, &mut closing) + .await + .expect("closing completes right away") + .expect("closing does not panic"); + } } diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs index 276ac87064..48926354f1 100644 --- 
a/pageserver/src/tenant/ephemeral_file/page_caching.rs +++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs @@ -1,13 +1,15 @@ //! Wrapper around [`super::zero_padded_read_write::RW`] that uses the //! [`crate::page_cache`] to serve reads that need to go to the underlying [`VirtualFile`]. +//! +//! Subject to removal in use crate::context::RequestContext; use crate::page_cache::{self, PAGE_SZ}; use crate::tenant::block_io::BlockLease; +use crate::virtual_file::owned_buffers_io::util::size_tracking_writer; use crate::virtual_file::VirtualFile; -use once_cell::sync::Lazy; -use std::io::{self, ErrorKind}; +use std::io::{self}; use tokio_epoll_uring::BoundedBuf; use tracing::*; @@ -16,18 +18,18 @@ use super::zero_padded_read_write; /// See module-level comment. pub struct RW { page_cache_file_id: page_cache::FileId, - rw: super::zero_padded_read_write::RW, + rw: super::zero_padded_read_write::RW>, + /// Gate guard is held on as long as we need to do operations in the path (delete on drop). + _gate_guard: utils::sync::gate::GateGuard, } impl RW { - pub fn new(file: VirtualFile) -> Self { + pub fn new(file: VirtualFile, _gate_guard: utils::sync::gate::GateGuard) -> Self { let page_cache_file_id = page_cache::next_file_id(); Self { page_cache_file_id, - rw: super::zero_padded_read_write::RW::new(PreWarmingWriter::new( - page_cache_file_id, - file, - )), + rw: super::zero_padded_read_write::RW::new(size_tracking_writer::Writer::new(file)), + _gate_guard, } } @@ -49,6 +51,43 @@ impl RW { self.rw.bytes_written() } + /// Load all blocks that can be read via [`Self::read_blk`] into a contiguous memory buffer. + /// + /// This includes the blocks that aren't yet flushed to disk by the internal buffered writer. + /// The last block is zero-padded to [`PAGE_SZ`], so, the returned buffer is always a multiple of [`PAGE_SZ`]. 
+ pub(super) async fn load_to_vec(&self, ctx: &RequestContext) -> Result, io::Error> { + // round up to the next PAGE_SZ multiple, required by blob_io + let size = { + let s = usize::try_from(self.bytes_written()).unwrap(); + if s % PAGE_SZ == 0 { + s + } else { + s.checked_add(PAGE_SZ - (s % PAGE_SZ)).unwrap() + } + }; + let vec = Vec::with_capacity(size); + + // read from disk what we've already flushed + let file_size_tracking_writer = self.rw.as_writer(); + let flushed_range = 0..usize::try_from(file_size_tracking_writer.bytes_written()).unwrap(); + let mut vec = file_size_tracking_writer + .as_inner() + .read_exact_at( + vec.slice(0..(flushed_range.end - flushed_range.start)), + u64::try_from(flushed_range.start).unwrap(), + ctx, + ) + .await? + .into_inner(); + + // copy from in-memory buffer what we haven't flushed yet but would return when accessed via read_blk + let buffered = self.rw.get_tail_zero_padded(); + vec.extend_from_slice(buffered); + assert_eq!(vec.len(), size); + assert_eq!(vec.len() % PAGE_SZ, 0); + Ok(vec) + } + pub(crate) async fn read_blk( &self, blknum: u32, @@ -67,7 +106,7 @@ impl RW { format!( "ephemeral file: read immutable page #{}: {}: {:#}", blknum, - self.rw.as_writer().file.path, + self.rw.as_writer().as_inner().path, e, ), ) @@ -77,7 +116,7 @@ impl RW { } page_cache::ReadBufResult::NotFound(write_guard) => { let write_guard = writer - .file + .as_inner() .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64, ctx) .await?; let read_guard = write_guard.mark_valid(); @@ -98,126 +137,17 @@ impl Drop for RW { // We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed. 
// unlink the file - let res = std::fs::remove_file(&self.rw.as_writer().file.path); + // we are clear to do this, because we have entered a gate + let path = &self.rw.as_writer().as_inner().path; + let res = std::fs::remove_file(path); if let Err(e) = res { if e.kind() != std::io::ErrorKind::NotFound { // just never log the not found errors, we cannot do anything for them; on detach // the tenant directory is already gone. // // not found files might also be related to https://github.com/neondatabase/neon/issues/2442 - error!( - "could not remove ephemeral file '{}': {}", - self.rw.as_writer().file.path, - e - ); + error!("could not remove ephemeral file '{path}': {e}"); } } } } - -struct PreWarmingWriter { - nwritten_blocks: u32, - page_cache_file_id: page_cache::FileId, - file: VirtualFile, -} - -impl PreWarmingWriter { - fn new(page_cache_file_id: page_cache::FileId, file: VirtualFile) -> Self { - Self { - nwritten_blocks: 0, - page_cache_file_id, - file, - } - } -} - -impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter { - async fn write_all< - B: tokio_epoll_uring::BoundedBuf, - Buf: tokio_epoll_uring::IoBuf + Send, - >( - &mut self, - buf: B, - ctx: &RequestContext, - ) -> std::io::Result<(usize, B::Buf)> { - let buf = buf.slice(..); - let saved_bounds = buf.bounds(); // save for reconstructing the Slice from iobuf after the IO is done - let check_bounds_stuff_works = if cfg!(test) && cfg!(debug_assertions) { - Some(buf.to_vec()) - } else { - None - }; - let buflen = buf.len(); - assert_eq!( - buflen % PAGE_SZ, - 0, - "{buflen} ; we know TAIL_SZ is a PAGE_SZ multiple, and write_buffered_borrowed is used" - ); - - // Do the IO. 
- let iobuf = match self.file.write_all(buf, ctx).await { - (iobuf, Ok(nwritten)) => { - assert_eq!(nwritten, buflen); - iobuf - } - (_, Err(e)) => { - return Err(std::io::Error::new( - ErrorKind::Other, - // order error before path because path is long and error is short - format!( - "ephemeral_file: write_blob: write-back tail self.nwritten_blocks={}, buflen={}, {:#}: {}", - self.nwritten_blocks, buflen, e, self.file.path, - ), - )); - } - }; - - // Reconstruct the Slice (the write path consumed the Slice and returned us the underlying IoBuf) - let buf = tokio_epoll_uring::Slice::from_buf_bounds(iobuf, saved_bounds); - if let Some(check_bounds_stuff_works) = check_bounds_stuff_works { - assert_eq!(&check_bounds_stuff_works, &*buf); - } - - // Pre-warm page cache with the contents. - // At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming - // benefits the code that writes InMemoryLayer=>L0 layers. - let nblocks = buflen / PAGE_SZ; - let nblocks32 = u32::try_from(nblocks).unwrap(); - let cache = page_cache::get(); - static CTX: Lazy = Lazy::new(|| { - RequestContext::new( - crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache, - crate::context::DownloadBehavior::Error, - ) - }); - for blknum_in_buffer in 0..nblocks { - let blk_in_buffer = &buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ]; - let blknum = self - .nwritten_blocks - .checked_add(blknum_in_buffer as u32) - .unwrap(); - match cache - .read_immutable_buf(self.page_cache_file_id, blknum, &CTX) - .await - { - Err(e) => { - error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}"); - // fail gracefully, it's not the end of the world if we can't pre-warm the cache here - } - Ok(v) => match v { - page_cache::ReadBufResult::Found(_guard) => { - // This function takes &mut self, so, it shouldn't be possible to reach this point. 
- unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \ - and this function takes &mut self, so, no concurrent read_blk is possible"); - } - page_cache::ReadBufResult::NotFound(mut write_guard) => { - write_guard.copy_from_slice(blk_in_buffer); - let _ = write_guard.mark_valid(); - } - }, - } - } - self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap(); - Ok((buflen, buf.into_inner())) - } -} diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs index b37eafb52c..fe310acab8 100644 --- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs +++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs @@ -75,6 +75,21 @@ where flushed_offset + u64::try_from(buffer.pending()).unwrap() } + /// Get a slice of all blocks that [`Self::read_blk`] would return as [`ReadResult::ServedFromZeroPaddedMutableTail`]. + pub fn get_tail_zero_padded(&self) -> &[u8] { + let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); + let buffer_written_up_to = buffer.pending(); + // pad to next page boundary + let read_up_to = if buffer_written_up_to % PAGE_SZ == 0 { + buffer_written_up_to + } else { + buffer_written_up_to + .checked_add(PAGE_SZ - (buffer_written_up_to % PAGE_SZ)) + .unwrap() + }; + &buffer.as_zero_padded_slice()[0..read_up_to] + } + pub(crate) async fn read_blk(&self, blknum: u32) -> Result, std::io::Error> { let flushed_offset = self.buffered_writer.as_inner().bytes_written(); let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs index f90291bbf8..2dc0277638 100644 --- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs +++ 
b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs @@ -5,6 +5,8 @@ use std::mem::MaybeUninit; +use crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice; + /// See module-level comment. pub struct Buffer { allocation: Box<[u8; N]>, @@ -60,10 +62,10 @@ impl crate::virtual_file::owned_buffers_io::write::Buffer for Bu self.written } - fn flush(self) -> tokio_epoll_uring::Slice { + fn flush(self) -> FullSlice { self.invariants(); let written = self.written; - tokio_epoll_uring::BoundedBuf::slice(self, 0..written) + FullSlice::must_new(tokio_epoll_uring::BoundedBuf::slice(self, 0..written)) } fn reuse_after_flush(iobuf: Self::IoBuf) -> Self { diff --git a/pageserver/src/tenant/gc_block.rs b/pageserver/src/tenant/gc_block.rs new file mode 100644 index 0000000000..8b41ba1746 --- /dev/null +++ b/pageserver/src/tenant/gc_block.rs @@ -0,0 +1,213 @@ +use std::collections::HashMap; + +use utils::id::TimelineId; + +use super::remote_timeline_client::index::GcBlockingReason; + +type Storage = HashMap>; + +#[derive(Default)] +pub(crate) struct GcBlock { + /// The timelines which have current reasons to block gc. + /// + /// LOCK ORDER: this is held locked while scheduling the next index_part update. This is done + /// to keep the this field up to date with RemoteTimelineClient `upload_queue.dirty`. + reasons: std::sync::Mutex, + blocking: tokio::sync::Mutex<()>, +} + +impl GcBlock { + /// Start another gc iteration. + /// + /// Returns a guard to be held for the duration of gc iteration to allow synchronizing with + /// it's ending, or if not currently possible, a value describing the reasons why not. + /// + /// Cancellation safe. + pub(super) async fn start(&self) -> Result, BlockingReasons> { + let reasons = { + let g = self.reasons.lock().unwrap(); + + // TODO: the assumption is that this method gets called periodically. in prod, we use 1h, in + // tests, we use everything. 
we should warn if the gc has been consecutively blocked + // for more than 1h (within single tenant session?). + BlockingReasons::clean_and_summarize(g) + }; + + if let Some(reasons) = reasons { + Err(reasons) + } else { + Ok(Guard { + _inner: self.blocking.lock().await, + }) + } + } + + pub(crate) fn summary(&self) -> Option { + let g = self.reasons.lock().unwrap(); + + BlockingReasons::summarize(&g) + } + + /// Start blocking gc for this one timeline for the given reason. + /// + /// This is not a guard based API but instead it mimics set API. The returned future will not + /// resolve until an existing gc round has completed. + /// + /// Returns true if this block was new, false if gc was already blocked for this reason. + /// + /// Cancellation safe: cancelling after first poll will keep the reason to block gc, but will + /// keep the gc blocking reason. + pub(crate) async fn insert( + &self, + timeline: &super::Timeline, + reason: GcBlockingReason, + ) -> anyhow::Result { + let (added, uploaded) = { + let mut g = self.reasons.lock().unwrap(); + let set = g.entry(timeline.timeline_id).or_default(); + let added = set.insert(reason); + + // LOCK ORDER: intentionally hold the lock, see self.reasons. + let uploaded = timeline + .remote_client + .schedule_insert_gc_block_reason(reason)?; + + (added, uploaded) + }; + + uploaded.await?; + + // ensure that any ongoing gc iteration has completed + drop(self.blocking.lock().await); + + Ok(added) + } + + /// Remove blocking gc for this one timeline and the given reason. 
+ pub(crate) async fn remove( + &self, + timeline: &super::Timeline, + reason: GcBlockingReason, + ) -> anyhow::Result<()> { + use std::collections::hash_map::Entry; + + super::span::debug_assert_current_span_has_tenant_and_timeline_id(); + + let (remaining_blocks, uploaded) = { + let mut g = self.reasons.lock().unwrap(); + match g.entry(timeline.timeline_id) { + Entry::Occupied(mut oe) => { + let set = oe.get_mut(); + set.remove(reason); + if set.is_empty() { + oe.remove(); + } + } + Entry::Vacant(_) => { + // we must still do the index_part.json update regardless, in case we had earlier + // been cancelled + } + } + + let remaining_blocks = g.len(); + + // LOCK ORDER: intentionally hold the lock while scheduling; see self.reasons + let uploaded = timeline + .remote_client + .schedule_remove_gc_block_reason(reason)?; + + (remaining_blocks, uploaded) + }; + uploaded.await?; + + // no need to synchronize with gc iteration again + + if remaining_blocks > 0 { + tracing::info!(remaining_blocks, removed=?reason, "gc blocking removed, but gc remains blocked"); + } else { + tracing::info!("gc is now unblocked for the tenant"); + } + + Ok(()) + } + + pub(crate) fn before_delete(&self, timeline: &super::Timeline) { + let unblocked = { + let mut g = self.reasons.lock().unwrap(); + if g.is_empty() { + return; + } + + g.remove(&timeline.timeline_id); + + BlockingReasons::clean_and_summarize(g).is_none() + }; + + if unblocked { + tracing::info!("gc is now unblocked following deletion"); + } + } + + /// Initialize with the non-deleted timelines of this tenant. 
+ pub(crate) fn set_scanned(&self, scanned: Storage) { + let mut g = self.reasons.lock().unwrap(); + assert!(g.is_empty()); + g.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty())); + + if let Some(reasons) = BlockingReasons::clean_and_summarize(g) { + tracing::info!(summary=?reasons, "initialized with gc blocked"); + } + } +} + +pub(super) struct Guard<'a> { + _inner: tokio::sync::MutexGuard<'a, ()>, +} + +#[derive(Debug)] +pub(crate) struct BlockingReasons { + timelines: usize, + reasons: enumset::EnumSet, +} + +impl std::fmt::Display for BlockingReasons { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{} timelines block for {:?}", + self.timelines, self.reasons + ) + } +} + +impl BlockingReasons { + fn clean_and_summarize(mut g: std::sync::MutexGuard<'_, Storage>) -> Option { + let mut reasons = enumset::EnumSet::empty(); + g.retain(|_key, value| { + reasons = reasons.union(*value); + !value.is_empty() + }); + if !g.is_empty() { + Some(BlockingReasons { + timelines: g.len(), + reasons, + }) + } else { + None + } + } + + fn summarize(g: &std::sync::MutexGuard<'_, Storage>) -> Option { + if g.is_empty() { + None + } else { + let reasons = g + .values() + .fold(enumset::EnumSet::empty(), |acc, next| acc.union(*next)); + Some(BlockingReasons { + timelines: g.len(), + reasons, + }) + } + } +} diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 2724a5cc07..844f117ea2 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -51,7 +51,8 @@ use crate::keyspace::KeyPartitioning; use crate::repository::Key; use crate::tenant::storage_layer::InMemoryLayer; use anyhow::Result; -use pageserver_api::keyspace::KeySpaceAccum; +use pageserver_api::keyspace::{KeySpace, KeySpaceAccum}; +use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze}; use std::collections::{HashMap, VecDeque}; use std::iter::Peekable; use std::ops::Range; @@ -61,7 +62,7 @@ use 
utils::lsn::Lsn; use historic_layer_coverage::BufferedHistoricLayerCoverage; pub use historic_layer_coverage::LayerKey; -use super::storage_layer::PersistentLayerDesc; +use super::storage_layer::{LayerVisibilityHint, PersistentLayerDesc}; /// /// LayerMap tracks what layers exist on a timeline. @@ -463,7 +464,7 @@ impl LayerMap { pub(self) fn insert_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) { // TODO: See #3869, resulting #4088, attempted fix and repro #4094 - if Self::is_l0(&layer_desc) { + if Self::is_l0(&layer_desc.key_range) { self.l0_delta_layers.push(layer_desc.clone().into()); } @@ -482,7 +483,7 @@ impl LayerMap { self.historic .remove(historic_layer_coverage::LayerKey::from(layer_desc)); let layer_key = layer_desc.key(); - if Self::is_l0(layer_desc) { + if Self::is_l0(&layer_desc.key_range) { let len_before = self.l0_delta_layers.len(); let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers); l0_delta_layers.retain(|other| other.key() != layer_key); @@ -598,8 +599,9 @@ impl LayerMap { coverage } - pub fn is_l0(layer: &PersistentLayerDesc) -> bool { - layer.get_key_range() == (Key::MIN..Key::MAX) + /// Check if the key range resembles that of an L0 layer. + pub fn is_l0(key_range: &Range) -> bool { + key_range == &(Key::MIN..Key::MAX) } /// This function determines which layers are counted in `count_deltas`: @@ -626,7 +628,7 @@ impl LayerMap { /// than just the current partition_range. 
pub fn is_reimage_worthy(layer: &PersistentLayerDesc, partition_range: &Range) -> bool { // Case 1 - if !Self::is_l0(layer) { + if !Self::is_l0(&layer.key_range) { return true; } @@ -844,8 +846,8 @@ impl LayerMap { } /// Return all L0 delta layers - pub fn get_level0_deltas(&self) -> Result>> { - Ok(self.l0_delta_layers.to_vec()) + pub fn level0_deltas(&self) -> &Vec> { + &self.l0_delta_layers } /// debugging function to print out the contents of the layer map @@ -870,11 +872,183 @@ impl LayerMap { println!("End dump LayerMap"); Ok(()) } + + /// `read_points` represent the tip of a timeline and any branch points, i.e. the places + /// where we expect to serve reads. + /// + /// This function is O(N) and should be called infrequently. The caller is responsible for + /// looking up and updating the Layer objects for these layer descriptors. + pub fn get_visibility( + &self, + mut read_points: Vec, + ) -> ( + Vec<(Arc, LayerVisibilityHint)>, + KeySpace, + ) { + // This is like a KeySpace, but this type is intended for efficient unions with image layer ranges, whereas + // KeySpace is intended to be composed statically and iterated over. + struct KeyShadow { + // Map of range start to range end + inner: RangeSetBlaze, + } + + impl KeyShadow { + fn new() -> Self { + Self { + inner: Default::default(), + } + } + + fn contains(&self, range: Range) -> bool { + let range_incl = range.start.to_i128()..=range.end.to_i128() - 1; + self.inner.is_superset(&RangeSetBlaze::from_sorted_disjoint( + CheckSortedDisjoint::from([range_incl]), + )) + } + + /// Add the input range to the keys covered by self. 
+ /// + /// Return true if inserting this range covered some keys that were previously not covered + fn cover(&mut self, insert: Range) -> bool { + let range_incl = insert.start.to_i128()..=insert.end.to_i128() - 1; + self.inner.ranges_insert(range_incl) + } + + fn reset(&mut self) { + self.inner = Default::default(); + } + + fn to_keyspace(&self) -> KeySpace { + let mut accum = KeySpaceAccum::new(); + for range_incl in self.inner.ranges() { + let range = Range { + start: Key::from_i128(*range_incl.start()), + end: Key::from_i128(range_incl.end() + 1), + }; + accum.add_range(range) + } + + accum.to_keyspace() + } + } + + // The 'shadow' will be updated as we sweep through the layers: an image layer subtracts from the shadow, + // and a ReadPoint + read_points.sort_by_key(|rp| rp.0); + let mut shadow = KeyShadow::new(); + + // We will interleave all our read points and layers into a sorted collection + enum Item { + ReadPoint { lsn: Lsn }, + Layer(Arc), + } + + let mut items = Vec::with_capacity(self.historic.len() + read_points.len()); + items.extend(self.iter_historic_layers().map(Item::Layer)); + items.extend( + read_points + .into_iter() + .map(|rp| Item::ReadPoint { lsn: rp }), + ); + + // Ordering: we want to iterate like this: + // 1. Highest LSNs first + // 2. Consider images before deltas if they end at the same LSNs (images cover deltas) + // 3. 
Consider ReadPoints before image layers if they're at the same LSN (readpoints make that image visible) + items.sort_by_key(|item| { + std::cmp::Reverse(match item { + Item::Layer(layer) => { + if layer.is_delta() { + (Lsn(layer.get_lsn_range().end.0 - 1), 0) + } else { + (layer.image_layer_lsn(), 1) + } + } + Item::ReadPoint { lsn } => (*lsn, 2), + }) + }); + + let mut results = Vec::with_capacity(self.historic.len()); + + let mut maybe_covered_deltas: Vec> = Vec::new(); + + for item in items { + let (reached_lsn, is_readpoint) = match &item { + Item::ReadPoint { lsn } => (lsn, true), + Item::Layer(layer) => (&layer.lsn_range.start, false), + }; + maybe_covered_deltas.retain(|d| { + if *reached_lsn >= d.lsn_range.start && is_readpoint { + // We encountered a readpoint within the delta layer: it is visible + + results.push((d.clone(), LayerVisibilityHint::Visible)); + false + } else if *reached_lsn < d.lsn_range.start { + // We passed the layer's range without encountering a read point: it is not visible + results.push((d.clone(), LayerVisibilityHint::Covered)); + false + } else { + // We're still in the delta layer: continue iterating + true + } + }); + + match item { + Item::ReadPoint { lsn: _lsn } => { + // TODO: propagate the child timeline's shadow from their own run of this function, so that we don't have + // to assume that the whole key range is visible at the branch point. + shadow.reset(); + } + Item::Layer(layer) => { + let visibility = if layer.is_delta() { + if shadow.contains(layer.get_key_range()) { + // If a layer isn't visible based on current state, we must defer deciding whether + // it is truly not visible until we have advanced past the delta's range: we might + // encounter another branch point within this delta layer's LSN range. 
+ maybe_covered_deltas.push(layer); + continue; + } else { + LayerVisibilityHint::Visible + } + } else { + let modified = shadow.cover(layer.get_key_range()); + if modified { + // An image layer in a region which wasn't fully covered yet: this layer is visible, but layers below it will be covered + LayerVisibilityHint::Visible + } else { + // An image layer in a region that was already covered + LayerVisibilityHint::Covered + } + }; + + results.push((layer, visibility)); + } + } + } + + // Drain any remaining maybe_covered deltas + results.extend( + maybe_covered_deltas + .into_iter() + .map(|d| (d, LayerVisibilityHint::Covered)), + ); + + (results, shadow.to_keyspace()) + } } #[cfg(test)] mod tests { - use pageserver_api::keyspace::KeySpace; + use crate::tenant::{storage_layer::LayerName, IndexPart}; + use pageserver_api::{ + key::DBDIR_KEY, + keyspace::{KeySpace, KeySpaceRandomAccum}, + }; + use std::{collections::HashMap, path::PathBuf}; + use utils::{ + id::{TenantId, TimelineId}, + shard::TenantShardId, + }; use super::*; @@ -1001,4 +1175,299 @@ mod tests { } } } + + #[test] + fn layer_visibility_basic() { + // A simple synthetic input, as a smoke test. 
+ let tenant_shard_id = TenantShardId::unsharded(TenantId::generate()); + let timeline_id = TimelineId::generate(); + let mut layer_map = LayerMap::default(); + let mut updates = layer_map.batch_update(); + + const FAKE_LAYER_SIZE: u64 = 1024; + + let inject_delta = |updates: &mut BatchedUpdates, + key_start: i128, + key_end: i128, + lsn_start: u64, + lsn_end: u64| { + let desc = PersistentLayerDesc::new_delta( + tenant_shard_id, + timeline_id, + Range { + start: Key::from_i128(key_start), + end: Key::from_i128(key_end), + }, + Range { + start: Lsn(lsn_start), + end: Lsn(lsn_end), + }, + 1024, + ); + updates.insert_historic(desc.clone()); + desc + }; + + let inject_image = + |updates: &mut BatchedUpdates, key_start: i128, key_end: i128, lsn: u64| { + let desc = PersistentLayerDesc::new_img( + tenant_shard_id, + timeline_id, + Range { + start: Key::from_i128(key_start), + end: Key::from_i128(key_end), + }, + Lsn(lsn), + FAKE_LAYER_SIZE, + ); + updates.insert_historic(desc.clone()); + desc + }; + + // + // Construct our scenario: the following lines go in backward-LSN order, constructing the various scenarios + // we expect to handle. You can follow these examples through in the same order as they would be processed + // by the function under test. 
+ // + + let mut read_points = vec![Lsn(1000)]; + + // A delta ahead of any image layer + let ahead_layer = inject_delta(&mut updates, 10, 20, 101, 110); + + // An image layer is visible and covers some layers beneath itself + let visible_covering_img = inject_image(&mut updates, 5, 25, 99); + + // A delta layer covered by the image layer: should be covered + let covered_delta = inject_delta(&mut updates, 10, 20, 90, 100); + + // A delta layer partially covered by an image layer: should be visible + let partially_covered_delta = inject_delta(&mut updates, 1, 7, 90, 100); + + // A delta layer not covered by an image layer: should be visible + let not_covered_delta = inject_delta(&mut updates, 1, 4, 90, 100); + + // An image layer covered by the image layer above: should be covered + let covered_image = inject_image(&mut updates, 10, 20, 89); + + // An image layer partially covered by an image layer: should be visible + let partially_covered_image = inject_image(&mut updates, 1, 7, 89); + + // An image layer not covered by an image layer: should be visible + let not_covered_image = inject_image(&mut updates, 1, 4, 89); + + // A read point: this will make subsequent layers below here visible, even if there are + // more recent layers covering them. 
+ read_points.push(Lsn(80)); + + // A delta layer covered by an earlier image layer, but visible to a readpoint below that covering layer + let covered_delta_below_read_point = inject_delta(&mut updates, 10, 20, 70, 79); + + // A delta layer whose end LSN is covered, but where a read point is present partway through its LSN range: + // the read point should make it visible, even though its end LSN is covered + let covering_img_between_read_points = inject_image(&mut updates, 10, 20, 69); + let covered_delta_between_read_points = inject_delta(&mut updates, 10, 15, 67, 69); + read_points.push(Lsn(65)); + let covered_delta_intersects_read_point = inject_delta(&mut updates, 15, 20, 60, 69); + + let visible_img_after_last_read_point = inject_image(&mut updates, 10, 20, 65); + + updates.flush(); + + let (layer_visibilities, shadow) = layer_map.get_visibility(read_points); + let layer_visibilities = layer_visibilities.into_iter().collect::>(); + + assert_eq!( + layer_visibilities.get(&ahead_layer), + Some(&LayerVisibilityHint::Visible) + ); + assert_eq!( + layer_visibilities.get(&visible_covering_img), + Some(&LayerVisibilityHint::Visible) + ); + assert_eq!( + layer_visibilities.get(&covered_delta), + Some(&LayerVisibilityHint::Covered) + ); + assert_eq!( + layer_visibilities.get(&partially_covered_delta), + Some(&LayerVisibilityHint::Visible) + ); + assert_eq!( + layer_visibilities.get(¬_covered_delta), + Some(&LayerVisibilityHint::Visible) + ); + assert_eq!( + layer_visibilities.get(&covered_image), + Some(&LayerVisibilityHint::Covered) + ); + assert_eq!( + layer_visibilities.get(&partially_covered_image), + Some(&LayerVisibilityHint::Visible) + ); + assert_eq!( + layer_visibilities.get(¬_covered_image), + Some(&LayerVisibilityHint::Visible) + ); + assert_eq!( + layer_visibilities.get(&covered_delta_below_read_point), + Some(&LayerVisibilityHint::Visible) + ); + assert_eq!( + layer_visibilities.get(&covering_img_between_read_points), + 
Some(&LayerVisibilityHint::Visible) + ); + assert_eq!( + layer_visibilities.get(&covered_delta_between_read_points), + Some(&LayerVisibilityHint::Covered) + ); + assert_eq!( + layer_visibilities.get(&covered_delta_intersects_read_point), + Some(&LayerVisibilityHint::Visible) + ); + assert_eq!( + layer_visibilities.get(&visible_img_after_last_read_point), + Some(&LayerVisibilityHint::Visible) + ); + + // Shadow should include all the images below the last read point + let expected_shadow = KeySpace { + ranges: vec![Key::from_i128(10)..Key::from_i128(20)], + }; + assert_eq!(shadow, expected_shadow); + } + + fn fixture_path(relative: &str) -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative) + } + + #[test] + fn layer_visibility_realistic() { + // Load a large example layermap + let index_raw = std::fs::read_to_string(fixture_path( + "test_data/indices/mixed_workload/index_part.json", + )) + .unwrap(); + let index: IndexPart = serde_json::from_str::(&index_raw).unwrap(); + + let tenant_id = TenantId::generate(); + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + let timeline_id = TimelineId::generate(); + + let mut layer_map = LayerMap::default(); + let mut updates = layer_map.batch_update(); + for (layer_name, layer_metadata) in index.layer_metadata { + let layer_desc = match layer_name { + LayerName::Image(layer_name) => PersistentLayerDesc { + key_range: layer_name.key_range.clone(), + lsn_range: layer_name.lsn_as_range(), + tenant_shard_id, + timeline_id, + is_delta: false, + file_size: layer_metadata.file_size, + }, + LayerName::Delta(layer_name) => PersistentLayerDesc { + key_range: layer_name.key_range, + lsn_range: layer_name.lsn_range, + tenant_shard_id, + timeline_id, + is_delta: true, + file_size: layer_metadata.file_size, + }, + }; + updates.insert_historic(layer_desc); + } + updates.flush(); + + let read_points = vec![index.metadata.disk_consistent_lsn()]; + let (layer_visibilities, shadow) = 
layer_map.get_visibility(read_points); + for (layer_desc, visibility) in &layer_visibilities { + tracing::info!("{layer_desc:?}: {visibility:?}"); + eprintln!("{layer_desc:?}: {visibility:?}"); + } + + // The shadow should be non-empty, since there were some image layers + assert!(!shadow.ranges.is_empty()); + + // At least some layers should be marked covered + assert!(layer_visibilities + .iter() + .any(|i| matches!(i.1, LayerVisibilityHint::Covered))); + + let layer_visibilities = layer_visibilities.into_iter().collect::>(); + + // Brute force validation: a layer should be marked covered if and only if there are image layers above it in LSN order which cover it + for (layer_desc, visible) in &layer_visibilities { + let mut coverage = KeySpaceRandomAccum::new(); + let mut covered_by = Vec::new(); + + for other_layer in layer_map.iter_historic_layers() { + if &other_layer == layer_desc { + continue; + } + if !other_layer.is_delta() + && other_layer.image_layer_lsn() >= Lsn(layer_desc.get_lsn_range().end.0 - 1) + && other_layer.key_range.start <= layer_desc.key_range.end + && layer_desc.key_range.start <= other_layer.key_range.end + { + coverage.add_range(other_layer.get_key_range()); + covered_by.push((*other_layer).clone()); + } + } + let coverage = coverage.to_keyspace(); + + let expect_visible = if coverage.ranges.len() == 1 + && coverage.contains(&layer_desc.key_range.start) + && coverage.contains(&Key::from_i128(layer_desc.key_range.end.to_i128() - 1)) + { + LayerVisibilityHint::Covered + } else { + LayerVisibilityHint::Visible + }; + + if expect_visible != *visible { + eprintln!( + "Layer {}..{} @ {}..{} (delta={}) is {visible:?}, should be {expect_visible:?}", + layer_desc.key_range.start, + layer_desc.key_range.end, + layer_desc.lsn_range.start, + layer_desc.lsn_range.end, + layer_desc.is_delta() + ); + if expect_visible == LayerVisibilityHint::Covered { + eprintln!("Covered by:"); + for other in covered_by { + eprintln!( + " {}..{} @ {}", + 
other.get_key_range().start, + other.get_key_range().end, + other.image_layer_lsn() + ); + } + if let Some(range) = coverage.ranges.first() { + eprintln!( + "Total coverage from contributing layers: {}..{}", + range.start, range.end + ); + } else { + eprintln!( + "Total coverage from contributing layers: {:?}", + coverage.ranges + ); + } + } + } + assert_eq!(expect_visible, *visible); + } + + // Sanity: the layer that holds latest data for the DBDIR key should always be visible + // (just using this key as a key that will always exist for any layermap fixture) + let dbdir_layer = layer_map + .search(DBDIR_KEY, index.metadata.disk_consistent_lsn()) + .unwrap(); + assert!(matches!( + layer_visibilities.get(&dbdir_layer.layer).unwrap(), + LayerVisibilityHint::Visible + )); + } } diff --git a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs index 347490c1ba..136f68bc36 100644 --- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs +++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs @@ -521,6 +521,10 @@ impl BufferedHistoricLayerCoverage { Ok(&self.historic_coverage) } + + pub(crate) fn len(&self) -> usize { + self.layers.len() + } } #[test] diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 6ba1bdef9b..190316df42 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -111,7 +111,7 @@ impl TryFrom<&TimelineMetadataBodyV2> for TimelineMetadataHeader { #[error("re-serializing for crc32 failed")] struct Crc32CalculationFailed(#[source] utils::bin_ser::SerializeError); -const METADATA_HDR_SIZE: usize = std::mem::size_of::(); +const METADATA_HDR_SIZE: usize = size_of::(); #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] struct TimelineMetadataBodyV2 { @@ -285,12 +285,15 @@ impl TimelineMetadata { } /// When reparenting, the `ancestor_lsn` does not change. 
+ /// + /// Returns true if anything was changed. pub fn reparent(&mut self, timeline: &TimelineId) { assert!(self.body.ancestor_timeline.is_some()); // no assertion for redoing this: it's fine, we may have to repeat this multiple times over self.body.ancestor_timeline = Some(*timeline); } + /// Returns true if anything was changed pub fn detach_from_ancestor(&mut self, branchpoint: &(TimelineId, Lsn)) { if let Some(ancestor) = self.body.ancestor_timeline { assert_eq!(ancestor, branchpoint.0); @@ -562,7 +565,7 @@ mod tests { ); let expected_bytes = vec![ /* TimelineMetadataHeader */ - 4, 37, 101, 34, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2) + 74, 104, 158, 105, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2) /* TimelineMetadataBodyV2 */ 0, 0, 0, 0, 0, 0, 2, 0, // disk_consistent_lsn (8 bytes) 1, 0, 0, 0, 0, 0, 0, 1, 0, // prev_record_lsn (9 bytes) @@ -571,7 +574,7 @@ mod tests { 0, 0, 0, 0, 0, 0, 0, 0, // ancestor_lsn (8 bytes) 0, 0, 0, 0, 0, 0, 0, 0, // latest_gc_cutoff_lsn (8 bytes) 0, 0, 0, 0, 0, 0, 0, 0, // initdb_lsn (8 bytes) - 0, 0, 0, 15, // pg_version (4 bytes) + 0, 0, 0, 16, // pg_version (4 bytes) /* padding bytes */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 4520bb9295..4e6ea0c8f9 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -3,7 +3,6 @@ use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf}; use futures::StreamExt; -use hyper::StatusCode; use itertools::Itertools; use pageserver_api::key::Key; use pageserver_api::models::LocationConfigMode; @@ -14,7 +13,7 @@ use pageserver_api::upcall_api::ReAttachResponseTenant; use rand::{distributions::Alphanumeric, Rng}; use std::borrow::Cow; use std::cmp::Ordering; -use std::collections::{BTreeMap, HashMap}; +use std::collections::{BTreeMap, HashMap, 
HashSet}; use std::ops::Deref; use std::sync::Arc; use std::time::Duration; @@ -27,8 +26,7 @@ use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::*; -use remote_storage::GenericRemoteStorage; -use utils::{completion, crashsafe}; +use utils::{backoff, completion, crashsafe}; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; @@ -38,27 +36,26 @@ use crate::control_plane_client::{ use crate::deletion_queue::DeletionQueueClient; use crate::http::routes::ACTIVE_TENANT_TIMEOUT; use crate::metrics::{TENANT, TENANT_MANAGER as METRICS}; -use crate::task_mgr::{self, TaskKind}; +use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::config::{ AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig, }; -use crate::tenant::delete::DeleteTenantFlow; use crate::tenant::span::debug_assert_current_span_has_tenant_id; use crate::tenant::storage_layer::inmemory_layer; use crate::tenant::timeline::ShutdownMode; -use crate::tenant::{AttachedTenantConf, GcError, SpawnMode, Tenant, TenantState}; -use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX}; +use crate::tenant::{AttachedTenantConf, GcError, LoadConfigError, SpawnMode, Tenant, TenantState}; +use crate::virtual_file::MaybeFatalIo; +use crate::{InitializationOrder, TEMP_FILE_SUFFIX}; use utils::crashsafe::path_with_suffix_extension; use utils::fs_ext::PathExt; use utils::generation::Generation; use utils::id::{TenantId, TimelineId}; -use super::delete::DeleteTenantError; use super::remote_timeline_client::remote_tenant_path; use super::secondary::SecondaryTenant; -use super::timeline::detach_ancestor::PreparedTimelineDetach; -use super::TenantSharedResources; +use super::timeline::detach_ancestor::{self, PreparedTimelineDetach}; +use super::{GlobalShutDown, TenantSharedResources}; /// For a tenant that appears in TenantsMap, it may either be /// - `Attached`: has a full Tenant object, is 
elegible to service @@ -112,12 +109,6 @@ pub(crate) enum TenantsMap { ShuttingDown(BTreeMap), } -pub(crate) enum TenantsMapRemoveResult { - Occupied(TenantSlot), - Vacant, - InProgress(utils::completion::Barrier), -} - /// When resolving a TenantId to a shard, we may be looking for the 0th /// shard, or we might be looking for whichever shard holds a particular page. #[derive(Copy, Clone)] @@ -125,8 +116,6 @@ pub(crate) enum ShardSelector { /// Only return the 0th shard, if it is present. If a non-0th shard is present, /// ignore it. Zero, - /// Pick the first shard we find for the TenantId - First, /// Pick the shard that holds this key Page(Key), /// The shard ID is known: pick the given shard @@ -194,26 +183,6 @@ impl TenantsMap { } } - /// Only for use from DeleteTenantFlow. This method directly removes a TenantSlot from the map. - /// - /// The normal way to remove a tenant is using a SlotGuard, which will gracefully remove the guarded - /// slot if the enclosed tenant is shutdown. - pub(crate) fn remove(&mut self, tenant_shard_id: TenantShardId) -> TenantsMapRemoveResult { - use std::collections::btree_map::Entry; - match self { - TenantsMap::Initializing => TenantsMapRemoveResult::Vacant, - TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => match m.entry(tenant_shard_id) { - Entry::Occupied(entry) => match entry.get() { - TenantSlot::InProgress(barrier) => { - TenantsMapRemoveResult::InProgress(barrier.clone()) - } - _ => TenantsMapRemoveResult::Occupied(entry.remove()), - }, - Entry::Vacant(_entry) => TenantsMapRemoveResult::Vacant, - }, - } - } - #[cfg(all(debug_assertions, not(test)))] pub(crate) fn len(&self) -> usize { match self { @@ -254,26 +223,60 @@ async fn safe_rename_tenant_dir(path: impl AsRef) -> std::io::Result> = @@ -299,10 +302,12 @@ pub struct TenantManager { // tenants have their own cancellation tokens, which we fire individually in [`Self::shutdown`], or // when the tenant detaches. 
cancel: CancellationToken, + + background_purges: BackgroundPurges, } fn emergency_generations( - tenant_confs: &HashMap>, + tenant_confs: &HashMap>, ) -> HashMap { tenant_confs .iter() @@ -326,7 +331,7 @@ fn emergency_generations( async fn init_load_generations( conf: &'static PageServerConf, - tenant_confs: &HashMap>, + tenant_confs: &HashMap>, resources: &TenantSharedResources, cancel: &CancellationToken, ) -> anyhow::Result>> { @@ -376,62 +381,32 @@ async fn init_load_generations( /// Given a directory discovered in the pageserver's tenants/ directory, attempt /// to load a tenant config from it. /// -/// If file is missing, return Ok(None) +/// If we cleaned up something expected (like an empty dir or a temp dir), return None. fn load_tenant_config( conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, dentry: Utf8DirEntry, -) -> anyhow::Result)>> { +) -> Option> { let tenant_dir_path = dentry.path().to_path_buf(); if crate::is_temporary(&tenant_dir_path) { info!("Found temporary tenant directory, removing: {tenant_dir_path}"); // No need to use safe_remove_tenant_dir_all because this is already // a temporary path - if let Err(e) = std::fs::remove_dir_all(&tenant_dir_path) { - error!( - "Failed to remove temporary directory '{}': {:?}", - tenant_dir_path, e - ); - } - return Ok(None); + std::fs::remove_dir_all(&tenant_dir_path).fatal_err("delete temporary tenant dir"); + return None; } // This case happens if we crash during attachment before writing a config into the dir let is_empty = tenant_dir_path .is_empty_dir() - .with_context(|| format!("Failed to check whether {tenant_dir_path:?} is an empty dir"))?; + .fatal_err("Checking for empty tenant dir"); if is_empty { info!("removing empty tenant directory {tenant_dir_path:?}"); - if let Err(e) = std::fs::remove_dir(&tenant_dir_path) { - error!( - "Failed to remove empty tenant directory '{}': {e:#}", - tenant_dir_path - ) - } - return Ok(None); + 
std::fs::remove_dir(&tenant_dir_path).fatal_err("delete empty tenant dir"); + return None; } - let tenant_shard_id = match tenant_dir_path - .file_name() - .unwrap_or_default() - .parse::() - { - Ok(id) => id, - Err(_) => { - warn!("Invalid tenant path (garbage in our repo directory?): {tenant_dir_path}",); - return Ok(None); - } - }; - - let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME); - if tenant_ignore_mark_file.exists() { - info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant"); - return Ok(None); - } - - Ok(Some(( - tenant_shard_id, - Tenant::load_tenant_config(conf, &tenant_shard_id), - ))) + Some(Tenant::load_tenant_config(conf, &tenant_shard_id)) } /// Initial stage of load: walk the local tenants directory, clean up any temp files, @@ -441,32 +416,63 @@ fn load_tenant_config( /// seconds even on reasonably fast drives. async fn init_load_tenant_configs( conf: &'static PageServerConf, -) -> anyhow::Result>> { +) -> HashMap> { let tenants_dir = conf.tenants_path(); - let dentries = tokio::task::spawn_blocking(move || -> anyhow::Result> { - let dir_entries = tenants_dir - .read_dir_utf8() - .with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?; + let dentries = tokio::task::spawn_blocking(move || -> Vec { + let context = format!("read tenants dir {tenants_dir}"); + let dir_entries = tenants_dir.read_dir_utf8().fatal_err(&context); - Ok(dir_entries.collect::, std::io::Error>>()?) 
+ dir_entries + .collect::, std::io::Error>>() + .fatal_err(&context) }) - .await??; + .await + .expect("Config load task panicked"); let mut configs = HashMap::new(); let mut join_set = JoinSet::new(); for dentry in dentries { - join_set.spawn_blocking(move || load_tenant_config(conf, dentry)); + let tenant_shard_id = match dentry.file_name().parse::() { + Ok(id) => id, + Err(_) => { + warn!( + "Invalid tenant path (garbage in our repo directory?): '{}'", + dentry.file_name() + ); + continue; + } + }; + + join_set.spawn_blocking(move || { + ( + tenant_shard_id, + load_tenant_config(conf, tenant_shard_id, dentry), + ) + }); } while let Some(r) = join_set.join_next().await { - if let Some((tenant_id, tenant_config)) = r?? { - configs.insert(tenant_id, tenant_config); + let (tenant_shard_id, tenant_config) = r.expect("Panic in config load task"); + if let Some(tenant_config) = tenant_config { + configs.insert(tenant_shard_id, tenant_config); } } - Ok(configs) + configs +} + +#[derive(Debug, thiserror::Error)] +pub(crate) enum DeleteTenantError { + #[error("Tenant map slot error {0}")] + SlotError(#[from] TenantSlotError), + + #[error("Cancelled")] + Cancelled, + + #[error(transparent)] + Other(#[from] anyhow::Error), } /// Initialize repositories with locally available timelines. 
@@ -475,6 +481,7 @@ async fn init_load_tenant_configs( #[instrument(skip_all)] pub async fn init_tenant_mgr( conf: &'static PageServerConf, + background_purges: BackgroundPurges, resources: TenantSharedResources, init_order: InitializationOrder, cancel: CancellationToken, @@ -496,7 +503,7 @@ pub async fn init_tenant_mgr( ); // Scan local filesystem for attached tenants - let tenant_configs = init_load_tenant_configs(conf).await?; + let tenant_configs = init_load_tenant_configs(conf).await; // Determine which tenants are to be secondary or attached, and in which generation let tenant_modes = init_load_generations(conf, &tenant_configs, &resources, &cancel).await?; @@ -519,17 +526,8 @@ pub async fn init_tenant_mgr( let mut location_conf = match location_conf { Ok(l) => l, Err(e) => { - warn!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Marking tenant broken, failed to {e:#}"); - - tenants.insert( - tenant_shard_id, - TenantSlot::Attached(Tenant::create_broken_tenant( - conf, - tenant_shard_id, - resources.remote_storage.clone(), - format!("{}", e), - )), - ); + // This should only happen in the case of a serialization bug or critical local I/O error: we cannot load this tenant + error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to load tenant config, failed to {e:#}"); continue; } }; @@ -549,7 +547,7 @@ pub async fn init_tenant_mgr( match safe_rename_tenant_dir(&tenant_dir_path).await { Ok(tmp_path) => { - spawn_background_purge(tmp_path); + background_purges.spawn(tmp_path); } Err(e) => { error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), @@ -623,14 +621,14 @@ pub async fn init_tenant_mgr( ); // For those shards that have live configurations, construct `Tenant` or `SecondaryTenant` objects and start them running for (tenant_shard_id, location_conf, config_write_result) in config_write_results { - // Errors writing configs are fatal - config_write_result?; + 
// Writing a config to local disk is foundational to startup up tenants: panic if we can't. + config_write_result.fatal_err("write tenant shard config file"); let tenant_dir_path = conf.tenant_path(&tenant_shard_id); let shard_identity = location_conf.shard; let slot = match location_conf.mode { - LocationMode::Attached(attached_conf) => { - match tenant_spawn( + LocationMode::Attached(attached_conf) => TenantSlot::Attached( + tenant_spawn( conf, tenant_shard_id, &tenant_dir_path, @@ -638,17 +636,11 @@ pub async fn init_tenant_mgr( AttachedTenantConf::new(location_conf.tenant_conf, attached_conf), shard_identity, Some(init_order.clone()), - &TENANTS, SpawnMode::Lazy, &ctx, - ) { - Ok(tenant) => TenantSlot::Attached(tenant), - Err(e) => { - error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}"); - continue; - } - } - } + ) + .expect("global shutdown during init_tenant_mgr cannot happen"), + ), LocationMode::Secondary(secondary_conf) => { info!( tenant_id = %tenant_shard_id.tenant_id, @@ -680,11 +672,11 @@ pub async fn init_tenant_mgr( tenants: &TENANTS, resources, cancel: CancellationToken::new(), + background_purges, }) } -/// Wrapper for Tenant::spawn that checks invariants before running, and inserts -/// a broken tenant in the map if Tenant::spawn fails. 
+/// Wrapper for Tenant::spawn that checks invariants before running #[allow(clippy::too_many_arguments)] fn tenant_spawn( conf: &'static PageServerConf, @@ -694,51 +686,29 @@ fn tenant_spawn( location_conf: AttachedTenantConf, shard_identity: ShardIdentity, init_order: Option, - tenants: &'static std::sync::RwLock, mode: SpawnMode, ctx: &RequestContext, -) -> anyhow::Result> { - anyhow::ensure!( - tenant_path.is_dir(), - "Cannot load tenant from path {tenant_path:?}, it either does not exist or not a directory" - ); - anyhow::ensure!( - !crate::is_temporary(tenant_path), - "Cannot load tenant from temporary path {tenant_path:?}" - ); - anyhow::ensure!( - !tenant_path.is_empty_dir().with_context(|| { - format!("Failed to check whether {tenant_path:?} is an empty dir") - })?, - "Cannot load tenant from empty directory {tenant_path:?}" - ); +) -> Result, GlobalShutDown> { + // All these conditions should have been satisfied by our caller: the tenant dir exists, is a well formed + // path, and contains a configuration file. Assertions that do synchronous I/O are limited to debug mode + // to avoid impacting prod runtime performance. 
+ assert!(!crate::is_temporary(tenant_path)); + debug_assert!(tenant_path.is_dir()); + debug_assert!(conf + .tenant_location_config_path(&tenant_shard_id) + .try_exists() + .unwrap()); - let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id); - anyhow::ensure!( - !conf.tenant_ignore_mark_file_path(&tenant_shard_id).exists(), - "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}" - ); - - let remote_storage = resources.remote_storage.clone(); - let tenant = match Tenant::spawn( + Tenant::spawn( conf, tenant_shard_id, resources, location_conf, shard_identity, init_order, - tenants, mode, ctx, - ) { - Ok(tenant) => tenant, - Err(e) => { - error!("Failed to spawn tenant {tenant_shard_id}, reason: {e:#}"); - Tenant::create_broken_tenant(conf, tenant_shard_id, remote_storage, format!("{e:#}")) - } - }; - - Ok(tenant) + ) } async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { @@ -889,8 +859,9 @@ pub(crate) enum UpsertLocationError { #[error("Failed to flush: {0}")] Flush(anyhow::Error), + /// This error variant is for unexpected situations (soft assertions) where the system is in an unexpected state. #[error("Internal error: {0}")] - Other(#[from] anyhow::Error), + InternalError(anyhow::Error), } impl TenantManager { @@ -1020,7 +991,8 @@ impl TenantManager { match fast_path_taken { Some(FastPathModified::Attached(tenant)) => { Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) - .await?; + .await + .fatal_err("write tenant shard config"); // Transition to AttachedStale means we may well hold a valid generation // still, and have been requested to go stale as part of a migration. 
If @@ -1050,7 +1022,8 @@ impl TenantManager { } Some(FastPathModified::Secondary(_secondary_tenant)) => { Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) - .await?; + .await + .fatal_err("write tenant shard config"); return Ok(None); } @@ -1067,7 +1040,7 @@ impl TenantManager { // not do significant I/O, and shutdowns should be prompt via cancellation tokens. let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any) .map_err(|e| match e { - TenantSlotError::AlreadyExists(_, _) | TenantSlotError::NotFound(_) => { + TenantSlotError::NotFound(_) => { unreachable!("Called with mode Any") } TenantSlotError::InProgress => UpsertLocationError::InProgress, @@ -1116,7 +1089,7 @@ impl TenantManager { Some(TenantSlot::InProgress(_)) => { // This should never happen: acquire_slot should error out // if the contents of a slot were InProgress. - return Err(UpsertLocationError::Other(anyhow::anyhow!( + return Err(UpsertLocationError::InternalError(anyhow::anyhow!( "Acquired an InProgress slot, this is a bug." ))); } @@ -1135,12 +1108,14 @@ impl TenantManager { // Does not need to be fsync'd because local storage is just a cache. tokio::fs::create_dir_all(&timelines_path) .await - .with_context(|| format!("Creating {timelines_path}"))?; + .fatal_err("create timelines/ dir"); // Before activating either secondary or attached mode, persist the // configuration, so that on restart we will re-attach (or re-start // secondary) on the tenant. - Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config).await?; + Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) + .await + .fatal_err("write tenant shard config"); let new_slot = match &new_location_config.mode { LocationMode::Secondary(secondary_config) => { @@ -1159,13 +1134,15 @@ impl TenantManager { // from upserts. 
This enables creating generation-less tenants even though neon_local // always uses generations when calling the location conf API. let attached_conf = if cfg!(feature = "testing") { - let mut conf = AttachedTenantConf::try_from(new_location_config)?; + let mut conf = AttachedTenantConf::try_from(new_location_config) + .map_err(UpsertLocationError::BadRequest)?; if self.conf.control_plane_api.is_none() { conf.location.generation = Generation::none(); } conf } else { - AttachedTenantConf::try_from(new_location_config)? + AttachedTenantConf::try_from(new_location_config) + .map_err(UpsertLocationError::BadRequest)? }; let tenant = tenant_spawn( @@ -1176,10 +1153,12 @@ impl TenantManager { attached_conf, shard_identity, None, - self.tenants, spawn_mode, ctx, - )?; + ) + .map_err(|_: GlobalShutDown| { + UpsertLocationError::Unavailable(TenantMapError::ShuttingDown) + })?; TenantSlot::Attached(tenant) } @@ -1193,7 +1172,7 @@ impl TenantManager { match slot_guard.upsert(new_slot) { Err(TenantSlotUpsertError::InternalError(e)) => { - Err(UpsertLocationError::Other(anyhow::anyhow!(e))) + Err(UpsertLocationError::InternalError(anyhow::anyhow!(e))) } Err(TenantSlotUpsertError::MapState(e)) => Err(UpsertLocationError::Unavailable(e)), Err(TenantSlotUpsertError::ShuttingDown((new_slot, _completion))) => { @@ -1298,7 +1277,6 @@ impl TenantManager { AttachedTenantConf::try_from(config)?, shard_identity, None, - self.tenants, SpawnMode::Eager, ctx, )?; @@ -1367,30 +1345,67 @@ impl TenantManager { } } + async fn delete_tenant_remote( + &self, + tenant_shard_id: TenantShardId, + ) -> Result<(), DeleteTenantError> { + let remote_path = remote_tenant_path(&tenant_shard_id); + let mut keys_stream = self.resources.remote_storage.list_streaming( + Some(&remote_path), + remote_storage::ListingMode::NoDelimiter, + None, + &self.cancel, + ); + while let Some(chunk) = keys_stream.next().await { + let keys = match chunk { + Ok(listing) => listing.keys, + 
Err(remote_storage::DownloadError::Cancelled) => { + return Err(DeleteTenantError::Cancelled) + } + Err(remote_storage::DownloadError::NotFound) => return Ok(()), + Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))), + }; + + if keys.is_empty() { + tracing::info!("Remote storage already deleted"); + } else { + tracing::info!("Deleting {} keys from remote storage", keys.len()); + let keys = keys.into_iter().map(|o| o.key).collect::>(); + self.resources + .remote_storage + .delete_objects(&keys, &self.cancel) + .await?; + } + } + + Ok(()) + } + + /// If a tenant is attached, detach it. Then remove its data from remote storage. + /// + /// A tenant is considered deleted once it is gone from remote storage. It is the caller's + /// responsibility to avoid trying to attach the tenant again or use it any way once deletion + /// has started: this operation is not atomic, and must be retried until it succeeds. pub(crate) async fn delete_tenant( &self, tenant_shard_id: TenantShardId, - activation_timeout: Duration, - ) -> Result { + ) -> Result<(), DeleteTenantError> { super::span::debug_assert_current_span_has_tenant_id(); - // We acquire a SlotGuard during this function to protect against concurrent - // changes while the ::prepare phase of DeleteTenantFlow executes, but then - // have to return the Tenant to the map while the background deletion runs. - // - // TODO: refactor deletion to happen outside the lifetime of a Tenant. - // Currently, deletion requires a reference to the tenants map in order to - // keep the Tenant in the map until deletion is complete, and then remove - // it at the end. - // - // See https://github.com/neondatabase/neon/issues/5080 - // Tenant deletion can happen two ways: - // - Legacy: called on an attached location. The attached Tenant object stays alive in Stopping - // state until deletion is complete. - // - New: called on a pageserver without an attached location. 
We proceed with deletion from - // remote storage. - // - // See https://github.com/neondatabase/neon/issues/5080 for more context on this transition. + async fn delete_local( + conf: &PageServerConf, + background_purges: &BackgroundPurges, + tenant_shard_id: &TenantShardId, + ) -> anyhow::Result<()> { + let local_tenant_directory = conf.tenant_path(tenant_shard_id); + let tmp_dir = safe_rename_tenant_dir(&local_tenant_directory) + .await + .with_context(|| { + format!("local tenant directory {local_tenant_directory:?} rename") + })?; + background_purges.spawn(tmp_dir); + Ok(()) + } let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?; match &slot_guard.old_value { @@ -1398,109 +1413,49 @@ impl TenantManager { // Legacy deletion flow: the tenant remains attached, goes to Stopping state, and // deletion will be resumed across restarts. let tenant = tenant.clone(); - return self - .delete_tenant_attached(slot_guard, tenant, activation_timeout) - .await; + let (_guard, progress) = utils::completion::channel(); + match tenant.shutdown(progress, ShutdownMode::Hard).await { + Ok(()) => {} + Err(barrier) => { + info!("Shutdown already in progress, waiting for it to complete"); + barrier.wait().await; + } + } + delete_local(self.conf, &self.background_purges, &tenant_shard_id).await?; } Some(TenantSlot::Secondary(secondary_tenant)) => { secondary_tenant.shutdown().await; - let local_tenant_directory = self.conf.tenant_path(&tenant_shard_id); - let tmp_dir = safe_rename_tenant_dir(&local_tenant_directory) - .await - .with_context(|| { - format!("local tenant directory {local_tenant_directory:?} rename") - })?; - spawn_background_purge(tmp_dir); + + delete_local(self.conf, &self.background_purges, &tenant_shard_id).await?; } Some(TenantSlot::InProgress(_)) => unreachable!(), None => {} }; - // Fall through: local state for this tenant is no longer present, proceed with remote delete - let remote_path = remote_tenant_path(&tenant_shard_id); 
- let keys = match self - .resources - .remote_storage - .list( - Some(&remote_path), - remote_storage::ListingMode::NoDelimiter, - None, - &self.cancel, - ) - .await - { - Ok(listing) => listing.keys, - Err(remote_storage::DownloadError::Cancelled) => { - return Err(DeleteTenantError::Cancelled) - } - Err(remote_storage::DownloadError::NotFound) => return Ok(StatusCode::NOT_FOUND), - Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))), - }; - - if keys.is_empty() { - tracing::info!("Remote storage already deleted"); - } else { - tracing::info!("Deleting {} keys from remote storage", keys.len()); - self.resources - .remote_storage - .delete_objects(&keys, &self.cancel) - .await?; - } - - // Callers use 404 as success for deletions, for historical reasons. - Ok(StatusCode::NOT_FOUND) - } - - async fn delete_tenant_attached( - &self, - slot_guard: SlotGuard, - tenant: Arc, - activation_timeout: Duration, - ) -> Result { - match tenant.current_state() { - TenantState::Broken { .. } | TenantState::Stopping { .. } => { - // If deletion is already in progress, return success (the semantics of this - // function are to rerturn success afterr deletion is spawned in background). - // Otherwise fall through and let [`DeleteTenantFlow`] handle this state. - if DeleteTenantFlow::is_in_progress(&tenant) { - // The `delete_progress` lock is held: deletion is already happening - // in the bacckground - slot_guard.revert(); - return Ok(StatusCode::ACCEPTED); + // Fall through: local state for this tenant is no longer present, proceed with remote delete. + // - We use a retry wrapper here so that common transient S3 errors (e.g. 503, 429) do not result + // in 500 responses to delete requests. + // - We keep the `SlotGuard` during this I/O, so that if a concurrent delete request comes in, it will + // 503/retry, rather than kicking off a wasteful concurrent deletion. 
+ match backoff::retry( + || async move { self.delete_tenant_remote(tenant_shard_id).await }, + |e| match e { + DeleteTenantError::Cancelled => true, + DeleteTenantError::SlotError(_) => { + unreachable!("Remote deletion doesn't touch slots") } - } - _ => { - tenant - .wait_to_become_active(activation_timeout) - .await - .map_err(|e| match e { - GetActiveTenantError::WillNotBecomeActive(_) - | GetActiveTenantError::Broken(_) => { - DeleteTenantError::InvalidState(tenant.current_state()) - } - GetActiveTenantError::Cancelled => DeleteTenantError::Cancelled, - GetActiveTenantError::NotFound(_) => DeleteTenantError::NotAttached, - GetActiveTenantError::WaitForActiveTimeout { - latest_state: _latest_state, - wait_time: _wait_time, - } => DeleteTenantError::InvalidState(tenant.current_state()), - })?; - } - } - - let result = DeleteTenantFlow::run( - self.conf, - self.resources.remote_storage.clone(), - &TENANTS, - tenant, + _ => false, + }, + 1, + 3, + &format!("delete_tenant[tenant_shard_id={tenant_shard_id}]"), &self.cancel, ) - .await; - - // The Tenant goes back into the map in Stopping state, it will eventually be removed by DeleteTenantFLow - slot_guard.revert(); - let () = result?; - Ok(StatusCode::ACCEPTED) + .await + { + Some(r) => r, + None => Err(DeleteTenantError::Cancelled), + } } #[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))] @@ -1672,7 +1627,7 @@ impl TenantManager { for child_shard_id in &child_shards { let child_shard_id = *child_shard_id; let child_shard = { - let locked = TENANTS.read().unwrap(); + let locked = self.tenants.read().unwrap(); let peek_slot = tenant_map_peek_slot(&locked, &child_shard_id, TenantSlotPeekMode::Read)?; peek_slot.and_then(|s| s.get_attached()).cloned() @@ -1742,7 +1697,7 @@ impl TenantManager { let tmp_path = safe_rename_tenant_dir(&local_tenant_directory) .await .with_context(|| 
format!("local tenant directory {local_tenant_directory:?} rename"))?; - spawn_background_purge(tmp_path); + self.background_purges.spawn(tmp_path); fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!( "failpoint" @@ -1773,14 +1728,10 @@ impl TenantManager { let timelines = parent_shard.timelines.lock().unwrap().clone(); let parent_timelines = timelines.keys().cloned().collect::>(); for timeline in timelines.values() { - let timeline_layers = timeline - .layers - .read() - .await - .likely_resident_layers() - .collect::>(); + tracing::info!(timeline_id=%timeline.timeline_id, "Loading list of layers to hardlink"); + let layers = timeline.layers.read().await; - for layer in timeline_layers { + for layer in layers.likely_resident_layers() { let relative_path = layer .local_path() .strip_prefix(&parent_path) @@ -1812,7 +1763,12 @@ impl TenantManager { // Since we will do a large number of small filesystem metadata operations, batch them into // spawn_blocking calls rather than doing each one as a tokio::fs round-trip. + let span = tracing::Span::current(); let jh = tokio::task::spawn_blocking(move || -> anyhow::Result { + // Run this synchronous code in the same log context as the outer function that spawned it. 
+ let _span = span.enter(); + + tracing::info!("Creating {} directories", create_dirs.len()); for dir in &create_dirs { if let Err(e) = std::fs::create_dir_all(dir) { // Ignore AlreadyExists errors, drop out on all other errors @@ -1826,6 +1782,11 @@ impl TenantManager { } for child_prefix in child_prefixes { + tracing::info!( + "Hard-linking {} parent layers into child path {}", + parent_layers.len(), + child_prefix + ); for relative_layer in &parent_layers { let parent_path = parent_path.join(relative_layer); let child_path = child_prefix.join(relative_layer); @@ -1851,6 +1812,7 @@ impl TenantManager { // Durability is not required for correctness, but if we crashed during split and // then came restarted with empty timeline dirs, it would be very inefficient to // re-populate from remote storage. + tracing::info!("fsyncing {} directories", create_dirs.len()); for dir in create_dirs { if let Err(e) = crashsafe::fsync(&dir) { // Something removed a newly created timeline dir out from underneath us? 
Extremely @@ -1901,19 +1863,12 @@ impl TenantManager { &self, conf: &'static PageServerConf, tenant_shard_id: TenantShardId, - detach_ignored: bool, deletion_queue_client: &DeletionQueueClient, ) -> Result<(), TenantStateError> { let tmp_path = self - .detach_tenant0( - conf, - &TENANTS, - tenant_shard_id, - detach_ignored, - deletion_queue_client, - ) + .detach_tenant0(conf, tenant_shard_id, deletion_queue_client) .await?; - spawn_background_purge(tmp_path); + self.background_purges.spawn(tmp_path); Ok(()) } @@ -1921,9 +1876,7 @@ impl TenantManager { async fn detach_tenant0( &self, conf: &'static PageServerConf, - tenants: &std::sync::RwLock, tenant_shard_id: TenantShardId, - detach_ignored: bool, deletion_queue_client: &DeletionQueueClient, ) -> Result { let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move { @@ -1936,7 +1889,7 @@ impl TenantManager { }; let removal_result = remove_tenant_from_memory( - tenants, + self.tenants, tenant_shard_id, tenant_dir_rename_operation(tenant_shard_id), ) @@ -1946,33 +1899,13 @@ impl TenantManager { // before this tenant is potentially re-attached elsewhere. deletion_queue_client.flush_advisory(); - // Ignored tenants are not present in memory and will bail the removal from memory operation. - // Before returning the error, check for ignored tenant removal case — we only need to clean its local files then. 
- if detach_ignored - && matches!( - removal_result, - Err(TenantStateError::SlotError(TenantSlotError::NotFound(_))) - ) - { - let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id); - if tenant_ignore_mark.exists() { - info!("Detaching an ignored tenant"); - let tmp_path = tenant_dir_rename_operation(tenant_shard_id) - .await - .with_context(|| { - format!("Ignored tenant {tenant_shard_id} local directory rename") - })?; - return Ok(tmp_path); - } - } - removal_result } pub(crate) fn list_tenants( &self, ) -> Result, TenantMapListError> { - let tenants = TENANTS.read().unwrap(); + let tenants = self.tenants.read().unwrap(); let m = match &*tenants { TenantsMap::Initializing => return Err(TenantMapListError::Initializing), TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m, @@ -1994,93 +1927,149 @@ impl TenantManager { tenant_shard_id: TenantShardId, timeline_id: TimelineId, prepared: PreparedTimelineDetach, + mut attempt: detach_ancestor::Attempt, ctx: &RequestContext, - ) -> Result, anyhow::Error> { - struct RevertOnDropSlot(Option); + ) -> Result, detach_ancestor::Error> { + use detach_ancestor::Error; - impl Drop for RevertOnDropSlot { - fn drop(&mut self) { - if let Some(taken) = self.0.take() { - taken.revert(); - } - } - } + let slot_guard = + tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist).map_err( + |e| { + use TenantSlotError::*; - impl RevertOnDropSlot { - fn into_inner(mut self) -> SlotGuard { - self.0.take().unwrap() - } - } - - impl std::ops::Deref for RevertOnDropSlot { - type Target = SlotGuard; - - fn deref(&self) -> &Self::Target { - self.0.as_ref().unwrap() - } - } - - let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?; - let slot_guard = RevertOnDropSlot(Some(slot_guard)); + match e { + MapState(TenantMapError::ShuttingDown) => Error::ShuttingDown, + NotFound(_) | InProgress | MapState(_) => Error::DetachReparent(e.into()), + } + }, + )?; let tenant = { - 
let Some(old_slot) = slot_guard.get_old_value() else { - anyhow::bail!( - "Tenant not found when trying to complete detaching timeline ancestor" - ); - }; + let old_slot = slot_guard + .get_old_value() + .as_ref() + .expect("requested MustExist"); let Some(tenant) = old_slot.get_attached() else { - anyhow::bail!("Tenant is not in attached state"); + return Err(Error::DetachReparent(anyhow::anyhow!( + "Tenant is not in attached state" + ))); }; if !tenant.is_active() { - anyhow::bail!("Tenant is not active"); + return Err(Error::DetachReparent(anyhow::anyhow!( + "Tenant is not active" + ))); } tenant.clone() }; - let timeline = tenant.get_timeline(timeline_id, true)?; + let timeline = tenant + .get_timeline(timeline_id, true) + .map_err(Error::NotFound)?; - let reparented = timeline - .complete_detaching_timeline_ancestor(&tenant, prepared, ctx) + let resp = timeline + .detach_from_ancestor_and_reparent(&tenant, prepared, ctx) .await?; - let mut slot_guard = slot_guard.into_inner(); + let mut slot_guard = slot_guard; - let (_guard, progress) = utils::completion::channel(); - match tenant.shutdown(progress, ShutdownMode::Hard).await { - Ok(()) => { - slot_guard.drop_old_value()?; + let tenant = if resp.reset_tenant_required() { + attempt.before_reset_tenant(); + + let (_guard, progress) = utils::completion::channel(); + match tenant.shutdown(progress, ShutdownMode::Hard).await { + Ok(()) => { + slot_guard.drop_old_value().expect("it was just shutdown"); + } + Err(_barrier) => { + slot_guard.revert(); + // this really should not happen, at all, unless a shutdown without acquiring + // tenant slot was already going? regardless, on restart the attempt tracking + // will reset to retryable. + return Err(Error::ShuttingDown); + } } - Err(_barrier) => { - slot_guard.revert(); - // this really should not happen, at all, unless shutdown was already going? 
- anyhow::bail!("Cannot restart Tenant, already shutting down"); + + let tenant_path = self.conf.tenant_path(&tenant_shard_id); + let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id) + .map_err(|e| Error::DetachReparent(e.into()))?; + + let shard_identity = config.shard; + let tenant = tenant_spawn( + self.conf, + tenant_shard_id, + &tenant_path, + self.resources.clone(), + AttachedTenantConf::try_from(config).map_err(Error::DetachReparent)?, + shard_identity, + None, + SpawnMode::Eager, + ctx, + ) + .map_err(|_| Error::ShuttingDown)?; + + { + let mut g = tenant.ongoing_timeline_detach.lock().unwrap(); + assert!( + g.is_none(), + "there cannot be any new timeline detach ancestor on newly created tenant" + ); + *g = Some((attempt.timeline_id, attempt.new_barrier())); } + + // if we bail out here, we will not allow a new attempt, which should be fine. + // pageserver should be shutting down regardless? tenant_reset would help, unless it + // runs into the same problem. + slot_guard + .upsert(TenantSlot::Attached(tenant.clone())) + .map_err(|e| match e { + TenantSlotUpsertError::ShuttingDown(_) => Error::ShuttingDown, + other => Error::DetachReparent(other.into()), + })?; + tenant + } else { + tracing::info!("skipping tenant_reset as no changes made required it"); + tenant + }; + + if let Some(reparented) = resp.completed() { + // finally ask the restarted tenant to complete the detach + // + // rationale for 9999s: we don't really have a timetable here; if retried, the caller + // will get an 503. + tenant + .wait_to_become_active(std::time::Duration::from_secs(9999)) + .await + .map_err(|e| { + use pageserver_api::models::TenantState; + use GetActiveTenantError::{Cancelled, WillNotBecomeActive}; + match e { + Cancelled | WillNotBecomeActive(TenantState::Stopping { .. 
}) => { + Error::ShuttingDown + } + other => Error::Complete(other.into()), + } + })?; + + utils::pausable_failpoint!( + "timeline-detach-ancestor::after_activating_before_finding-pausable" + ); + + let timeline = tenant + .get_timeline(attempt.timeline_id, true) + .map_err(Error::NotFound)?; + + timeline + .complete_detaching_timeline_ancestor(&tenant, attempt, ctx) + .await + .map(|()| reparented) + } else { + // at least the latest versions have now been downloaded and refreshed; be ready to + // retry another time. + Err(Error::FailedToReparentAll) } - - let tenant_path = self.conf.tenant_path(&tenant_shard_id); - let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?; - - let shard_identity = config.shard; - let tenant = tenant_spawn( - self.conf, - tenant_shard_id, - &tenant_path, - self.resources.clone(), - AttachedTenantConf::try_from(config)?, - shard_identity, - None, - self.tenants, - SpawnMode::Eager, - ctx, - )?; - - slot_guard.upsert(TenantSlot::Attached(tenant))?; - - Ok(reparented) } /// A page service client sends a TenantId, and to look up the correct Tenant we must @@ -2117,7 +2106,6 @@ impl TenantManager { }; match selector { - ShardSelector::First => return ShardResolveResult::Found(tenant.clone()), ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => { return ShardResolveResult::Found(tenant.clone()) } @@ -2153,6 +2141,57 @@ impl TenantManager { } } } + + /// Calculate the tenant shards' contributions to this pageserver's utilization metrics. The + /// returned values are: + /// - the number of bytes of local disk space this pageserver's shards are requesting, i.e. + /// how much space they would use if not impacted by disk usage eviction. + /// - the number of tenant shards currently on this pageserver, including attached + /// and secondary. + /// + /// This function is quite expensive: callers are expected to cache the result and + /// limit how often they call it. 
+ pub(crate) fn calculate_utilization(&self) -> Result<(u64, u32), TenantMapListError> { + let tenants = self.tenants.read().unwrap(); + let m = match &*tenants { + TenantsMap::Initializing => return Err(TenantMapListError::Initializing), + TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m, + }; + let shard_count = m.len(); + let mut wanted_bytes = 0; + + for tenant_slot in m.values() { + match tenant_slot { + TenantSlot::InProgress(_barrier) => { + // While a slot is being changed, we can't know how much storage it wants. This + // means this function's output can fluctuate if a lot of changes are going on + // (such as transitions from secondary to attached). + // + // We could wait for the barrier and retry, but it's important that the utilization + // API is responsive, and the data quality impact is not very significant. + continue; + } + TenantSlot::Attached(tenant) => { + wanted_bytes += tenant.local_storage_wanted(); + } + TenantSlot::Secondary(secondary) => { + let progress = secondary.progress.lock().unwrap(); + wanted_bytes += if progress.heatmap_mtime.is_some() { + // If we have heatmap info, then we will 'want' the sum + // of the size of layers in the heatmap: this is how much space + // we would use if not doing any eviction. + progress.bytes_total + } else { + // In the absence of heatmap info, assume that the secondary location simply + // needs as much space as it is currently using. + secondary.resident_size_metric.get() + } + } + } + } + + Ok((wanted_bytes, shard_count as u32)) + } } #[derive(Debug, thiserror::Error)] @@ -2199,6 +2238,9 @@ pub(crate) enum GetActiveTenantError { /// never happen. 
#[error("Tenant is broken: {0}")] Broken(String), + + #[error("reconnect to switch tenant id")] + SwitchedTenant, } #[derive(Debug, thiserror::Error)] @@ -2222,97 +2264,6 @@ pub(crate) enum TenantStateError { Other(#[from] anyhow::Error), } -pub(crate) async fn load_tenant( - conf: &'static PageServerConf, - tenant_id: TenantId, - generation: Generation, - broker_client: storage_broker::BrokerClientChannel, - remote_storage: GenericRemoteStorage, - deletion_queue_client: DeletionQueueClient, - ctx: &RequestContext, -) -> Result<(), TenantMapInsertError> { - // This is a legacy API (replaced by `/location_conf`). It does not support sharding - let tenant_shard_id = TenantShardId::unsharded(tenant_id); - - let slot_guard = - tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?; - let tenant_path = conf.tenant_path(&tenant_shard_id); - - let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id); - if tenant_ignore_mark.exists() { - std::fs::remove_file(&tenant_ignore_mark).with_context(|| { - format!( - "Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading" - ) - })?; - } - - let resources = TenantSharedResources { - broker_client, - remote_storage, - deletion_queue_client, - }; - - let mut location_conf = - Tenant::load_tenant_config(conf, &tenant_shard_id).map_err(TenantMapInsertError::Other)?; - location_conf.attach_in_generation(AttachmentMode::Single, generation); - - Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?; - - let shard_identity = location_conf.shard; - let new_tenant = tenant_spawn( - conf, - tenant_shard_id, - &tenant_path, - resources, - AttachedTenantConf::try_from(location_conf)?, - shard_identity, - None, - &TENANTS, - SpawnMode::Eager, - ctx, - ) - .with_context(|| format!("Failed to schedule tenant processing in path {tenant_path:?}"))?; - - slot_guard.upsert(TenantSlot::Attached(new_tenant))?; - Ok(()) -} - -pub(crate) async fn ignore_tenant( 
- conf: &'static PageServerConf, - tenant_id: TenantId, -) -> Result<(), TenantStateError> { - ignore_tenant0(conf, &TENANTS, tenant_id).await -} - -#[instrument(skip_all, fields(shard_id))] -async fn ignore_tenant0( - conf: &'static PageServerConf, - tenants: &std::sync::RwLock, - tenant_id: TenantId, -) -> Result<(), TenantStateError> { - // This is a legacy API (replaced by `/location_conf`). It does not support sharding - let tenant_shard_id = TenantShardId::unsharded(tenant_id); - tracing::Span::current().record( - "shard_id", - tracing::field::display(tenant_shard_id.shard_slug()), - ); - - remove_tenant_from_memory(tenants, tenant_shard_id, async { - let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_shard_id); - fs::File::create(&ignore_mark_file) - .await - .context("Failed to create ignore mark file") - .and_then(|_| { - crashsafe::fsync_file_and_parent(&ignore_mark_file) - .context("Failed to fsync ignore mark file") - }) - .with_context(|| format!("Failed to crate ignore mark for tenant {tenant_shard_id}"))?; - Ok(()) - }) - .await -} - #[derive(Debug, thiserror::Error)] pub(crate) enum TenantMapListError { #[error("tenant map is still initiailizing")] @@ -2337,10 +2288,6 @@ pub(crate) enum TenantSlotError { #[error("Tenant {0} not found")] NotFound(TenantShardId), - /// When acquiring a slot with the expectation that the tenant does not already exist. - #[error("tenant {0} already exists, state: {1:?}")] - AlreadyExists(TenantShardId, TenantState), - // Tried to read a slot that is currently being mutated by another administrative // operation. #[error("tenant has a state change in progress, try again later")] @@ -2444,6 +2391,9 @@ impl SlotGuard { /// Get any value that was present in the slot before we acquired ownership /// of it: in state transitions, this will be the old state. 
+ /// + // FIXME: get_ prefix + // FIXME: this should be .as_ref() -- unsure why no clippy fn get_old_value(&self) -> &Option { &self.old_value } @@ -2656,8 +2606,6 @@ enum TenantSlotAcquireMode { Any, /// Return an error if trying to acquire a slot and it doesn't already exist MustExist, - /// Return an error if trying to acquire a slot and it already exists - MustNotExist, } fn tenant_map_acquire_slot( @@ -2711,27 +2659,6 @@ fn tenant_map_acquire_slot_impl( tracing::debug!("Occupied, failing for InProgress"); Err(TenantSlotError::InProgress) } - (slot, MustNotExist) => match slot { - TenantSlot::Attached(tenant) => { - tracing::debug!("Attached && MustNotExist, return AlreadyExists"); - Err(TenantSlotError::AlreadyExists( - *tenant_shard_id, - tenant.current_state(), - )) - } - _ => { - // FIXME: the AlreadyExists error assumes that we have a Tenant - // to get the state from - tracing::debug!("Occupied & MustNotExist, return AlreadyExists"); - Err(TenantSlotError::AlreadyExists( - *tenant_shard_id, - TenantState::Broken { - reason: "Present but not attached".to_string(), - backtrace: "".to_string(), - }, - )) - } - }, _ => { // Happy case: the slot was not in any state that violated our mode let (completion, barrier) = utils::completion::channel(); @@ -2921,7 +2848,9 @@ mod tests { // Test that if an InProgress tenant is in the map during shutdown, the shutdown will gracefully // wait for it to complete before proceeding. 
- let h = TenantHarness::create("shutdown_awaits_in_progress_tenant").unwrap(); + let h = TenantHarness::create("shutdown_awaits_in_progress_tenant") + .await + .unwrap(); let (t, _ctx) = h.load().await; // harness loads it to active, which is forced and nothing is running on the tenant diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index e33e4b84aa..71b766e4c7 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -187,7 +187,7 @@ use camino::Utf8Path; use chrono::{NaiveDateTime, Utc}; pub(crate) use download::download_initdb_tar_zst; -use pageserver_api::models::AuxFilePolicy; +use pageserver_api::models::{AuxFilePolicy, TimelineArchivalState}; use pageserver_api::shard::{ShardIndex, TenantShardId}; use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; @@ -241,7 +241,7 @@ use self::index::IndexPart; use super::metadata::MetadataUpdate; use super::storage_layer::{Layer, LayerName, ResidentLayer}; -use super::upload_queue::SetDeletedFlagProgress; +use super::upload_queue::{NotInitialized, SetDeletedFlagProgress}; use super::Generation; pub(crate) use download::{ @@ -287,6 +287,14 @@ pub enum PersistIndexPartWithDeletedFlagError { Other(#[from] anyhow::Error), } +#[derive(Debug, thiserror::Error)] +pub enum WaitCompletionError { + #[error(transparent)] + NotInitialized(NotInitialized), + #[error("wait_completion aborted because upload queue was stopped")] + UploadQueueShutDownOrStopped, +} + /// A client for accessing a timeline's data in remote storage. /// /// This takes care of managing the number of connections, and balancing them @@ -449,6 +457,17 @@ impl RemoteTimelineClient { .unwrap_or(false) } + /// Returns whether the timeline is archived. + /// Return None if the remote index_part hasn't been downloaded yet. 
+ pub(crate) fn is_archived(&self) -> Option { + self.upload_queue + .lock() + .unwrap() + .initialized_mut() + .map(|q| q.clean.0.archived_at.is_some()) + .ok() + } + fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) { let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part { current_remote_index_part @@ -519,7 +538,7 @@ impl RemoteTimelineClient { local_path: &Utf8Path, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { let downloaded_size = { let _unfinished_gauge_guard = self.metrics.call_begin( &RemoteOpFileKind::Layer, @@ -609,7 +628,7 @@ impl RemoteTimelineClient { Ok(()) } - /// Launch an index-file upload operation in the background, with only aux_file_policy flag updated. + /// Launch an index-file upload operation in the background, with only the `aux_file_policy` flag updated. pub(crate) fn schedule_index_upload_for_aux_file_policy_update( self: &Arc, last_aux_file_policy: Option, @@ -620,6 +639,48 @@ impl RemoteTimelineClient { self.schedule_index_upload(upload_queue)?; Ok(()) } + + /// Launch an index-file upload operation in the background, with only the `archived_at` field updated. + /// + /// Returns whether it is required to wait for the queue to be empty to ensure that the change is uploaded, + /// so either if the change is already sitting in the queue, but not commited yet, or the change has not + /// been in the queue yet. + pub(crate) fn schedule_index_upload_for_timeline_archival_state( + self: &Arc, + state: TimelineArchivalState, + ) -> anyhow::Result { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + + /// Returns Some(_) if a change is needed, and Some(true) if it's a + /// change needed to set archived_at. 
+ fn need_change( + archived_at: &Option, + state: TimelineArchivalState, + ) -> Option { + match (archived_at, state) { + (Some(_), TimelineArchivalState::Archived) + | (None, TimelineArchivalState::Unarchived) => { + // Nothing to do + tracing::info!("intended state matches present state"); + None + } + (None, TimelineArchivalState::Archived) => Some(true), + (Some(_), TimelineArchivalState::Unarchived) => Some(false), + } + } + let need_upload_scheduled = need_change(&upload_queue.dirty.archived_at, state); + + if let Some(archived_at_set) = need_upload_scheduled { + let intended_archived_at = archived_at_set.then(|| Utc::now().naive_utc()); + upload_queue.dirty.archived_at = intended_archived_at; + self.schedule_index_upload(upload_queue)?; + } + + let need_wait = need_change(&upload_queue.clean.0.archived_at, state).is_some(); + Ok(need_wait) + } + /// /// Launch an index-file upload operation in the background, if necessary. /// @@ -630,7 +691,7 @@ impl RemoteTimelineClient { /// /// Like schedule_index_upload_for_metadata_update(), this merely adds /// the upload to the upload queue and returns quickly. 
- pub fn schedule_index_upload_for_file_changes(self: &Arc) -> anyhow::Result<()> { + pub fn schedule_index_upload_for_file_changes(self: &Arc) -> Result<(), NotInitialized> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -645,7 +706,7 @@ impl RemoteTimelineClient { fn schedule_index_upload( self: &Arc, upload_queue: &mut UploadQueueInitialized, - ) -> anyhow::Result<()> { + ) -> Result<(), NotInitialized> { let disk_consistent_lsn = upload_queue.dirty.metadata.disk_consistent_lsn(); // fix up the duplicated field upload_queue.dirty.disk_consistent_lsn = disk_consistent_lsn; @@ -653,7 +714,7 @@ impl RemoteTimelineClient { // make sure it serializes before doing it in perform_upload_task so that it doesn't // look like a retryable error let void = std::io::sink(); - serde_json::to_writer(void, &upload_queue.dirty).context("serialize index_part.json")?; + serde_json::to_writer(void, &upload_queue.dirty).expect("serialize index_part.json"); let index_part = &upload_queue.dirty; @@ -675,12 +736,13 @@ impl RemoteTimelineClient { Ok(()) } + /// Reparent this timeline to a new parent. + /// + /// A retryable step of timeline ancestor detach. 
pub(crate) async fn schedule_reparenting_and_wait( self: &Arc, new_parent: &TimelineId, ) -> anyhow::Result<()> { - // FIXME: because of how Timeline::schedule_uploads works when called from layer flushing - // and reads the in-memory part we cannot do the detaching like this let receiver = { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -691,15 +753,25 @@ impl RemoteTimelineClient { )); }; - upload_queue.dirty.metadata.reparent(new_parent); - upload_queue.dirty.lineage.record_previous_ancestor(&prev); + let uploaded = &upload_queue.clean.0.metadata; - self.schedule_index_upload(upload_queue)?; + if uploaded.ancestor_timeline().is_none() && !uploaded.ancestor_lsn().is_valid() { + // nothing to do + None + } else { + upload_queue.dirty.metadata.reparent(new_parent); + upload_queue.dirty.lineage.record_previous_ancestor(&prev); - self.schedule_barrier0(upload_queue) + self.schedule_index_upload(upload_queue)?; + + Some(self.schedule_barrier0(upload_queue)) + } }; - Self::wait_completion0(receiver).await + if let Some(receiver) = receiver { + Self::wait_completion0(receiver).await?; + } + Ok(()) } /// Schedules uploading a new version of `index_part.json` with the given layers added, @@ -715,24 +787,142 @@ impl RemoteTimelineClient { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; - upload_queue.dirty.metadata.detach_from_ancestor(&adopted); - upload_queue.dirty.lineage.record_detaching(&adopted); + if upload_queue.clean.0.lineage.detached_previous_ancestor() == Some(adopted) { + None + } else { + upload_queue.dirty.metadata.detach_from_ancestor(&adopted); + upload_queue.dirty.lineage.record_detaching(&adopted); - for layer in layers { - upload_queue - .dirty - .layer_metadata - .insert(layer.layer_desc().layer_name(), layer.metadata()); + for layer in layers { + let prev = upload_queue + .dirty + .layer_metadata + .insert(layer.layer_desc().layer_name(), 
layer.metadata()); + assert!(prev.is_none(), "copied layer existed already {layer}"); + } + + self.schedule_index_upload(upload_queue)?; + + Some(self.schedule_barrier0(upload_queue)) } - - self.schedule_index_upload(upload_queue)?; - - let barrier = self.schedule_barrier0(upload_queue); - self.launch_queued_tasks(upload_queue); - barrier }; - Self::wait_completion0(barrier).await + if let Some(barrier) = barrier { + Self::wait_completion0(barrier).await?; + } + Ok(()) + } + + /// Adds a gc blocking reason for this timeline if one does not exist already. + /// + /// A retryable step of timeline detach ancestor. + /// + /// Returns a future which waits until the completion of the upload. + pub(crate) fn schedule_insert_gc_block_reason( + self: &Arc, + reason: index::GcBlockingReason, + ) -> Result>, NotInitialized> + { + let maybe_barrier = { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + + if let index::GcBlockingReason::DetachAncestor = reason { + if upload_queue.dirty.metadata.ancestor_timeline().is_none() { + drop(guard); + panic!("cannot start detach ancestor if there is nothing to detach from"); + } + } + + let wanted = |x: Option<&index::GcBlocking>| x.is_some_and(|x| x.blocked_by(reason)); + + let current = upload_queue.dirty.gc_blocking.as_ref(); + let uploaded = upload_queue.clean.0.gc_blocking.as_ref(); + + match (current, uploaded) { + (x, y) if wanted(x) && wanted(y) => None, + (x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)), + // Usual case: !wanted(x) && !wanted(y) + // + // Unusual: !wanted(x) && wanted(y) which means we have two processes waiting to + // turn on and off some reason. 
+ (x, y) => { + if !wanted(x) && wanted(y) { + // this could be avoided by having external in-memory synchronization, like + // timeline detach ancestor + warn!(?reason, op="insert", "unexpected: two racing processes to enable and disable a gc blocking reason"); + } + + // at this point, the metadata must always show that there is a parent + upload_queue.dirty.gc_blocking = current + .map(|x| x.with_reason(reason)) + .or_else(|| Some(index::GcBlocking::started_now_for(reason))); + self.schedule_index_upload(upload_queue)?; + Some(self.schedule_barrier0(upload_queue)) + } + } + }; + + Ok(async move { + if let Some(barrier) = maybe_barrier { + Self::wait_completion0(barrier).await?; + } + Ok(()) + }) + } + + /// Removes a gc blocking reason for this timeline if one exists. + /// + /// A retryable step of timeline detach ancestor. + /// + /// Returns a future which waits until the completion of the upload. + pub(crate) fn schedule_remove_gc_block_reason( + self: &Arc, + reason: index::GcBlockingReason, + ) -> Result>, NotInitialized> + { + let maybe_barrier = { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + + if let index::GcBlockingReason::DetachAncestor = reason { + if !upload_queue.clean.0.lineage.is_detached_from_ancestor() { + drop(guard); + panic!("cannot complete timeline_ancestor_detach while not detached"); + } + } + + let wanted = |x: Option<&index::GcBlocking>| { + x.is_none() || x.is_some_and(|b| !b.blocked_by(reason)) + }; + + let current = upload_queue.dirty.gc_blocking.as_ref(); + let uploaded = upload_queue.clean.0.gc_blocking.as_ref(); + + match (current, uploaded) { + (x, y) if wanted(x) && wanted(y) => None, + (x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)), + (x, y) => { + if !wanted(x) && wanted(y) { + warn!(?reason, op="remove", "unexpected: two racing processes to enable and disable a gc blocking reason (remove)"); + } + + upload_queue.dirty.gc_blocking = + 
current.as_ref().and_then(|x| x.without_reason(reason)); + assert!(wanted(upload_queue.dirty.gc_blocking.as_ref())); + // FIXME: bogus ? + self.schedule_index_upload(upload_queue)?; + Some(self.schedule_barrier0(upload_queue)) + } + } + }; + + Ok(async move { + if let Some(barrier) = maybe_barrier { + Self::wait_completion0(barrier).await?; + } + Ok(()) + }) } /// Launch an upload operation in the background; the file is added to be included in next @@ -740,7 +930,7 @@ impl RemoteTimelineClient { pub(crate) fn schedule_layer_file_upload( self: &Arc, layer: ResidentLayer, - ) -> anyhow::Result<()> { + ) -> Result<(), NotInitialized> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -803,7 +993,10 @@ impl RemoteTimelineClient { /// /// The files will be leaked in remote storage unless [`Self::schedule_deletion_of_unlinked`] /// is invoked on them. - pub(crate) fn schedule_gc_update(self: &Arc, gc_layers: &[Layer]) -> anyhow::Result<()> { + pub(crate) fn schedule_gc_update( + self: &Arc, + gc_layers: &[Layer], + ) -> Result<(), NotInitialized> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -826,7 +1019,7 @@ impl RemoteTimelineClient { self: &Arc, upload_queue: &mut UploadQueueInitialized, names: I, - ) -> anyhow::Result> + ) -> Result, NotInitialized> where I: IntoIterator, { @@ -952,7 +1145,7 @@ impl RemoteTimelineClient { self: &Arc, compacted_from: &[Layer], compacted_to: &[ResidentLayer], - ) -> anyhow::Result<()> { + ) -> Result<(), NotInitialized> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -969,10 +1162,12 @@ impl RemoteTimelineClient { } /// Wait for all previously scheduled uploads/deletions to complete - pub(crate) async fn wait_completion(self: &Arc) -> anyhow::Result<()> { + pub(crate) async fn wait_completion(self: &Arc) -> Result<(), WaitCompletionError> { let receiver = { let mut guard = 
self.upload_queue.lock().unwrap(); - let upload_queue = guard.initialized_mut()?; + let upload_queue = guard + .initialized_mut() + .map_err(WaitCompletionError::NotInitialized)?; self.schedule_barrier0(upload_queue) }; @@ -981,9 +1176,9 @@ impl RemoteTimelineClient { async fn wait_completion0( mut receiver: tokio::sync::watch::Receiver<()>, - ) -> anyhow::Result<()> { + ) -> Result<(), WaitCompletionError> { if receiver.changed().await.is_err() { - anyhow::bail!("wait_completion aborted because upload queue was stopped"); + return Err(WaitCompletionError::UploadQueueShutDownOrStopped); } Ok(()) @@ -1311,6 +1506,18 @@ impl RemoteTimelineClient { .dirty .layer_metadata .drain() + .filter(|(_file_name, meta)| { + // Filter out layers that belonged to an ancestor shard. Since we are deleting the whole timeline from + // all shards anyway, we _could_ delete these, but + // - it creates a potential race if other shards are still + // using the layers while this shard deletes them. + // - it means that if we rolled back the shard split, the ancestor shards would be in a state where + // these timelines are present but corrupt (their index exists but some layers don't) + // + // These layers will eventually be cleaned up by the scrubber when it does physical GC. 
+ meta.shard.shard_number == self.tenant_shard_id.shard_number + && meta.shard.shard_count == self.tenant_shard_id.shard_count + }) .map(|(file_name, meta)| { remote_layer_path( &self.tenant_shard_id.tenant_id, @@ -1366,12 +1573,13 @@ impl RemoteTimelineClient { // marker via its deleted_at attribute let latest_index = remaining .iter() - .filter(|p| { - p.object_name() + .filter(|o| { + o.key + .object_name() .map(|n| n.starts_with(IndexPart::FILE_NAME)) .unwrap_or(false) }) - .filter_map(|path| parse_remote_index_path(path.clone()).map(|gen| (path, gen))) + .filter_map(|o| parse_remote_index_path(o.key.clone()).map(|gen| (o.key.clone(), gen))) .max_by_key(|i| i.1) .map(|i| i.0.clone()) .unwrap_or( @@ -1382,14 +1590,12 @@ impl RemoteTimelineClient { let remaining_layers: Vec = remaining .into_iter() - .filter(|p| { - if p == &latest_index { - return false; + .filter_map(|o| { + if o.key == latest_index || o.key.object_name() == Some(INITDB_PRESERVED_PATH) { + None + } else { + Some(o.key) } - if p.object_name() == Some(INITDB_PRESERVED_PATH) { - return false; - } - true }) .inspect(|path| { if let Some(name) = path.object_name() { @@ -1522,10 +1728,9 @@ impl RemoteTimelineClient { task_mgr::spawn( &self.runtime, TaskKind::RemoteUploadTask, - Some(self.tenant_shard_id), + self.tenant_shard_id, Some(self.timeline_id), "remote upload", - false, async move { self_rc.perform_upload_task(task).await; Ok(()) @@ -1930,6 +2135,31 @@ impl RemoteTimelineClient { } } } + + /// Returns an accessor which will hold the UploadQueue mutex for accessing the upload queue + /// externally to RemoteTimelineClient. 
+ pub(crate) fn initialized_upload_queue( + &self, + ) -> Result, NotInitialized> { + let mut inner = self.upload_queue.lock().unwrap(); + inner.initialized_mut()?; + Ok(UploadQueueAccessor { inner }) + } +} + +pub(crate) struct UploadQueueAccessor<'a> { + inner: std::sync::MutexGuard<'a, UploadQueue>, +} + +impl<'a> UploadQueueAccessor<'a> { + pub(crate) fn latest_uploaded_index_part(&self) -> &IndexPart { + match &*self.inner { + UploadQueue::Initialized(x) => &x.clean.0, + UploadQueue::Uninitialized | UploadQueue::Stopped(_) => { + unreachable!("checked before constructing") + } + } + } } pub fn remote_tenant_path(tenant_shard_id: &TenantShardId) -> RemotePath { @@ -2103,7 +2333,7 @@ mod tests { impl TestSetup { async fn new(test_name: &str) -> anyhow::Result { let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}"))); - let harness = TenantHarness::create(test_name)?; + let harness = TenantHarness::create(test_name).await?; let (tenant, ctx) = harness.load().await; let timeline = tenant diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index d0385e4aee..d9725ad756 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -23,6 +23,8 @@ use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; use crate::tenant::storage_layer::LayerName; use crate::tenant::Generation; +#[cfg_attr(target_os = "macos", allow(unused_imports))] +use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; use crate::TEMP_FILE_SUFFIX; use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode, RemotePath}; @@ -219,9 +221,7 @@ async fn download_object<'a>( Ok(chunk) => chunk, Err(e) => return Err(e), }; - buffered - 
.write_buffered(tokio_epoll_uring::BoundedBuf::slice_full(chunk), ctx) - .await?; + buffered.write_buffered(chunk.slice_len(), ctx).await?; } let size_tracking = buffered.flush_and_into_inner(ctx).await?; Ok(size_tracking.into_inner()) @@ -295,10 +295,11 @@ where }; } - for key in listing.keys { - let object_name = key + for object in listing.keys { + let object_name = object + .key .object_name() - .ok_or_else(|| anyhow::anyhow!("object name for key {key}"))?; + .ok_or_else(|| anyhow::anyhow!("object name for key {}", object.key))?; other_prefixes.insert(object_name.to_string()); } @@ -459,7 +460,7 @@ pub(crate) async fn download_index_part( // is <= our own. See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md let max_previous_generation = indices .into_iter() - .filter_map(parse_remote_index_path) + .filter_map(|o| parse_remote_index_path(o.key)) .filter(|g| g <= &my_generation) .max(); diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 6233a3477e..757fb9d032 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -32,6 +32,10 @@ pub struct IndexPart { #[serde(skip_serializing_if = "Option::is_none")] pub deleted_at: Option, + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub archived_at: Option, + /// Per layer file name metadata, which can be present for a present or missing layer file. /// /// Older versions of `IndexPart` will not have this property or have only a part of metadata @@ -56,6 +60,9 @@ pub struct IndexPart { #[serde(default)] pub(crate) lineage: Lineage, + #[serde(skip_serializing_if = "Option::is_none", default)] + pub(crate) gc_blocking: Option, + /// Describes the kind of aux files stored in the timeline. 
/// /// The value is modified during file ingestion when the latest wanted value communicated via tenant config is applied if it is acceptable. @@ -80,10 +87,12 @@ impl IndexPart { /// - 5: lineage was added /// - 6: last_aux_file_policy is added. /// - 7: metadata_bytes is no longer written, but still read - const LATEST_VERSION: usize = 7; + /// - 8: added `archived_at` + /// - 9: +gc_blocking + const LATEST_VERSION: usize = 9; // Versions we may see when reading from a bucket. - pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7]; + pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9]; pub const FILE_NAME: &'static str = "index_part.json"; @@ -94,7 +103,9 @@ impl IndexPart { disk_consistent_lsn: metadata.disk_consistent_lsn(), metadata, deleted_at: None, + archived_at: None, lineage: Default::default(), + gc_blocking: None, last_aux_file_policy: None, } } @@ -176,6 +187,24 @@ pub(crate) struct Lineage { /// /// If you are adding support for detaching from a hierarchy, consider changing the ancestry /// into a `Vec<(TimelineId, Lsn)>` to be a path instead. + // FIXME: this is insufficient even for path of two timelines for future wal recovery + // purposes: + // + // assuming a "old main" which has received most of the WAL, and has a branch "new main", + // starting a bit before "old main" last_record_lsn. the current version works fine, + // because we will know to replay wal and branch at the recorded Lsn to do wal recovery. + // + // then assuming "new main" would similarly receive a branch right before its last_record_lsn, + // "new new main". the current implementation would just store ("new main", ancestor_lsn, _) + // here. 
however, we cannot recover from WAL using only that information, we would need the + // whole ancestry here: + // + // ```json + // [ + // ["old main", ancestor_lsn("new main"), _], + // ["new main", ancestor_lsn("new new main"), _] + // ] + // ``` #[serde(skip_serializing_if = "Option::is_none", default)] original_ancestor: Option<(TimelineId, Lsn, NaiveDateTime)>, } @@ -187,26 +216,47 @@ fn is_false(b: &bool) -> bool { impl Lineage { const REMEMBER_AT_MOST: usize = 100; - pub(crate) fn record_previous_ancestor(&mut self, old_ancestor: &TimelineId) { + pub(crate) fn record_previous_ancestor(&mut self, old_ancestor: &TimelineId) -> bool { if self.reparenting_history.last() == Some(old_ancestor) { // do not re-record it - return; - } + false + } else { + #[cfg(feature = "testing")] + { + let existing = self + .reparenting_history + .iter() + .position(|x| x == old_ancestor); + assert_eq!( + existing, None, + "we cannot reparent onto and off and onto the same timeline twice" + ); + } + let drop_oldest = self.reparenting_history.len() + 1 >= Self::REMEMBER_AT_MOST; - let drop_oldest = self.reparenting_history.len() + 1 >= Self::REMEMBER_AT_MOST; - - self.reparenting_history_truncated |= drop_oldest; - if drop_oldest { - self.reparenting_history.remove(0); + self.reparenting_history_truncated |= drop_oldest; + if drop_oldest { + self.reparenting_history.remove(0); + } + self.reparenting_history.push(*old_ancestor); + true } - self.reparenting_history.push(*old_ancestor); } - pub(crate) fn record_detaching(&mut self, branchpoint: &(TimelineId, Lsn)) { - assert!(self.original_ancestor.is_none()); - - self.original_ancestor = - Some((branchpoint.0, branchpoint.1, chrono::Utc::now().naive_utc())); + /// Returns true if anything changed. 
+ pub(crate) fn record_detaching(&mut self, branchpoint: &(TimelineId, Lsn)) -> bool { + if let Some((id, lsn, _)) = self.original_ancestor { + assert_eq!( + &(id, lsn), + branchpoint, + "detaching attempt has to be for the same ancestor we are already detached from" + ); + false + } else { + self.original_ancestor = + Some((branchpoint.0, branchpoint.1, chrono::Utc::now().naive_utc())); + true + } } /// The queried lsn is most likely the basebackup lsn, and this answers question "is it allowed @@ -217,6 +267,78 @@ impl Lineage { self.original_ancestor .is_some_and(|(_, ancestor_lsn, _)| ancestor_lsn == lsn) } + + /// Returns true if the timeline originally had an ancestor, and no longer has one. + pub(crate) fn is_detached_from_ancestor(&self) -> bool { + self.original_ancestor.is_some() + } + + /// Returns original ancestor timeline id and lsn that this timeline has been detached from. + pub(crate) fn detached_previous_ancestor(&self) -> Option<(TimelineId, Lsn)> { + self.original_ancestor.map(|(id, lsn, _)| (id, lsn)) + } + + pub(crate) fn is_reparented(&self) -> bool { + !self.reparenting_history.is_empty() + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub(crate) struct GcBlocking { + pub(crate) started_at: NaiveDateTime, + pub(crate) reasons: enumset::EnumSet, +} + +#[derive(Debug, enumset::EnumSetType, serde::Serialize, serde::Deserialize)] +#[enumset(serialize_repr = "list")] +pub(crate) enum GcBlockingReason { + Manual, + DetachAncestor, +} + +impl GcBlocking { + pub(super) fn started_now_for(reason: GcBlockingReason) -> Self { + GcBlocking { + started_at: chrono::Utc::now().naive_utc(), + reasons: enumset::EnumSet::only(reason), + } + } + + /// Returns true if the given reason is one of the reasons why the gc is blocked. + pub(crate) fn blocked_by(&self, reason: GcBlockingReason) -> bool { + self.reasons.contains(reason) + } + + /// Returns a version of self with the given reason. 
+ pub(super) fn with_reason(&self, reason: GcBlockingReason) -> Self { + assert!(!self.blocked_by(reason)); + let mut reasons = self.reasons; + reasons.insert(reason); + + Self { + started_at: self.started_at, + reasons, + } + } + + /// Returns a version of self without the given reason. Assumption is that if + /// there are no more reasons, we can unblock the gc by returning `None`. + pub(super) fn without_reason(&self, reason: GcBlockingReason) -> Option { + assert!(self.blocked_by(reason)); + + if self.reasons.len() == 1 { + None + } else { + let mut reasons = self.reasons; + assert!(reasons.remove(reason)); + assert!(!reasons.is_empty()); + + Some(Self { + started_at: self.started_at, + reasons, + }) + } + } } #[cfg(test)] @@ -258,7 +380,9 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: None, + archived_at: None, lineage: Lineage::default(), + 
gc_blocking: None, last_aux_file_policy: None, }; @@ -300,7 +424,9 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: None, + archived_at: None, lineage: Lineage::default(), + gc_blocking: None, last_aux_file_policy: None, }; @@ -343,7 +469,9 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: 
TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), + archived_at: None, lineage: Lineage::default(), + gc_blocking: None, last_aux_file_policy: None, }; @@ -389,7 +517,9 @@ mod tests { ]) .unwrap(), deleted_at: None, + archived_at: None, lineage: Lineage::default(), + gc_blocking: None, last_aux_file_policy: None, }; @@ -430,7 +560,9 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: 
TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), + archived_at: None, lineage: Lineage::default(), + gc_blocking: None, last_aux_file_policy: None, }; @@ -470,11 +602,13 @@ mod tests { disk_consistent_lsn: Lsn::from_str("0/15A7618").unwrap(), metadata: 
TimelineMetadata::from_bytes(&[226,88,25,241,0,46,0,4,0,0,0,0,1,90,118,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,78,244,32,0,0,0,0,1,78,244,32,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: None, + archived_at: None, lineage: Lineage { reparenting_history_truncated: false, reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()], original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))), }, + gc_blocking: None, last_aux_file_policy: None, }; @@ -519,11 +653,13 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: 
TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), + archived_at: None, lineage: Lineage { reparenting_history_truncated: false, reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()], original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))), }, + gc_blocking: None, last_aux_file_policy: Some(AuxFilePolicy::V2), }; @@ -577,7 +713,9 @@ mod tests { 14, ).with_recalculated_checksum().unwrap(), deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), + archived_at: None, lineage: Default::default(), + gc_blocking: None, last_aux_file_policy: Default::default(), }; @@ -585,6 +723,125 @@ mod tests { assert_eq!(part, expected); } + #[test] + fn v8_indexpart_is_parsed() { + let example = r#"{ + 
"version": 8, + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } + }, + "disk_consistent_lsn":"0/16960E8", + "metadata": { + "disk_consistent_lsn": "0/16960E8", + "prev_record_lsn": "0/1696070", + "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e", + "ancestor_lsn": "0/0", + "latest_gc_cutoff_lsn": "0/1696070", + "initdb_lsn": "0/1696070", + "pg_version": 14 + }, + "deleted_at": "2023-07-31T09:00:00.123", + "archived_at": "2023-04-29T09:00:00.123" + }"#; + + let expected = IndexPart { + version: 8, + layer_metadata: HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { + file_size: 25600000, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { + file_size: 9007199254741001, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata: TimelineMetadata::new( + Lsn::from_str("0/16960E8").unwrap(), + Some(Lsn::from_str("0/1696070").unwrap()), + Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()), + Lsn::INVALID, + Lsn::from_str("0/1696070").unwrap(), + Lsn::from_str("0/1696070").unwrap(), + 14, + ).with_recalculated_checksum().unwrap(), + deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), + archived_at: Some(parse_naive_datetime("2023-04-29T09:00:00.123000000")), + lineage: Default::default(), + gc_blocking: None, + last_aux_file_policy: Default::default(), + }; + + let part = 
IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + + #[test] + fn v9_indexpart_is_parsed() { + let example = r#"{ + "version": 9, + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } + }, + "disk_consistent_lsn":"0/16960E8", + "metadata": { + "disk_consistent_lsn": "0/16960E8", + "prev_record_lsn": "0/1696070", + "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e", + "ancestor_lsn": "0/0", + "latest_gc_cutoff_lsn": "0/1696070", + "initdb_lsn": "0/1696070", + "pg_version": 14 + }, + "gc_blocking": { + "started_at": "2024-07-19T09:00:00.123", + "reasons": ["DetachAncestor"] + } + }"#; + + let expected = IndexPart { + version: 9, + layer_metadata: HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { + file_size: 25600000, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { + file_size: 9007199254741001, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata: TimelineMetadata::new( + Lsn::from_str("0/16960E8").unwrap(), + Some(Lsn::from_str("0/1696070").unwrap()), + Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()), + Lsn::INVALID, + Lsn::from_str("0/1696070").unwrap(), + Lsn::from_str("0/1696070").unwrap(), + 14, + ).with_recalculated_checksum().unwrap(), + deleted_at: None, + lineage: Default::default(), + gc_blocking: Some(GcBlocking { + started_at: 
parse_naive_datetime("2024-07-19T09:00:00.123000000"), + reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]), + }), + last_aux_file_policy: Default::default(), + archived_at: None, + }; + + let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + fn parse_naive_datetime(s: &str) -> NaiveDateTime { chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f").unwrap() } diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index af6840f525..1331c07d05 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -8,6 +8,7 @@ use std::{sync::Arc, time::SystemTime}; use crate::{ context::RequestContext, disk_usage_eviction_task::DiskUsageEvictionInfo, + metrics::SECONDARY_HEATMAP_TOTAL_SIZE, task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, }; @@ -23,12 +24,15 @@ use super::{ storage_layer::LayerName, }; +use crate::metrics::SECONDARY_RESIDENT_PHYSICAL_SIZE; +use metrics::UIntGauge; use pageserver_api::{ models, shard::{ShardIdentity, TenantShardId}, }; use remote_storage::GenericRemoteStorage; +use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::instrument; use utils::{completion::Barrier, id::TimelineId, sync::gate::Gate}; @@ -99,6 +103,21 @@ pub(crate) struct SecondaryTenant { // Public state indicating overall progress of downloads relative to the last heatmap seen pub(crate) progress: std::sync::Mutex, + + // Sum of layer sizes on local disk + pub(super) resident_size_metric: UIntGauge, + + // Sum of layer sizes in the most recently downloaded heatmap + pub(super) heatmap_total_size_metric: UIntGauge, +} + +impl Drop for SecondaryTenant { + fn drop(&mut self) { + let tenant_id = self.tenant_shard_id.tenant_id.to_string(); + let shard_id = format!("{}", self.tenant_shard_id.shard_slug()); + let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]); + let _ = 
SECONDARY_HEATMAP_TOTAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]); + } } impl SecondaryTenant { @@ -108,6 +127,16 @@ impl SecondaryTenant { tenant_conf: TenantConfOpt, config: &SecondaryLocationConfig, ) -> Arc { + let tenant_id = tenant_shard_id.tenant_id.to_string(); + let shard_id = format!("{}", tenant_shard_id.shard_slug()); + let resident_size_metric = SECONDARY_RESIDENT_PHYSICAL_SIZE + .get_metric_with_label_values(&[&tenant_id, &shard_id]) + .unwrap(); + + let heatmap_total_size_metric = SECONDARY_HEATMAP_TOTAL_SIZE + .get_metric_with_label_values(&[&tenant_id, &shard_id]) + .unwrap(); + Arc::new(Self { tenant_shard_id, // todo: shall we make this a descendent of the @@ -123,6 +152,9 @@ impl SecondaryTenant { detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())), progress: std::sync::Mutex::default(), + + resident_size_metric, + heatmap_total_size_metric, }) } @@ -211,16 +243,12 @@ impl SecondaryTenant { // have to 100% match what is on disk, because it's a best-effort warming // of the cache. let mut detail = this.detail.lock().unwrap(); - if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) { - let removed = timeline_detail.on_disk_layers.remove(&name); - - // We might race with removal of the same layer during downloads, if it was removed - // from the heatmap. If we see that the OnDiskState is gone, then no need to - // do a physical deletion or store in evicted_at. - if let Some(removed) = removed { - removed.remove_blocking(); - timeline_detail.evicted_at.insert(name, now); - } + if let Some(removed) = + detail.evict_layer(name, &timeline_id, now, &this.resident_size_metric) + { + // We might race with removal of the same layer during downloads, so finding the layer we + // were trying to remove is optional. Only issue the disk I/O to remove it if we found it. 
+ removed.remove_blocking(); } }) .await @@ -276,15 +304,50 @@ impl SecondaryController { } } +pub struct GlobalTasks { + cancel: CancellationToken, + uploader: JoinHandle<()>, + downloader: JoinHandle<()>, +} + +impl GlobalTasks { + /// Caller is responsible for requesting shutdown via the cancellation token that was + /// passed to [`spawn_tasks`]. + /// + /// # Panics + /// + /// This method panics if that token is not cancelled. + /// This is low-risk because we're calling this during process shutdown, so, a panic + /// will be informative but not cause undue downtime. + pub async fn wait(self) { + let Self { + cancel, + uploader, + downloader, + } = self; + assert!( + cancel.is_cancelled(), + "must cancel cancellation token, otherwise the tasks will not shut down" + ); + + let (uploader, downloader) = futures::future::join(uploader, downloader).await; + uploader.expect( + "unreachable: exit_on_panic_or_error would catch the panic and exit the process", + ); + downloader.expect( + "unreachable: exit_on_panic_or_error would catch the panic and exit the process", + ); + } +} + pub fn spawn_tasks( tenant_manager: Arc, remote_storage: GenericRemoteStorage, background_jobs_can_start: Barrier, cancel: CancellationToken, -) -> SecondaryController { +) -> (SecondaryController, GlobalTasks) { let mgr_clone = tenant_manager.clone(); let storage_clone = remote_storage.clone(); - let cancel_clone = cancel.clone(); let bg_jobs_clone = background_jobs_can_start.clone(); let (download_req_tx, download_req_rx) = @@ -292,17 +355,9 @@ pub fn spawn_tasks( let (upload_req_tx, upload_req_rx) = tokio::sync::mpsc::channel::>(16); - let downloader_task_ctx = RequestContext::new( - TaskKind::SecondaryDownloads, - crate::context::DownloadBehavior::Download, - ); - task_mgr::spawn( - BACKGROUND_RUNTIME.handle(), - downloader_task_ctx.task_kind(), - None, - None, + let cancel_clone = cancel.clone(); + let downloader = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( "secondary 
tenant downloads", - false, async move { downloader_task( mgr_clone, @@ -310,49 +365,41 @@ pub fn spawn_tasks( download_req_rx, bg_jobs_clone, cancel_clone, - downloader_task_ctx, + RequestContext::new( + TaskKind::SecondaryDownloads, + crate::context::DownloadBehavior::Download, + ), ) .await; - - Ok(()) + anyhow::Ok(()) }, - ); + )); - task_mgr::spawn( - BACKGROUND_RUNTIME.handle(), - TaskKind::SecondaryUploads, - None, - None, + let cancel_clone = cancel.clone(); + let uploader = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( "heatmap uploads", - false, async move { heatmap_uploader_task( tenant_manager, remote_storage, upload_req_rx, background_jobs_can_start, - cancel, + cancel_clone, ) .await; - - Ok(()) + anyhow::Ok(()) }, - ); + )); - SecondaryController { - download_req_tx, - upload_req_tx, - } -} - -/// For running with remote storage disabled: a SecondaryController that is connected to nothing. -pub fn null_controller() -> SecondaryController { - let (download_req_tx, _download_req_rx) = - tokio::sync::mpsc::channel::>(16); - let (upload_req_tx, _upload_req_rx) = - tokio::sync::mpsc::channel::>(16); - SecondaryController { - upload_req_tx, - download_req_tx, - } + ( + SecondaryController { + upload_req_tx, + download_req_tx, + }, + GlobalTasks { + cancel, + uploader, + downloader, + }, + ) } diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 24176ecf19..90e1c01dbd 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -22,7 +22,7 @@ use crate::{ FAILED_REMOTE_OP_RETRIES, }, span::debug_assert_current_span_has_tenant_id, - storage_layer::{layer::local_layer_path, LayerName}, + storage_layer::{layer::local_layer_path, LayerName, LayerVisibilityHint}, tasks::{warn_when_period_overrun, BackgroundLoopKind}, }, virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}, @@ -46,6 +46,7 @@ use crate::tenant::{ use 
camino::Utf8PathBuf; use chrono::format::{DelayedFormat, StrftimeItems}; use futures::Future; +use metrics::UIntGauge; use pageserver_api::models::SecondaryProgress; use pageserver_api::shard::TenantShardId; use remote_storage::{DownloadError, Etag, GenericRemoteStorage}; @@ -54,7 +55,7 @@ use tokio_util::sync::CancellationToken; use tracing::{info_span, instrument, warn, Instrument}; use utils::{ backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext, - id::TimelineId, serde_system_time, + id::TimelineId, pausable_failpoint, serde_system_time, }; use super::{ @@ -131,16 +132,66 @@ impl OnDiskState { .or_else(fs_ext::ignore_not_found) .fatal_err("Deleting secondary layer") } + + pub(crate) fn file_size(&self) -> u64 { + self.metadata.file_size + } } #[derive(Debug, Clone, Default)] pub(super) struct SecondaryDetailTimeline { - pub(super) on_disk_layers: HashMap, + on_disk_layers: HashMap, /// We remember when layers were evicted, to prevent re-downloading them. 
pub(super) evicted_at: HashMap, } +impl SecondaryDetailTimeline { + pub(super) fn remove_layer( + &mut self, + name: &LayerName, + resident_metric: &UIntGauge, + ) -> Option { + let removed = self.on_disk_layers.remove(name); + if let Some(removed) = &removed { + resident_metric.sub(removed.file_size()); + } + removed + } + + /// `local_path` + fn touch_layer( + &mut self, + conf: &'static PageServerConf, + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + touched: &HeatMapLayer, + resident_metric: &UIntGauge, + local_path: F, + ) where + F: FnOnce() -> Utf8PathBuf, + { + use std::collections::hash_map::Entry; + match self.on_disk_layers.entry(touched.name.clone()) { + Entry::Occupied(mut v) => { + v.get_mut().access_time = touched.access_time; + } + Entry::Vacant(e) => { + e.insert(OnDiskState::new( + conf, + tenant_shard_id, + timeline_id, + touched.name.clone(), + touched.metadata.clone(), + touched.access_time, + local_path(), + )); + resident_metric.add(touched.metadata.file_size); + } + } + } +} + // Aspects of a heatmap that we remember after downloading it #[derive(Clone, Debug)] struct DownloadSummary { @@ -158,7 +209,7 @@ pub(super) struct SecondaryDetail { last_download: Option, next_download: Option, - pub(super) timelines: HashMap, + timelines: HashMap, } /// Helper for logging SystemTime @@ -191,6 +242,38 @@ impl SecondaryDetail { } } + pub(super) fn evict_layer( + &mut self, + name: LayerName, + timeline_id: &TimelineId, + now: SystemTime, + resident_metric: &UIntGauge, + ) -> Option { + let timeline = self.timelines.get_mut(timeline_id)?; + let removed = timeline.remove_layer(&name, resident_metric); + if removed.is_some() { + timeline.evicted_at.insert(name, now); + } + removed + } + + pub(super) fn remove_timeline( + &mut self, + timeline_id: &TimelineId, + resident_metric: &UIntGauge, + ) { + let removed = self.timelines.remove(timeline_id); + if let Some(removed) = removed { + resident_metric.sub( + removed + .on_disk_layers + 
.values() + .map(|l| l.metadata.file_size) + .sum(), + ); + } + } + /// Additionally returns the total number of layers, used for more stable relative access time /// based eviction. pub(super) fn get_layers_for_eviction( @@ -213,6 +296,9 @@ impl SecondaryDetail { }), last_activity_ts: ods.access_time, relative_last_activity: finite_f32::FiniteF32::ZERO, + // Secondary location layers are presumed visible, because Covered layers + // are excluded from the heatmap + visibility: LayerVisibilityHint::Visible, } })); @@ -262,6 +348,7 @@ impl scheduler::RunningJob for RunningDownload { struct CompleteDownload { secondary_state: Arc, completed_at: Instant, + result: Result<(), UpdateError>, } impl scheduler::Completion for CompleteDownload { @@ -286,21 +373,33 @@ impl JobGenerator { + // Start downloading again as soon as we can. This will involve waiting for the scheduler's + // scheduling interval. This slightly reduces the peak download speed of tenants that hit their + // deadline and keep restarting, but that also helps give other tenants a chance to execute rather + // that letting one big tenant dominate for a long time. + detail.next_download = Some(Instant::now()); + } + _ => { + let period = detail + .last_download + .as_ref() + .map(|d| d.upload_period) + .unwrap_or(DEFAULT_DOWNLOAD_INTERVAL); - // We advance next_download irrespective of errors: we don't want error cases to result in - // expensive busy-polling. - detail.next_download = Some(Instant::now() + period_jitter(period, 5)); + // We advance next_download irrespective of errors: we don't want error cases to result in + // expensive busy-polling. + detail.next_download = Some(Instant::now() + period_jitter(period, 5)); + } + } } async fn schedule(&mut self) -> SchedulingResult { @@ -396,9 +495,10 @@ impl JobGenerator { tracing::info!("No heatmap found for tenant. 
This is fine if it is new."); @@ -415,6 +515,9 @@ impl JobGenerator { tracing::error!("Error while downloading tenant: {e}"); }, + Err(UpdateError::Restart) => { + tracing::info!("Download reached deadline & will restart to update heatmap") + } Ok(()) => {} }; @@ -436,6 +539,7 @@ impl JobGenerator { /// Errors that may be encountered while updating a tenant #[derive(thiserror::Error, Debug)] enum UpdateError { + /// This is not a true failure, but it's how a download indicates that it would like to be restarted by + /// the scheduler, to pick up the latest heatmap + #[error("Reached deadline, restarting downloads")] + Restart, + #[error("No remote data found")] NoData, #[error("Insufficient local storage space")] @@ -578,8 +687,13 @@ impl<'a> TenantDownloader<'a> { Some(t) => t, None => { // We have no existing state: need to scan local disk for layers first. - let timeline_state = - init_timeline_state(self.conf, tenant_shard_id, timeline).await; + let timeline_state = init_timeline_state( + self.conf, + tenant_shard_id, + timeline, + &self.secondary_state.resident_size_metric, + ) + .await; // Re-acquire detail lock now that we're done with async load from local FS self.secondary_state @@ -603,6 +717,26 @@ impl<'a> TenantDownloader<'a> { self.prepare_timelines(&heatmap, heatmap_mtime).await?; } + // Calculate a deadline for downloads: if downloading takes longer than this, it is useful to drop out and start again, + // so that we are always using reasonably a fresh heatmap. Otherwise, if we had really huge content to download, we might + // spend 10s of minutes downloading layers we don't need. 
+ // (see https://github.com/neondatabase/neon/issues/8182) + let deadline = { + let period = self + .secondary_state + .detail + .lock() + .unwrap() + .last_download + .as_ref() + .map(|d| d.upload_period) + .unwrap_or(DEFAULT_DOWNLOAD_INTERVAL); + + // Use double the period: we are not promising to complete within the period, this is just a heuristic + // to keep using a "reasonably fresh" heatmap. + Instant::now() + period * 2 + }; + // Download the layers in the heatmap for timeline in heatmap.timelines { let timeline_state = timeline_states @@ -618,7 +752,7 @@ impl<'a> TenantDownloader<'a> { } let timeline_id = timeline.timeline_id; - self.download_timeline(timeline, timeline_state, ctx) + self.download_timeline(timeline, timeline_state, deadline, ctx) .instrument(tracing::info_span!( "secondary_download_timeline", tenant_id=%tenant_shard_id.tenant_id, @@ -628,6 +762,25 @@ impl<'a> TenantDownloader<'a> { .await?; } + // Metrics consistency check in testing builds + if cfg!(feature = "testing") { + let detail = self.secondary_state.detail.lock().unwrap(); + let resident_size = detail + .timelines + .values() + .map(|tl| { + tl.on_disk_layers + .values() + .map(|v| v.metadata.file_size) + .sum::() + }) + .sum::(); + assert_eq!( + resident_size, + self.secondary_state.resident_size_metric.get() + ); + } + // Only update last_etag after a full successful download: this way will not skip // the next download, even if the heatmap's actual etag is unchanged. 
self.secondary_state.detail.lock().unwrap().last_download = Some(DownloadSummary { @@ -676,6 +829,12 @@ impl<'a> TenantDownloader<'a> { layers_downloaded: 0, bytes_downloaded: 0, }; + + // Also expose heatmap bytes_total as a metric + self.secondary_state + .heatmap_total_size_metric + .set(heatmap_stats.bytes); + // Accumulate list of things to delete while holding the detail lock, for execution after dropping the lock let mut delete_layers = Vec::new(); let mut delete_timelines = Vec::new(); @@ -740,7 +899,7 @@ impl<'a> TenantDownloader<'a> { for delete_timeline in &delete_timelines { // We haven't removed from disk yet, but optimistically remove from in-memory state: if removal // from disk fails that will be a fatal error. - detail.timelines.remove(delete_timeline); + detail.remove_timeline(delete_timeline, &self.secondary_state.resident_size_metric); } } @@ -758,7 +917,7 @@ impl<'a> TenantDownloader<'a> { let Some(timeline_state) = detail.timelines.get_mut(&timeline_id) else { continue; }; - timeline_state.on_disk_layers.remove(&layer_name); + timeline_state.remove_layer(&layer_name, &self.secondary_state.resident_size_metric); } for timeline_id in delete_timelines { @@ -827,26 +986,28 @@ impl<'a> TenantDownloader<'a> { .and_then(|x| x) } - async fn download_timeline( + /// Download heatmap layers that are not present on local disk, or update their + /// access time if they are already present. 
+ async fn download_timeline_layers( &self, + tenant_shard_id: &TenantShardId, timeline: HeatMapTimeline, timeline_state: SecondaryDetailTimeline, + deadline: Instant, ctx: &RequestContext, - ) -> Result<(), UpdateError> { - debug_assert_current_span_has_tenant_and_timeline_id(); - let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); - + ) -> (Result<(), UpdateError>, Vec) { // Accumulate updates to the state let mut touched = Vec::new(); - tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len()); - - // Download heatmap layers that are not present on local disk, or update their - // access time if they are already present. for layer in timeline.layers { if self.secondary_state.cancel.is_cancelled() { tracing::debug!("Cancelled -- dropping out of layer loop"); - return Err(UpdateError::Cancelled); + return (Err(UpdateError::Cancelled), touched); + } + + if Instant::now() > deadline { + // We've been running downloads for a while, restart to download latest heatmap. + return (Err(UpdateError::Restart), touched); } // Existing on-disk layers: just update their access time. @@ -916,52 +1077,66 @@ impl<'a> TenantDownloader<'a> { match self .download_layer(tenant_shard_id, &timeline.timeline_id, layer, ctx) - .await? + .await { - Some(layer) => touched.push(layer), - None => { + Ok(Some(layer)) => touched.push(layer), + Ok(None) => { // Not an error but we didn't download it: remote layer is missing. Don't add it to the list of // things to consider touched. } - } - } - - // Write updates to state to record layers we just downloaded or touched. 
- { - let mut detail = self.secondary_state.detail.lock().unwrap(); - let timeline_detail = detail.timelines.entry(timeline.timeline_id).or_default(); - - tracing::info!("Wrote timeline_detail for {} touched layers", touched.len()); - - for t in touched { - use std::collections::hash_map::Entry; - match timeline_detail.on_disk_layers.entry(t.name.clone()) { - Entry::Occupied(mut v) => { - v.get_mut().access_time = t.access_time; - } - Entry::Vacant(e) => { - let local_path = local_layer_path( - self.conf, - tenant_shard_id, - &timeline.timeline_id, - &t.name, - &t.metadata.generation, - ); - e.insert(OnDiskState::new( - self.conf, - tenant_shard_id, - &timeline.timeline_id, - t.name, - t.metadata.clone(), - t.access_time, - local_path, - )); - } + Err(e) => { + return (Err(e), touched); } } } - Ok(()) + (Ok(()), touched) + } + + async fn download_timeline( + &self, + timeline: HeatMapTimeline, + timeline_state: SecondaryDetailTimeline, + deadline: Instant, + ctx: &RequestContext, + ) -> Result<(), UpdateError> { + debug_assert_current_span_has_tenant_and_timeline_id(); + let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); + let timeline_id = timeline.timeline_id; + + tracing::debug!(timeline_id=%timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len()); + + let (result, touched) = self + .download_timeline_layers(tenant_shard_id, timeline, timeline_state, deadline, ctx) + .await; + + // Write updates to state to record layers we just downloaded or touched, irrespective of whether the overall result was successful + { + let mut detail = self.secondary_state.detail.lock().unwrap(); + let timeline_detail = detail.timelines.entry(timeline_id).or_default(); + + tracing::info!("Wrote timeline_detail for {} touched layers", touched.len()); + touched.into_iter().for_each(|t| { + timeline_detail.touch_layer( + self.conf, + tenant_shard_id, + &timeline_id, + &t, + &self.secondary_state.resident_size_metric, + || { + local_layer_path( + self.conf, 
+ tenant_shard_id, + &timeline_id, + &t.name, + &t.metadata.generation, + ) + }, + ) + }); + } + + result } /// Call this during timeline download if a layer will _not_ be downloaded, to update progress statistics @@ -980,12 +1155,14 @@ impl<'a> TenantDownloader<'a> { layer: HeatMapLayer, ctx: &RequestContext, ) -> Result, UpdateError> { - // Failpoint for simulating slow remote storage + // Failpoints for simulating slow remote storage failpoint_support::sleep_millis_async!( "secondary-layer-download-sleep", &self.secondary_state.cancel ); + pausable_failpoint!("secondary-layer-download-pausable"); + let local_path = local_layer_path( self.conf, tenant_shard_id, @@ -1067,6 +1244,7 @@ async fn init_timeline_state( conf: &'static PageServerConf, tenant_shard_id: &TenantShardId, heatmap: &HeatMapTimeline, + resident_metric: &UIntGauge, ) -> SecondaryDetailTimeline { let timeline_path = conf.timeline_path(tenant_shard_id, &heatmap.timeline_id); let mut detail = SecondaryDetailTimeline::default(); @@ -1142,17 +1320,13 @@ async fn init_timeline_state( } else { // We expect the access time to be initialized immediately afterwards, when // the latest heatmap is applied to the state. 
- detail.on_disk_layers.insert( - name.clone(), - OnDiskState::new( - conf, - tenant_shard_id, - &heatmap.timeline_id, - name, - remote_meta.metadata.clone(), - remote_meta.access_time, - file_path, - ), + detail.touch_layer( + conf, + tenant_shard_id, + &heatmap.timeline_id, + remote_meta, + resident_metric, + || file_path, ); } } diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs index 166483ba5d..4a8e66d38a 100644 --- a/pageserver/src/tenant/secondary/heatmap.rs +++ b/pageserver/src/tenant/secondary/heatmap.rs @@ -29,16 +29,16 @@ pub(super) struct HeatMapTenant { #[derive(Serialize, Deserialize)] pub(crate) struct HeatMapTimeline { #[serde_as(as = "DisplayFromStr")] - pub(super) timeline_id: TimelineId, + pub(crate) timeline_id: TimelineId, - pub(super) layers: Vec, + pub(crate) layers: Vec, } #[serde_as] #[derive(Serialize, Deserialize)] pub(crate) struct HeatMapLayer { - pub(super) name: LayerName, - pub(super) metadata: LayerFileMetadata, + pub(crate) name: LayerName, + pub(crate) metadata: LayerFileMetadata, #[serde_as(as = "TimestampSeconds")] pub(super) access_time: SystemTime, diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index 9c7a9c4234..0aad5bf392 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -367,10 +367,9 @@ async fn upload_tenant_heatmap( debug_assert_current_span_has_tenant_id(); let generation = tenant.get_generation(); + debug_assert!(!generation.is_none()); if generation.is_none() { - // We do not expect this: generations were implemented before heatmap uploads. 
However, - // handle it so that we don't have to make the generation in the heatmap an Option<> - // (Generation::none is not serializable) + // We do not expect this: None generations should only appear in historic layer metadata, not in running Tenants tracing::warn!("Skipping heatmap upload for tenant with generation==None"); return Ok(UploadHeatmapOutcome::Skipped); } diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index b2338b620e..41d558d3f6 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -3,6 +3,7 @@ use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet}; use std::sync::Arc; +use tenant_size_model::svg::SvgBranchKind; use tokio::sync::oneshot::error::RecvError; use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; @@ -87,6 +88,9 @@ impl SegmentMeta { LsnKind::BranchPoint => true, LsnKind::GcCutOff => true, LsnKind::BranchEnd => false, + LsnKind::LeasePoint => true, + LsnKind::LeaseStart => false, + LsnKind::LeaseEnd => false, } } } @@ -103,6 +107,21 @@ pub enum LsnKind { GcCutOff, /// Last record LSN BranchEnd, + /// A LSN lease is granted here. + LeasePoint, + /// A lease starts from here. + LeaseStart, + /// Last record LSN for the lease (should have the same LSN as the previous [`LsnKind::LeaseStart`]). + LeaseEnd, +} + +impl From for SvgBranchKind { + fn from(kind: LsnKind) -> Self { + match kind { + LsnKind::LeasePoint | LsnKind::LeaseStart | LsnKind::LeaseEnd => SvgBranchKind::Lease, + _ => SvgBranchKind::Timeline, + } + } } /// Collect all relevant LSNs to the inputs. 
These will only be helpful in the serialized form as @@ -116,19 +135,20 @@ pub struct TimelineInputs { ancestor_lsn: Lsn, last_record: Lsn, latest_gc_cutoff: Lsn, - horizon_cutoff: Lsn, - pitr_cutoff: Lsn, /// Cutoff point based on GC settings - next_gc_cutoff: Lsn, + next_pitr_cutoff: Lsn, /// Cutoff point calculated from the user-supplied 'max_retention_period' retention_param_cutoff: Option, + + /// Lease points on the timeline + lease_points: Vec, } /// Gathers the inputs for the tenant sizing model. /// -/// Tenant size does not consider the latest state, but only the state until next_gc_cutoff, which +/// Tenant size does not consider the latest state, but only the state until next_pitr_cutoff, which /// is updated on-demand, during the start of this calculation and separate from the /// [`TimelineInputs::latest_gc_cutoff`]. /// @@ -136,11 +156,8 @@ pub struct TimelineInputs { /// /// ```text /// 0-----|---------|----|------------| · · · · · |·> lsn -/// initdb_lsn branchpoints* next_gc_cutoff latest +/// initdb_lsn branchpoints* next_pitr_cutoff latest /// ``` -/// -/// Until gc_horizon_cutoff > `Timeline::last_record_lsn` for any of the tenant's timelines, the -/// tenant size will be zero. pub(super) async fn gather_inputs( tenant: &Tenant, limit: &Arc, @@ -150,7 +167,7 @@ pub(super) async fn gather_inputs( cancel: &CancellationToken, ctx: &RequestContext, ) -> Result { - // refresh is needed to update gc related pitr_cutoff and horizon_cutoff + // refresh is needed to update [`timeline::GcCutoffs`] tenant.refresh_gc_info(cancel, ctx).await?; // Collect information about all the timelines @@ -214,27 +231,32 @@ pub(super) async fn gather_inputs( // we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not // actually removing files. 
// - // We only consider [`GcInfo::pitr_cutoff`], and not [`GcInfo::horizon_cutoff`], because from + // We only consider [`timeline::GcCutoffs::time`], and not [`timeline::GcCutoffs::space`], because from // a user's perspective they have only requested retention up to the time bound (pitr_cutoff), rather - // than a space bound (horizon cutoff). This means that if someone drops a database and waits for their + // than our internal space cutoff. This means that if someone drops a database and waits for their // PITR interval, they will see synthetic size decrease, even if we are still storing data inside - // horizon_cutoff. - let pitr_cutoff = gc_info.cutoffs.pitr; - let horizon_cutoff = gc_info.cutoffs.horizon; - let mut next_gc_cutoff = pitr_cutoff; + // the space cutoff. + let mut next_pitr_cutoff = gc_info.cutoffs.time; // If the caller provided a shorter retention period, use that instead of the GC cutoff. let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period { let param_cutoff = Lsn(last_record_lsn.0.saturating_sub(max_retention_period)); - if next_gc_cutoff < param_cutoff { - next_gc_cutoff = param_cutoff; + if next_pitr_cutoff < param_cutoff { + next_pitr_cutoff = param_cutoff; } Some(param_cutoff) } else { None }; - // next_gc_cutoff in parent branch are not of interest (right now at least), nor do we + let lease_points = gc_info + .leases + .keys() + .filter(|&&lsn| lsn > ancestor_lsn) + .copied() + .collect::>(); + + // next_pitr_cutoff in parent branch are not of interest (right now at least), nor do we // want to query any logical size before initdb_lsn. 
let branch_start_lsn = cmp::max(ancestor_lsn, timeline.initdb_lsn); @@ -242,12 +264,14 @@ pub(super) async fn gather_inputs( let mut lsns: Vec<(Lsn, LsnKind)> = gc_info .retain_lsns .iter() - .filter(|&&lsn| lsn > ancestor_lsn) + .filter(|(lsn, _child_id)| lsn > &ancestor_lsn) .copied() // this assumes there are no other retain_lsns than the branchpoints - .map(|lsn| (lsn, LsnKind::BranchPoint)) + .map(|(lsn, _child_id)| (lsn, LsnKind::BranchPoint)) .collect::>(); + lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint))); + drop(gc_info); // Add branch points we collected earlier, just in case there were any that were @@ -260,10 +284,10 @@ pub(super) async fn gather_inputs( ) } - // Add a point for the GC cutoff - let branch_start_needed = next_gc_cutoff <= branch_start_lsn; + // Add a point for the PITR cutoff + let branch_start_needed = next_pitr_cutoff <= branch_start_lsn; if !branch_start_needed { - lsns.push((next_gc_cutoff, LsnKind::GcCutOff)); + lsns.push((next_pitr_cutoff, LsnKind::GcCutOff)); } lsns.sort_unstable(); @@ -296,17 +320,56 @@ pub(super) async fn gather_inputs( if kind == LsnKind::BranchPoint { branchpoint_segments.insert((timeline_id, lsn), segments.len()); } + segments.push(SegmentMeta { segment: Segment { parent: Some(parent), lsn: lsn.0, size: None, - needed: lsn > next_gc_cutoff, + needed: lsn > next_pitr_cutoff, }, timeline_id: timeline.timeline_id, kind, }); - parent += 1; + + parent = segments.len() - 1; + + if kind == LsnKind::LeasePoint { + // Needs `LeaseStart` and `LeaseEnd` as well to model lease as a read-only branch that never writes data + // (i.e. it's lsn has not advanced from ancestor_lsn), and therefore the three segments have the same LSN + // value. Without the other two segments, the calculation code would not count the leased LSN as a point + // to be retained. + // Did not use `BranchStart` or `BranchEnd` so we can differentiate branches and leases during debug. 
+ // + // Alt Design: rewrite the entire calculation code to be independent of timeline id. Both leases and + // branch points can be given a synthetic id so we can unite them. + let mut lease_parent = parent; + + // Start of a lease. + segments.push(SegmentMeta { + segment: Segment { + parent: Some(lease_parent), + lsn: lsn.0, + size: None, // Filled in later, if necessary + needed: lsn > next_pitr_cutoff, // only needed if the point is within rentention. + }, + timeline_id: timeline.timeline_id, + kind: LsnKind::LeaseStart, + }); + lease_parent += 1; + + // End of the lease. + segments.push(SegmentMeta { + segment: Segment { + parent: Some(lease_parent), + lsn: lsn.0, + size: None, // Filled in later, if necessary + needed: true, // everything at the lease LSN must be readable => is needed + }, + timeline_id: timeline.timeline_id, + kind: LsnKind::LeaseEnd, + }); + } } // Current end of the timeline @@ -328,10 +391,9 @@ pub(super) async fn gather_inputs( last_record: last_record_lsn, // this is not used above, because it might not have updated recently enough latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(), - horizon_cutoff, - pitr_cutoff, - next_gc_cutoff, + next_pitr_cutoff, retention_param_cutoff, + lease_points, }); } @@ -671,30 +733,27 @@ fn verify_size_for_multiple_branches() { "ancestor_lsn": "0/18D3D98", "last_record": "0/2230CD0", "latest_gc_cutoff": "0/1698C48", - "horizon_cutoff": "0/2210CD0", - "pitr_cutoff": "0/2210CD0", - "next_gc_cutoff": "0/2210CD0", - "retention_param_cutoff": null + "next_pitr_cutoff": "0/2210CD0", + "retention_param_cutoff": null, + "lease_points": [] }, { "timeline_id": "454626700469f0a9914949b9d018e876", "ancestor_lsn": "0/176D998", "last_record": "0/1837770", "latest_gc_cutoff": "0/1698C48", - "horizon_cutoff": "0/1817770", - "pitr_cutoff": "0/1817770", - "next_gc_cutoff": "0/1817770", - "retention_param_cutoff": null + "next_pitr_cutoff": "0/1817770", + "retention_param_cutoff": null, + "lease_points": [] }, { 
"timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f", "ancestor_lsn": "0/0", "last_record": "0/18D3D98", "latest_gc_cutoff": "0/1698C48", - "horizon_cutoff": "0/18B3D98", - "pitr_cutoff": "0/18B3D98", - "next_gc_cutoff": "0/18B3D98", - "retention_param_cutoff": null + "next_pitr_cutoff": "0/18B3D98", + "retention_param_cutoff": null, + "lease_points": [] } ] } @@ -746,10 +805,9 @@ fn verify_size_for_one_branch() { "ancestor_lsn": "0/0", "last_record": "47/280A5860", "latest_gc_cutoff": "47/240A5860", - "horizon_cutoff": "47/240A5860", - "pitr_cutoff": "47/240A5860", - "next_gc_cutoff": "47/240A5860", - "retention_param_cutoff": "0/0" + "next_pitr_cutoff": "47/240A5860", + "retention_param_cutoff": "0/0", + "lease_points": [] } ] }"#; diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 9607546ce0..133b34b8b5 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -2,36 +2,29 @@ pub mod delta_layer; pub mod image_layer; -pub(crate) mod inmemory_layer; +pub mod inmemory_layer; pub(crate) mod layer; mod layer_desc; mod layer_name; +pub mod merge_iterator; + +#[cfg(test)] +pub mod split_writer; use crate::context::{AccessStatsBehavior, RequestContext}; use crate::repository::Value; -use crate::task_mgr::TaskKind; use crate::walrecord::NeonWalRecord; use bytes::Bytes; -use enum_map::EnumMap; -use enumset::EnumSet; -use once_cell::sync::Lazy; use pageserver_api::key::Key; use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; -use pageserver_api::models::{ - LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus, -}; -use std::borrow::Cow; use std::cmp::{Ordering, Reverse}; use std::collections::hash_map::Entry; use std::collections::{BinaryHeap, HashMap}; use std::ops::Range; -use std::sync::{Arc, Mutex}; +use std::sync::Arc; use std::time::{Duration, SystemTime, UNIX_EPOCH}; -use tracing::warn; -use 
utils::history_buffer::HistoryBufferWithDropCounter; -use utils::rate_limit::RateLimit; -use utils::{id::TimelineId, lsn::Lsn}; +use utils::lsn::Lsn; pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef}; pub use image_layer::{ImageLayer, ImageLayerWriter}; @@ -74,9 +67,9 @@ where /// call, to collect more records. /// #[derive(Debug, Default)] -pub struct ValueReconstructState { - pub records: Vec<(Lsn, NeonWalRecord)>, - pub img: Option<(Lsn, Bytes)>, +pub(crate) struct ValueReconstructState { + pub(crate) records: Vec<(Lsn, NeonWalRecord)>, + pub(crate) img: Option<(Lsn, Bytes)>, } #[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] @@ -442,109 +435,86 @@ impl ReadableLayer { } } -/// Return value from [`Layer::get_value_reconstruct_data`] -#[derive(Clone, Copy, Debug)] -pub enum ValueReconstructResult { - /// Got all the data needed to reconstruct the requested page - Complete, - /// This layer didn't contain all the required data, the caller should look up - /// the predecessor layer at the returned LSN and collect more data from there. - Continue, - - /// This layer didn't contain data needed to reconstruct the page version at - /// the returned LSN. This is usually considered an error, but might be OK - /// in some circumstances. - Missing, +/// Layers contain a hint indicating whether they are likely to be used for reads. This is a hint rather +/// than an authoritative value, so that we do not have to update it synchronously when changing the visibility +/// of layers (for example when creating a branch that makes some previously covered layers visible). It should +/// be used for cache management but not for correctness-critical checks. 
+#[derive(Debug, Clone, PartialEq, Eq)] +pub enum LayerVisibilityHint { + /// A Visible layer might be read while serving a read, because there is not an image layer between it + /// and a readable LSN (the tip of the branch or a child's branch point) + Visible, + /// A Covered layer probably won't be read right now, but _can_ be read in future if someone creates + /// a branch or ephemeral endpoint at an LSN below the layer that covers this. + Covered, } -#[derive(Debug)] -pub struct LayerAccessStats(Mutex); - -/// This struct holds two instances of [`LayerAccessStatsInner`]. -/// Accesses are recorded to both instances. -/// The `for_scraping_api`instance can be reset from the management API via [`LayerAccessStatsReset`]. -/// The `for_eviction_policy` is never reset. -#[derive(Debug, Default, Clone)] -struct LayerAccessStatsLocked { - for_scraping_api: LayerAccessStatsInner, - for_eviction_policy: LayerAccessStatsInner, -} - -impl LayerAccessStatsLocked { - fn iter_mut(&mut self) -> impl Iterator { - [&mut self.for_scraping_api, &mut self.for_eviction_policy].into_iter() - } -} - -#[derive(Debug, Default, Clone)] -struct LayerAccessStatsInner { - first_access: Option, - count_by_access_kind: EnumMap, - task_kind_flag: EnumSet, - last_accesses: HistoryBufferWithDropCounter, - last_residence_changes: HistoryBufferWithDropCounter, -} - -#[derive(Debug, Clone, Copy)] -pub(crate) struct LayerAccessStatFullDetails { - pub(crate) when: SystemTime, - pub(crate) task_kind: TaskKind, - pub(crate) access_kind: LayerAccessKind, -} +pub(crate) struct LayerAccessStats(std::sync::atomic::AtomicU64); #[derive(Clone, Copy, strum_macros::EnumString)] -pub enum LayerAccessStatsReset { +pub(crate) enum LayerAccessStatsReset { NoReset, - JustTaskKindFlags, AllStats, } -fn system_time_to_millis_since_epoch(ts: &SystemTime) -> u64 { - ts.duration_since(UNIX_EPOCH) - .expect("better to die in this unlikely case than report false stats") - .as_millis() - .try_into() - .expect("64 bits 
is enough for few more years") -} +impl Default for LayerAccessStats { + fn default() -> Self { + // Default value is to assume resident since creation time, and visible. + let (_mask, mut value) = Self::to_low_res_timestamp(Self::RTIME_SHIFT, SystemTime::now()); + value |= 0x1 << Self::VISIBILITY_SHIFT; -impl LayerAccessStatFullDetails { - fn as_api_model(&self) -> pageserver_api::models::LayerAccessStatFullDetails { - let Self { - when, - task_kind, - access_kind, - } = self; - pageserver_api::models::LayerAccessStatFullDetails { - when_millis_since_epoch: system_time_to_millis_since_epoch(when), - task_kind: Cow::Borrowed(task_kind.into()), // into static str, powered by strum_macros - access_kind: *access_kind, - } + Self(std::sync::atomic::AtomicU64::new(value)) } } +// Efficient store of two very-low-resolution timestamps and some bits. Used for storing last access time and +// last residence change time. impl LayerAccessStats { - /// Create an empty stats object. - /// - /// The caller is responsible for recording a residence event - /// using [`record_residence_event`] before calling `latest_activity`. - /// If they don't, [`latest_activity`] will return `None`. - /// - /// [`record_residence_event`]: Self::record_residence_event - /// [`latest_activity`]: Self::latest_activity - pub(crate) fn empty_will_record_residence_event_later() -> Self { - LayerAccessStats(Mutex::default()) + // How many high bits to drop from a u32 timestamp? + // - Only storing up to a u32 timestamp will work fine until 2038 (if this code is still in use + // after that, this software has been very successful!) + // - Dropping the top bit is implicitly safe because unix timestamps are meant to be + // stored in an i32, so they never used it. + // - Dropping the next two bits is safe because this code is only running on systems in + // years >= 2024, and these bits have been 1 since 2021 + // + // Therefore we may store only 28 bits for a timestamp with one second resolution. 
We do + // this truncation to make space for some flags in the high bits of our u64. + const TS_DROP_HIGH_BITS: u32 = u32::count_ones(Self::TS_ONES) + 1; + const TS_MASK: u32 = 0x1f_ff_ff_ff; + const TS_ONES: u32 = 0x60_00_00_00; + + const ATIME_SHIFT: u32 = 0; + const RTIME_SHIFT: u32 = 32 - Self::TS_DROP_HIGH_BITS; + const VISIBILITY_SHIFT: u32 = 64 - 2 * Self::TS_DROP_HIGH_BITS; + + fn write_bits(&self, mask: u64, value: u64) -> u64 { + self.0 + .fetch_update( + // TODO: decide what orderings are correct + std::sync::atomic::Ordering::Relaxed, + std::sync::atomic::Ordering::Relaxed, + |v| Some((v & !mask) | (value & mask)), + ) + .expect("Inner function is infallible") } - /// Create an empty stats object and record a [`LayerLoad`] event with the given residence status. - /// - /// See [`record_residence_event`] for why you need to do this while holding the layer map lock. - /// - /// [`LayerLoad`]: LayerResidenceEventReason::LayerLoad - /// [`record_residence_event`]: Self::record_residence_event - pub(crate) fn for_loading_layer(status: LayerResidenceStatus) -> Self { - let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default())); - new.record_residence_event(status, LayerResidenceEventReason::LayerLoad); - new + fn to_low_res_timestamp(shift: u32, time: SystemTime) -> (u64, u64) { + // Drop the low three bits of the timestamp, for an ~8s accuracy + let timestamp = time.duration_since(UNIX_EPOCH).unwrap().as_secs() & (Self::TS_MASK as u64); + + ((Self::TS_MASK as u64) << shift, timestamp << shift) + } + + fn read_low_res_timestamp(&self, shift: u32) -> Option { + let read = self.0.load(std::sync::atomic::Ordering::Relaxed); + + let ts_bits = (read & ((Self::TS_MASK as u64) << shift)) >> shift; + if ts_bits == 0 { + None + } else { + Some(UNIX_EPOCH + Duration::from_secs(ts_bits | (Self::TS_ONES as u64))) + } } /// Record a change in layer residency. 
@@ -560,129 +530,125 @@ impl LayerAccessStats { /// - Eviction: imitate access logical size calculation. This accesses the L0 layers because the L1 layer is not yet in the layer map. /// - Compact: Grab layer map lock, add the new L1 to layer map and remove the L0s, release layer map lock. /// - Eviction: observes the new L1 layer whose only activity timestamp is the LayerCreate event. - /// - pub(crate) fn record_residence_event( - &self, - status: LayerResidenceStatus, - reason: LayerResidenceEventReason, - ) { - let mut locked = self.0.lock().unwrap(); - locked.iter_mut().for_each(|inner| { - inner - .last_residence_changes - .write(LayerResidenceEvent::new(status, reason)) - }); + pub(crate) fn record_residence_event_at(&self, now: SystemTime) { + let (mask, value) = Self::to_low_res_timestamp(Self::RTIME_SHIFT, now); + self.write_bits(mask, value); } - fn record_access(&self, access_kind: LayerAccessKind, ctx: &RequestContext) { + pub(crate) fn record_residence_event(&self) { + self.record_residence_event_at(SystemTime::now()) + } + + fn record_access_at(&self, now: SystemTime) -> bool { + let (mut mask, mut value) = Self::to_low_res_timestamp(Self::ATIME_SHIFT, now); + + // A layer which is accessed must be visible. 
+ mask |= 0x1 << Self::VISIBILITY_SHIFT; + value |= 0x1 << Self::VISIBILITY_SHIFT; + + let old_bits = self.write_bits(mask, value); + !matches!( + self.decode_visibility(old_bits), + LayerVisibilityHint::Visible + ) + } + + /// Returns true if we modified the layer's visibility to set it to Visible implicitly + /// as a result of this access + pub(crate) fn record_access(&self, ctx: &RequestContext) -> bool { if ctx.access_stats_behavior() == AccessStatsBehavior::Skip { - return; + return false; } - let this_access = LayerAccessStatFullDetails { - when: SystemTime::now(), - task_kind: ctx.task_kind(), - access_kind, - }; - - let mut locked = self.0.lock().unwrap(); - locked.iter_mut().for_each(|inner| { - inner.first_access.get_or_insert(this_access); - inner.count_by_access_kind[access_kind] += 1; - inner.task_kind_flag |= ctx.task_kind(); - inner.last_accesses.write(this_access); - }) + self.record_access_at(SystemTime::now()) } fn as_api_model( &self, reset: LayerAccessStatsReset, ) -> pageserver_api::models::LayerAccessStats { - let mut locked = self.0.lock().unwrap(); - let inner = &mut locked.for_scraping_api; - let LayerAccessStatsInner { - first_access, - count_by_access_kind, - task_kind_flag, - last_accesses, - last_residence_changes, - } = inner; let ret = pageserver_api::models::LayerAccessStats { - access_count_by_access_kind: count_by_access_kind - .iter() - .map(|(kind, count)| (kind, *count)) - .collect(), - task_kind_access_flag: task_kind_flag - .iter() - .map(|task_kind| Cow::Borrowed(task_kind.into())) // into static str, powered by strum_macros - .collect(), - first: first_access.as_ref().map(|a| a.as_api_model()), - accesses_history: last_accesses.map(|m| m.as_api_model()), - residence_events_history: last_residence_changes.clone(), + access_time: self + .read_low_res_timestamp(Self::ATIME_SHIFT) + .unwrap_or(UNIX_EPOCH), + residence_time: self + .read_low_res_timestamp(Self::RTIME_SHIFT) + .unwrap_or(UNIX_EPOCH), + visible: 
matches!(self.visibility(), LayerVisibilityHint::Visible), }; match reset { - LayerAccessStatsReset::NoReset => (), - LayerAccessStatsReset::JustTaskKindFlags => { - inner.task_kind_flag.clear(); - } + LayerAccessStatsReset::NoReset => {} LayerAccessStatsReset::AllStats => { - *inner = LayerAccessStatsInner::default(); + self.write_bits((Self::TS_MASK as u64) << Self::ATIME_SHIFT, 0x0); + self.write_bits((Self::TS_MASK as u64) << Self::RTIME_SHIFT, 0x0); } } ret } - /// Get the latest access timestamp, falling back to latest residence event, further falling - /// back to `SystemTime::now` for a usable timestamp for eviction. - pub(crate) fn latest_activity_or_now(&self) -> SystemTime { - self.latest_activity().unwrap_or_else(SystemTime::now) + /// Get the latest access timestamp, falling back to latest residence event. The latest residence event + /// will be this Layer's construction time, if its residence hasn't changed since then. + pub(crate) fn latest_activity(&self) -> SystemTime { + if let Some(t) = self.read_low_res_timestamp(Self::ATIME_SHIFT) { + t + } else { + self.read_low_res_timestamp(Self::RTIME_SHIFT) + .expect("Residence time is set on construction") + } } - /// Get the latest access timestamp, falling back to latest residence event. + /// Whether this layer has been accessed (excluding in [`AccessStatsBehavior::Skip`]). /// - /// This function can only return `None` if there has not yet been a call to the - /// [`record_residence_event`] method. That would generally be considered an - /// implementation error. This function logs a rate-limited warning in that case. - /// - /// TODO: use type system to avoid the need for `fallback`. - /// The approach in - /// could be used to enforce that a residence event is recorded - /// before a layer is added to the layer map. We could also have - /// a layer wrapper type that holds the LayerAccessStats, and ensure - /// that that type can only be produced by inserting into the layer map. 
- /// - /// [`record_residence_event`]: Self::record_residence_event - fn latest_activity(&self) -> Option { - let locked = self.0.lock().unwrap(); - let inner = &locked.for_eviction_policy; - match inner.last_accesses.recent() { - Some(a) => Some(a.when), - None => match inner.last_residence_changes.recent() { - Some(e) => Some(e.timestamp), - None => { - static WARN_RATE_LIMIT: Lazy> = - Lazy::new(|| Mutex::new((0, RateLimit::new(Duration::from_secs(10))))); - let mut guard = WARN_RATE_LIMIT.lock().unwrap(); - guard.0 += 1; - let occurences = guard.0; - guard.1.call(move || { - warn!(parent: None, occurences, "latest_activity not available, this is an implementation bug, using fallback value"); - }); - None - } - }, + /// This indicates whether the layer has been used for some purpose that would motivate + /// us to keep it on disk, such as for serving a getpage request. + fn accessed(&self) -> bool { + // Consider it accessed if the most recent access is more recent than + // the most recent change in residence status. 
+ match ( + self.read_low_res_timestamp(Self::ATIME_SHIFT), + self.read_low_res_timestamp(Self::RTIME_SHIFT), + ) { + (None, _) => false, + (Some(_), None) => true, + (Some(a), Some(r)) => a >= r, } } + + /// Helper for extracting the visibility hint from the literal value of our inner u64 + fn decode_visibility(&self, bits: u64) -> LayerVisibilityHint { + match (bits >> Self::VISIBILITY_SHIFT) & 0x1 { + 1 => LayerVisibilityHint::Visible, + 0 => LayerVisibilityHint::Covered, + _ => unreachable!(), + } + } + + /// Returns the old value which has been replaced + pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) -> LayerVisibilityHint { + let value = match visibility { + LayerVisibilityHint::Visible => 0x1 << Self::VISIBILITY_SHIFT, + LayerVisibilityHint::Covered => 0x0, + }; + + let old_bits = self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value); + self.decode_visibility(old_bits) + } + + pub(crate) fn visibility(&self) -> LayerVisibilityHint { + let read = self.0.load(std::sync::atomic::Ordering::Relaxed); + self.decode_visibility(read) + } } /// Get a layer descriptor from a layer. -pub trait AsLayerDesc { +pub(crate) trait AsLayerDesc { /// Get the layer descriptor. 
fn layer_desc(&self) -> &PersistentLayerDesc; } pub mod tests { use pageserver_api::shard::TenantShardId; + use utils::id::TimelineId; use super::*; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 5e01ecd71d..6c2391d72d 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -33,13 +33,16 @@ use crate::page_cache::{self, FileId, PAGE_SZ}; use crate::repository::{Key, Value, KEY_SIZE}; use crate::tenant::blob_io::BlobWriter; use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader}; -use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; -use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; +use crate::tenant::disk_btree::{ + DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection, +}; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ - BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner, + BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, + VectoredReadPlanner, }; -use crate::tenant::{PageReconstructError, Timeline}; +use crate::tenant::PageReconstructError; +use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; use crate::virtual_file::{self, VirtualFile}; use crate::{walrecord, TEMP_FILE_SUFFIX}; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; @@ -49,10 +52,11 @@ use camino::{Utf8Path, Utf8PathBuf}; use futures::StreamExt; use itertools::Itertools; use pageserver_api::keyspace::KeySpace; -use pageserver_api::models::LayerAccessKind; +use pageserver_api::models::ImageCompressionAlgorithm; use pageserver_api::shard::TenantShardId; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; +use std::collections::VecDeque; use std::fs::File; use 
std::io::SeekFrom; use std::ops::Range; @@ -60,6 +64,7 @@ use std::os::unix::fs::FileExt; use std::str::FromStr; use std::sync::Arc; use tokio::sync::OnceCell; +use tokio_epoll_uring::IoBufMut; use tracing::*; use utils::{ @@ -68,10 +73,7 @@ use utils::{ lsn::Lsn, }; -use super::{ - AsLayerDesc, LayerAccessStats, LayerName, PersistentLayerDesc, ResidentLayer, - ValuesReconstructState, -}; +use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState}; /// /// Header stored in the beginning of the file @@ -196,7 +198,6 @@ impl DeltaKey { pub struct DeltaLayer { path: Utf8PathBuf, pub desc: PersistentLayerDesc, - access_stats: LayerAccessStats, inner: OnceCell>, } @@ -223,6 +224,11 @@ pub struct DeltaLayerInner { file: VirtualFile, file_id: FileId, + #[allow(dead_code)] + layer_key_range: Range, + #[allow(dead_code)] + layer_lsn_range: Range, + max_vectored_read_bytes: Option, } @@ -256,7 +262,7 @@ impl DeltaLayer { return Ok(()); } - let inner = self.load(LayerAccessKind::Dump, ctx).await?; + let inner = self.load(ctx).await?; inner.dump(ctx).await } @@ -289,12 +295,7 @@ impl DeltaLayer { /// Open the underlying file and read the metadata into memory, if it's /// not loaded already. 
/// - async fn load( - &self, - access_kind: LayerAccessKind, - ctx: &RequestContext, - ) -> Result<&Arc> { - self.access_stats.record_access(access_kind, ctx); + async fn load(&self, ctx: &RequestContext) -> Result<&Arc> { // Quick exit if already loaded self.inner .get_or_try_init(|| self.load_inner(ctx)) @@ -302,12 +303,10 @@ impl DeltaLayer { .with_context(|| format!("Failed to load delta layer {}", self.path())) } - async fn load_inner(&self, ctx: &RequestContext) -> Result> { + async fn load_inner(&self, ctx: &RequestContext) -> anyhow::Result> { let path = self.path(); - let loaded = DeltaLayerInner::load(&path, None, None, ctx) - .await - .and_then(|res| res)?; + let loaded = DeltaLayerInner::load(&path, None, None, ctx).await?; // not production code let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap(); @@ -347,7 +346,6 @@ impl DeltaLayer { summary.lsn_range, metadata.len(), ), - access_stats: LayerAccessStats::empty_will_record_residence_event_later(), inner: OnceCell::new(), }) } @@ -370,7 +368,6 @@ impl DeltaLayer { /// 3. Call `finish`. /// struct DeltaLayerWriterInner { - conf: &'static PageServerConf, pub path: Utf8PathBuf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, @@ -381,6 +378,9 @@ struct DeltaLayerWriterInner { tree: DiskBtreeBuilder, blob_writer: BlobWriter, + + // Number of key-lsns in the layer. 
+ num_keys: usize, } impl DeltaLayerWriterInner { @@ -414,7 +414,6 @@ impl DeltaLayerWriterInner { let tree_builder = DiskBtreeBuilder::new(block_buf); Ok(Self { - conf, path, timeline_id, tenant_shard_id, @@ -422,6 +421,7 @@ impl DeltaLayerWriterInner { lsn_range, tree: tree_builder, blob_writer, + num_keys: 0, }) } @@ -438,23 +438,42 @@ impl DeltaLayerWriterInner { ctx: &RequestContext, ) -> anyhow::Result<()> { let (_, res) = self - .put_value_bytes(key, lsn, Value::ser(&val)?, val.will_init(), ctx) + .put_value_bytes( + key, + lsn, + Value::ser(&val)?.slice_len(), + val.will_init(), + ctx, + ) .await; res } - async fn put_value_bytes( + async fn put_value_bytes( &mut self, key: Key, lsn: Lsn, - val: Vec, + val: FullSlice, will_init: bool, ctx: &RequestContext, - ) -> (Vec, anyhow::Result<()>) { - assert!(self.lsn_range.start <= lsn); - let (val, res) = self.blob_writer.write_blob(val, ctx).await; + ) -> (FullSlice, anyhow::Result<()>) + where + Buf: IoBufMut + Send, + { + assert!( + self.lsn_range.start <= lsn, + "lsn_start={}, lsn={}", + self.lsn_range.start, + lsn + ); + // We don't want to use compression in delta layer creation + let compression = ImageCompressionAlgorithm::Disabled; + let (val, res) = self + .blob_writer + .write_blob_maybe_compressed(val, ctx, compression) + .await; let off = match res { - Ok(off) => off, + Ok((off, _)) => off, Err(e) => return (val, Err(anyhow::anyhow!(e))), }; @@ -462,6 +481,9 @@ impl DeltaLayerWriterInner { let delta_key = DeltaKey::from_key_lsn(&key, lsn); let res = self.tree.append(&delta_key.0, blob_ref.0); + + self.num_keys += 1; + (val, res.map_err(|e| anyhow::anyhow!(e))) } @@ -475,11 +497,10 @@ impl DeltaLayerWriterInner { async fn finish( self, key_end: Key, - timeline: &Arc, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { let temp_path = self.path.clone(); - let result = self.finish0(key_end, timeline, ctx).await; + let result = self.finish0(key_end, 
ctx).await; if result.is_err() { tracing::info!(%temp_path, "cleaning up temporary file after error during writing"); if let Err(e) = std::fs::remove_file(&temp_path) { @@ -492,9 +513,8 @@ impl DeltaLayerWriterInner { async fn finish0( self, key_end: Key, - timeline: &Arc, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; @@ -505,7 +525,7 @@ impl DeltaLayerWriterInner { file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64)) .await?; for buf in block_buf.blocks { - let (_buf, res) = file.write_all(buf, ctx).await; + let (_buf, res) = file.write_all(buf.slice_len(), ctx).await; res?; } assert!(self.lsn_range.start < self.lsn_range.end); @@ -525,7 +545,7 @@ impl DeltaLayerWriterInner { // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&summary, &mut buf)?; file.seek(SeekFrom::Start(0)).await?; - let (_buf, res) = file.write_all(buf, ctx).await; + let (_buf, res) = file.write_all(buf.slice_len(), ctx).await; res?; let metadata = file @@ -559,11 +579,9 @@ impl DeltaLayerWriterInner { // fsync the file file.sync_all().await?; - let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?; + trace!("created delta layer {}", self.path); - trace!("created delta layer {}", layer.local_path()); - - Ok(layer) + Ok((desc, self.path)) } } @@ -639,14 +657,17 @@ impl DeltaLayerWriter { .await } - pub async fn put_value_bytes( + pub async fn put_value_bytes( &mut self, key: Key, lsn: Lsn, - val: Vec, + val: FullSlice, will_init: bool, ctx: &RequestContext, - ) -> (Vec, anyhow::Result<()>) { + ) -> (FullSlice, anyhow::Result<()>) + where + Buf: IoBufMut + Send, + { self.inner .as_mut() .unwrap() @@ -664,14 +685,20 @@ impl DeltaLayerWriter { pub(crate) async fn finish( mut self, key_end: Key, - timeline: &Arc, ctx: &RequestContext, - ) -> anyhow::Result { - self.inner - .take() - 
.unwrap() - .finish(key_end, timeline, ctx) - .await + ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { + self.inner.take().unwrap().finish(key_end, ctx).await + } + + #[cfg(test)] + pub(crate) fn num_keys(&self) -> usize { + self.inner.as_ref().unwrap().num_keys + } + + #[cfg(test)] + pub(crate) fn estimated_size(&self) -> u64 { + let inner = self.inner.as_ref().unwrap(); + inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64 } } @@ -730,34 +757,39 @@ impl DeltaLayer { // TODO: could use smallvec here, but it's a pain with Slice Summary::ser_into(&new_summary, &mut buf).context("serialize")?; file.seek(SeekFrom::Start(0)).await?; - let (_buf, res) = file.write_all(buf, ctx).await; + let (_buf, res) = file.write_all(buf.slice_len(), ctx).await; res?; Ok(()) } } impl DeltaLayerInner { - /// Returns nested result following Result, Critical>: - /// - inner has the success or transient failure - /// - outer has the permanent failure + pub(crate) fn key_range(&self) -> &Range { + &self.layer_key_range + } + + pub(crate) fn lsn_range(&self) -> &Range { + &self.layer_lsn_range + } + pub(super) async fn load( path: &Utf8Path, summary: Option, max_vectored_read_bytes: Option, ctx: &RequestContext, - ) -> Result, anyhow::Error> { - let file = match VirtualFile::open(path, ctx).await { - Ok(file) => file, - Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))), - }; + ) -> anyhow::Result { + let file = VirtualFile::open(path, ctx) + .await + .context("open layer file")?; + let file_id = page_cache::next_file_id(); let block_reader = FileBlockReader::new(&file, file_id); - let summary_blk = match block_reader.read_blk(0, ctx).await { - Ok(blk) => blk, - Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))), - }; + let summary_blk = block_reader + .read_blk(0, ctx) + .await + .context("read first block")?; // TODO: this should be an assertion instead; see ImageLayerInner::load let actual_summary = @@ 
-779,102 +811,15 @@ impl DeltaLayerInner { } } - Ok(Ok(DeltaLayerInner { + Ok(DeltaLayerInner { file, file_id, index_start_blk: actual_summary.index_start_blk, index_root_blk: actual_summary.index_root_blk, max_vectored_read_bytes, - })) - } - - pub(super) async fn get_value_reconstruct_data( - &self, - key: Key, - lsn_range: Range, - reconstruct_state: &mut ValueReconstructState, - ctx: &RequestContext, - ) -> anyhow::Result { - let mut need_image = true; - // Scan the page versions backwards, starting from `lsn`. - let block_reader = FileBlockReader::new(&self.file, self.file_id); - let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( - self.index_start_blk, - self.index_root_blk, - &block_reader, - ); - let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1)); - - let mut offsets: Vec<(Lsn, u64)> = Vec::new(); - - tree_reader - .visit( - &search_key.0, - VisitDirection::Backwards, - |key, value| { - let blob_ref = BlobRef(value); - if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] { - return false; - } - let entry_lsn = DeltaKey::extract_lsn_from_buf(key); - if entry_lsn < lsn_range.start { - return false; - } - offsets.push((entry_lsn, blob_ref.pos())); - - !blob_ref.will_init() - }, - &RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::DeltaLayerBtreeNode) - .build(), - ) - .await?; - - let ctx = &RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::DeltaLayerValue) - .build(); - - // Ok, 'offsets' now contains the offsets of all the entries we need to read - let cursor = block_reader.block_cursor(); - let mut buf = Vec::new(); - for (entry_lsn, pos) in offsets { - cursor - .read_blob_into_buf(pos, &mut buf, ctx) - .await - .with_context(|| { - format!("Failed to read blob from virtual file {}", self.file.path) - })?; - let val = Value::des(&buf).with_context(|| { - format!( - "Failed to deserialize file blob from virtual file {}", - self.file.path - ) - })?; - match val { - Value::Image(img) => { - 
reconstruct_state.img = Some((entry_lsn, img)); - need_image = false; - break; - } - Value::WalRecord(rec) => { - let will_init = rec.will_init(); - reconstruct_state.records.push((entry_lsn, rec)); - if will_init { - // This WAL record initializes the page, so no need to go further back - need_image = false; - break; - } - } - } - } - - // If an older page image is needed to reconstruct the page, let the - // caller know. - if need_image { - Ok(ValueReconstructResult::Continue) - } else { - Ok(ValueReconstructResult::Complete) - } + layer_key_range: actual_summary.key_range, + layer_lsn_range: actual_summary.lsn_range, + }) } // Look up the keys in the provided keyspace and update @@ -928,7 +873,6 @@ impl DeltaLayerInner { } /// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future. - #[cfg(test)] pub(super) async fn load_key_values( &self, ctx: &RequestContext, @@ -941,7 +885,7 @@ impl DeltaLayerInner { ); let mut result = Vec::new(); let mut stream = - Box::pin(self.stream_index_forwards(&index_reader, &[0; DELTA_KEY_SIZE], ctx)); + Box::pin(self.stream_index_forwards(index_reader, &[0; DELTA_KEY_SIZE], ctx)); let block_reader = FileBlockReader::new(&self.file, self.file_id); let cursor = block_reader.block_cursor(); let mut buf = Vec::new(); @@ -976,7 +920,7 @@ impl DeltaLayerInner { ctx: &RequestContext, ) -> anyhow::Result> where - Reader: BlockReader, + Reader: BlockReader + Clone, { let ctx = RequestContextBuilder::extend(ctx) .page_content_kind(PageContentKind::DeltaLayerBtreeNode) @@ -986,7 +930,7 @@ impl DeltaLayerInner { let mut range_end_handled = false; let start_key = DeltaKey::from_key_lsn(&range.start, lsn_range.start); - let index_stream = index_reader.get_stream_from(&start_key.0, &ctx); + let index_stream = index_reader.clone().into_stream(&start_key.0, &ctx); let mut index_stream = std::pin::pin!(index_stream); while let Some(index_entry) = index_stream.next().await { @@ -1090,7 +1034,7 @@ impl 
DeltaLayerInner { for (_, blob_meta) in read.blobs_at.as_slice() { reconstruct_state.on_key_error( blob_meta.key, - PageReconstructError::from(anyhow!( + PageReconstructError::Other(anyhow!( "Failed to read blobs from virtual file {}: {}", self.file.path, kind @@ -1117,7 +1061,7 @@ impl DeltaLayerInner { Err(e) => { reconstruct_state.on_key_error( meta.meta.key, - PageReconstructError::from(anyhow!(e).context(format!( + PageReconstructError::Other(anyhow!(e).context(format!( "Failed to deserialize blob from virtual file {}", self.file.path, ))), @@ -1159,9 +1103,7 @@ impl DeltaLayerInner { let delta_key = DeltaKey::from_slice(key); let val_ref = ValueRef { blob_ref: BlobRef(value), - reader: BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter( - Adapter(self), - )), + layer: self, }; let pos = BlobRef(value).pos(); if let Some(last) = all_keys.last_mut() { @@ -1241,7 +1183,7 @@ impl DeltaLayerInner { block_reader, ); - let stream = self.stream_index_forwards(&tree_reader, &[0u8; DELTA_KEY_SIZE], ctx); + let stream = self.stream_index_forwards(tree_reader, &[0u8; DELTA_KEY_SIZE], ctx); let stream = stream.map_ok(|(key, lsn, pos)| Item::Actual(key, lsn, pos)); // put in a sentinel value for getting the end offset for last item, and not having to // repeat the whole read part @@ -1363,12 +1305,12 @@ impl DeltaLayerInner { .put_value_bytes( key, lsn, - std::mem::take(&mut per_blob_copy), + std::mem::take(&mut per_blob_copy).slice_len(), will_init, ctx, ) .await; - per_blob_copy = tmp; + per_blob_copy = tmp.into_raw_slice().into_inner(); res?; @@ -1405,7 +1347,7 @@ impl DeltaLayerInner { let keys = self.load_keys(ctx).await?; async fn dump_blob(val: &ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result { - let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?; + let buf = val.load_raw(ctx).await?; let val = Value::des(&buf)?; let desc = match val { Value::Image(img) => { @@ -1440,8 +1382,7 @@ impl DeltaLayerInner { use 
pageserver_api::key::CHECKPOINT_KEY; use postgres_ffi::CheckPoint; if key == CHECKPOINT_KEY { - let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?; - let val = Value::des(&buf)?; + let val = val.load(ctx).await?; match val { Value::Image(img) => { let checkpoint = CheckPoint::decode(&img)?; @@ -1459,17 +1400,17 @@ impl DeltaLayerInner { fn stream_index_forwards<'a, R>( &'a self, - reader: &'a DiskBtreeReader, + reader: DiskBtreeReader, start: &'a [u8; DELTA_KEY_SIZE], ctx: &'a RequestContext, ) -> impl futures::stream::Stream< Item = Result<(Key, Lsn, BlobRef), crate::tenant::disk_btree::DiskBtreeError>, > + 'a where - R: BlockReader, + R: BlockReader + 'a, { use futures::stream::TryStreamExt; - let stream = reader.get_stream_from(start, ctx); + let stream = reader.into_stream(start, ctx); stream.map_ok(|(key, value)| { let key = DeltaKey::from_slice(&key); let (key, lsn) = (key.key(), key.lsn()); @@ -1493,6 +1434,23 @@ impl DeltaLayerInner { ); offset } + + pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> DeltaLayerIterator<'a> { + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let tree_reader = + DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader); + DeltaLayerIterator { + delta_layer: self, + ctx, + index_iter: tree_reader.iter(&[0; DELTA_KEY_SIZE], ctx), + key_values_batch: std::collections::VecDeque::new(), + is_end: false, + planner: StreamingVectoredReadPlanner::new( + 1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer. + 1024, // The default value. 
Unit tests might use a different value + ), + } + } } /// A set of data associated with a delta layer key and its value @@ -1508,17 +1466,24 @@ pub struct DeltaEntry<'a> { /// Reference to an on-disk value pub struct ValueRef<'a> { blob_ref: BlobRef, - reader: BlockCursor<'a>, + layer: &'a DeltaLayerInner, } impl<'a> ValueRef<'a> { /// Loads the value from disk pub async fn load(&self, ctx: &RequestContext) -> Result { - // theoretically we *could* record an access time for each, but it does not really matter - let buf = self.reader.read_blob(self.blob_ref.pos(), ctx).await?; + let buf = self.load_raw(ctx).await?; let val = Value::des(&buf)?; Ok(val) } + + async fn load_raw(&self, ctx: &RequestContext) -> Result> { + let reader = BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(Adapter( + self.layer, + ))); + let buf = reader.read_blob(self.blob_ref.pos(), ctx).await?; + Ok(buf) + } } pub(crate) struct Adapter(T); @@ -1552,8 +1517,74 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del } } +pub struct DeltaLayerIterator<'a> { + delta_layer: &'a DeltaLayerInner, + ctx: &'a RequestContext, + planner: StreamingVectoredReadPlanner, + index_iter: DiskBtreeIterator<'a>, + key_values_batch: VecDeque<(Key, Lsn, Value)>, + is_end: bool, +} + +impl<'a> DeltaLayerIterator<'a> { + /// Retrieve a batch of key-value pairs into the iterator buffer. 
+ async fn next_batch(&mut self) -> anyhow::Result<()> { + assert!(self.key_values_batch.is_empty()); + assert!(!self.is_end); + + let plan = loop { + if let Some(res) = self.index_iter.next().await { + let (raw_key, value) = res?; + let key = Key::from_slice(&raw_key[..KEY_SIZE]); + let lsn = DeltaKey::extract_lsn_from_buf(&raw_key); + let blob_ref = BlobRef(value); + let offset = blob_ref.pos(); + if let Some(batch_plan) = self.planner.handle(key, lsn, offset) { + break batch_plan; + } + } else { + self.is_end = true; + let data_end_offset = self.delta_layer.index_start_offset(); + if let Some(item) = self.planner.handle_range_end(data_end_offset) { + break item; + } else { + return Ok(()); // TODO: test empty iterator + } + } + }; + let vectored_blob_reader = VectoredBlobReader::new(&self.delta_layer.file); + let mut next_batch = std::collections::VecDeque::new(); + let buf_size = plan.size(); + let buf = BytesMut::with_capacity(buf_size); + let blobs_buf = vectored_blob_reader + .read_blobs(&plan, buf, self.ctx) + .await?; + let frozen_buf = blobs_buf.buf.freeze(); + for meta in blobs_buf.blobs.iter() { + let value = Value::des(&frozen_buf[meta.start..meta.end])?; + next_batch.push_back((meta.meta.key, meta.meta.lsn, value)); + } + self.key_values_batch = next_batch; + Ok(()) + } + + pub async fn next(&mut self) -> anyhow::Result> { + if self.key_values_batch.is_empty() { + if self.is_end { + return Ok(None); + } + self.next_batch().await?; + } + Ok(Some( + self.key_values_batch + .pop_front() + .expect("should not be empty"), + )) + } +} + #[cfg(test)] -mod test { +pub(crate) mod test { use std::collections::BTreeMap; use itertools::MinMaxResult; @@ -1561,12 +1592,18 @@ mod test { use rand::RngCore; use super::*; + use crate::repository::Value; + use crate::tenant::harness::TIMELINE_ID; + use crate::tenant::storage_layer::{Layer, ResidentLayer}; + use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; + use crate::tenant::{Tenant, Timeline}; use 
crate::{ context::DownloadBehavior, task_mgr::TaskKind, tenant::{disk_btree::tests::TestDisk, harness::TenantHarness}, DEFAULT_PG_VERSION, }; + use bytes::Bytes; /// Construct an index for a fictional delta layer and and then /// traverse in order to plan vectored reads for a query. Finally, @@ -1819,7 +1856,7 @@ mod test { #[tokio::test] async fn test_delta_layer_vectored_read_end_to_end() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read")?; + let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read").await?; let (tenant, ctx) = harness.load().await; let timeline_id = TimelineId::generate(); @@ -1848,16 +1885,15 @@ mod test { for entry in entries { let (_, res) = writer - .put_value_bytes(entry.key, entry.lsn, entry.value, false, &ctx) + .put_value_bytes(entry.key, entry.lsn, entry.value.slice_len(), false, &ctx) .await; res?; } - let resident = writer - .finish(entries_meta.key_range.end, &timeline, &ctx) - .await?; + let (desc, path) = writer.finish(entries_meta.key_range.end, &ctx).await?; + let resident = Layer::finish_creating(harness.conf, &timeline, desc, &path)?; - let inner = resident.as_delta(&ctx).await?; + let inner = resident.get_as_delta(&ctx).await?; let file_size = inner.file.metadata().await?.len(); tracing::info!( @@ -1919,7 +1955,9 @@ mod test { use crate::walrecord::NeonWalRecord; use bytes::Bytes; - let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke").unwrap(); + let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke") + .await + .unwrap(); let (tenant, ctx) = h.load().await; let ctx = &ctx; let timeline = tenant @@ -1933,6 +1971,7 @@ mod test { .await .likely_resident_layers() .next() + .cloned() .unwrap(); { @@ -2007,7 +2046,8 @@ mod test { .read() .await .likely_resident_layers() - .find(|x| x != &initdb_layer) + .find(|&x| x != &initdb_layer) + .cloned() .unwrap(); // create a copy for the timeline, so we don't 
overwrite the file @@ -2042,13 +2082,14 @@ mod test { .await .unwrap(); - let copied_layer = writer.finish(Key::MAX, &branch, ctx).await.unwrap(); + let (desc, path) = writer.finish(Key::MAX, ctx).await.unwrap(); + let copied_layer = Layer::finish_creating(tenant.conf, &branch, desc, &path).unwrap(); - copied_layer.as_delta(ctx).await.unwrap(); + copied_layer.get_as_delta(ctx).await.unwrap(); assert_keys_and_values_eq( - new_layer.as_delta(ctx).await.unwrap(), - copied_layer.as_delta(ctx).await.unwrap(), + new_layer.get_as_delta(ctx).await.unwrap(), + copied_layer.get_as_delta(ctx).await.unwrap(), truncate_at, ctx, ) @@ -2073,7 +2114,7 @@ mod test { source.index_root_blk, &source_reader, ); - let source_stream = source.stream_index_forwards(&source_tree, &start_key, ctx); + let source_stream = source.stream_index_forwards(source_tree, &start_key, ctx); let source_stream = source_stream.filter(|res| match res { Ok((_, lsn, _)) => ready(lsn < &truncated_at), _ => ready(true), @@ -2086,7 +2127,7 @@ mod test { truncated.index_root_blk, &truncated_reader, ); - let truncated_stream = truncated.stream_index_forwards(&truncated_tree, &start_key, ctx); + let truncated_stream = truncated.stream_index_forwards(truncated_tree, &start_key, ctx); let mut truncated_stream = std::pin::pin!(truncated_stream); let mut scratch_left = Vec::new(); @@ -2127,4 +2168,131 @@ mod test { assert_eq!(utils::Hex(&scratch_left), utils::Hex(&scratch_right)); } } + + pub(crate) fn sort_delta( + (k1, l1, _): &(Key, Lsn, Value), + (k2, l2, _): &(Key, Lsn, Value), + ) -> std::cmp::Ordering { + (k1, l1).cmp(&(k2, l2)) + } + + pub(crate) fn sort_delta_value( + (k1, l1, v1): &(Key, Lsn, Value), + (k2, l2, v2): &(Key, Lsn, Value), + ) -> std::cmp::Ordering { + let order_1 = if v1.is_image() { 0 } else { 1 }; + let order_2 = if v2.is_image() { 0 } else { 1 }; + (k1, l1, order_1).cmp(&(k2, l2, order_2)) + } + + pub(crate) async fn produce_delta_layer( + tenant: &Tenant, + tline: &Arc, + mut deltas: 
Vec<(Key, Lsn, Value)>, + ctx: &RequestContext, + ) -> anyhow::Result { + deltas.sort_by(sort_delta); + let (key_start, _, _) = deltas.first().unwrap(); + let (key_max, _, _) = deltas.last().unwrap(); + let lsn_min = deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap(); + let lsn_max = deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap(); + let lsn_end = Lsn(lsn_max.0 + 1); + let mut writer = DeltaLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + *key_start, + (*lsn_min)..lsn_end, + ctx, + ) + .await?; + let key_end = key_max.next(); + + for (key, lsn, value) in deltas { + writer.put_value(key, lsn, value, ctx).await?; + } + + let (desc, path) = writer.finish(key_end, ctx).await?; + let delta_layer = Layer::finish_creating(tenant.conf, tline, desc, &path)?; + + Ok::<_, anyhow::Error>(delta_layer) + } + + async fn assert_delta_iter_equal( + delta_iter: &mut DeltaLayerIterator<'_>, + expect: &[(Key, Lsn, Value)], + ) { + let mut expect_iter = expect.iter(); + loop { + let o1 = delta_iter.next().await.unwrap(); + let o2 = expect_iter.next(); + assert_eq!(o1.is_some(), o2.is_some()); + if o1.is_none() && o2.is_none() { + break; + } + let (k1, l1, v1) = o1.unwrap(); + let (k2, l2, v2) = o2.unwrap(); + assert_eq!(&k1, k2); + assert_eq!(l1, *l2); + assert_eq!(&v1, v2); + } + } + + #[tokio::test] + async fn delta_layer_iterator() { + let harness = TenantHarness::create("delta_layer_iterator").await.unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + const N: usize = 1000; + let test_deltas = (0..N) + .map(|idx| { + ( + get_key(idx as u32 / 10), + Lsn(0x10 * ((idx as u64) % 10 + 1)), + Value::Image(Bytes::from(format!("img{idx:05}"))), + ) + }) + .collect_vec(); + let resident_layer = 
produce_delta_layer(&tenant, &tline, test_deltas.clone(), &ctx) + .await + .unwrap(); + let delta_layer = resident_layer.get_as_delta(&ctx).await.unwrap(); + for max_read_size in [1, 1024] { + for batch_size in [1, 2, 4, 8, 3, 7, 13] { + println!("running with batch_size={batch_size} max_read_size={max_read_size}"); + // Test if the batch size is correctly determined + let mut iter = delta_layer.iter(&ctx); + iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size); + let mut num_items = 0; + for _ in 0..3 { + iter.next_batch().await.unwrap(); + num_items += iter.key_values_batch.len(); + if max_read_size == 1 { + // every key should be a batch b/c the value is larger than max_read_size + assert_eq!(iter.key_values_batch.len(), 1); + } else { + assert_eq!(iter.key_values_batch.len(), batch_size); + } + if num_items >= N { + break; + } + iter.key_values_batch.clear(); + } + // Test if the result is correct + let mut iter = delta_layer.iter(&ctx); + iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size); + assert_delta_iter_equal(&mut iter, &test_deltas).await; + } + } + } } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 06e2f09384..9a19e4e2c7 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -29,15 +29,16 @@ use crate::page_cache::{self, FileId, PAGE_SZ}; use crate::repository::{Key, Value, KEY_SIZE}; use crate::tenant::blob_io::BlobWriter; use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader}; -use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; -use crate::tenant::storage_layer::{ - LayerAccessStats, ValueReconstructResult, ValueReconstructState, +use crate::tenant::disk_btree::{ + DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection, }; use crate::tenant::timeline::GetVectoredError; use 
crate::tenant::vectored_blob_io::{ - BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner, + BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, + VectoredReadPlanner, }; use crate::tenant::{PageReconstructError, Timeline}; +use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; use crate::virtual_file::{self, VirtualFile}; use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; use anyhow::{anyhow, bail, ensure, Context, Result}; @@ -46,10 +47,10 @@ use camino::{Utf8Path, Utf8PathBuf}; use hex; use itertools::Itertools; use pageserver_api::keyspace::KeySpace; -use pageserver_api::models::LayerAccessKind; use pageserver_api::shard::{ShardIdentity, TenantShardId}; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; +use std::collections::VecDeque; use std::fs::File; use std::io::SeekFrom; use std::ops::Range; @@ -134,7 +135,6 @@ pub struct ImageLayer { pub desc: PersistentLayerDesc, // This entry contains an image of all pages as of this LSN, should be the same as desc.lsn pub lsn: Lsn, - access_stats: LayerAccessStats, inner: OnceCell, } @@ -224,7 +224,7 @@ impl ImageLayer { return Ok(()); } - let inner = self.load(LayerAccessKind::Dump, ctx).await?; + let inner = self.load(ctx).await?; inner.dump(ctx).await?; @@ -251,12 +251,7 @@ impl ImageLayer { /// Open the underlying file and read the metadata into memory, if it's /// not loaded already. 
/// - async fn load( - &self, - access_kind: LayerAccessKind, - ctx: &RequestContext, - ) -> Result<&ImageLayerInner> { - self.access_stats.record_access(access_kind, ctx); + async fn load(&self, ctx: &RequestContext) -> Result<&ImageLayerInner> { self.inner .get_or_try_init(|| self.load_inner(ctx)) .await @@ -266,9 +261,8 @@ impl ImageLayer { async fn load_inner(&self, ctx: &RequestContext) -> Result { let path = self.path(); - let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx) - .await - .and_then(|res| res)?; + let loaded = + ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx).await?; // not production code let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap(); @@ -308,7 +302,6 @@ impl ImageLayer { metadata.len(), ), // Now we assume image layer ALWAYS covers the full range. This may change in the future. lsn: summary.lsn, - access_stats: LayerAccessStats::empty_will_record_residence_event_later(), inner: OnceCell::new(), }) } @@ -362,33 +355,37 @@ impl ImageLayer { // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&new_summary, &mut buf).context("serialize")?; file.seek(SeekFrom::Start(0)).await?; - let (_buf, res) = file.write_all(buf, ctx).await; + let (_buf, res) = file.write_all(buf.slice_len(), ctx).await; res?; Ok(()) } } impl ImageLayerInner { - /// Returns nested result following Result, Critical>: - /// - inner has the success or transient failure - /// - outer has the permanent failure + pub(crate) fn key_range(&self) -> &Range { + &self.key_range + } + + pub(crate) fn lsn(&self) -> Lsn { + self.lsn + } + pub(super) async fn load( path: &Utf8Path, lsn: Lsn, summary: Option, max_vectored_read_bytes: Option, ctx: &RequestContext, - ) -> Result, anyhow::Error> { - let file = match VirtualFile::open(path, ctx).await { - Ok(file) => file, - Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))), - }; + ) -> anyhow::Result { 
+ let file = VirtualFile::open(path, ctx) + .await + .context("open layer file")?; let file_id = page_cache::next_file_id(); let block_reader = FileBlockReader::new(&file, file_id); - let summary_blk = match block_reader.read_blk(0, ctx).await { - Ok(blk) => blk, - Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))), - }; + let summary_blk = block_reader + .read_blk(0, ctx) + .await + .context("read first block")?; // length is the only way how this could fail, so it's not actually likely at all unless // read_blk returns wrong sized block. @@ -413,7 +410,7 @@ impl ImageLayerInner { } } - Ok(Ok(ImageLayerInner { + Ok(ImageLayerInner { index_start_blk: actual_summary.index_start_blk, index_root_blk: actual_summary.index_root_blk, lsn, @@ -421,47 +418,7 @@ impl ImageLayerInner { file_id, max_vectored_read_bytes, key_range: actual_summary.key_range, - })) - } - - pub(super) async fn get_value_reconstruct_data( - &self, - key: Key, - reconstruct_state: &mut ValueReconstructState, - ctx: &RequestContext, - ) -> anyhow::Result { - let block_reader = FileBlockReader::new(&self.file, self.file_id); - let tree_reader = - DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader); - - let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; - key.write_to_byte_slice(&mut keybuf); - if let Some(offset) = tree_reader - .get( - &keybuf, - &RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::ImageLayerBtreeNode) - .build(), - ) - .await? 
- { - let blob = block_reader - .block_cursor() - .read_blob( - offset, - &RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::ImageLayerValue) - .build(), - ) - .await - .with_context(|| format!("failed to read value from offset {}", offset))?; - let value = Bytes::from(blob); - - reconstruct_state.img = Some((self.lsn, value)); - Ok(ValueReconstructResult::Complete) - } else { - Ok(ValueReconstructResult::Missing) - } + }) } // Look up the keys in the provided keyspace and update @@ -486,7 +443,6 @@ impl ImageLayerInner { } /// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future. - #[cfg(test)] pub(super) async fn load_key_values( &self, ctx: &RequestContext, @@ -495,7 +451,7 @@ impl ImageLayerInner { let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader); let mut result = Vec::new(); - let mut stream = Box::pin(tree_reader.get_stream_from(&[0; KEY_SIZE], ctx)); + let mut stream = Box::pin(tree_reader.into_stream(&[0; KEY_SIZE], ctx)); let block_reader = FileBlockReader::new(&self.file, self.file_id); let cursor = block_reader.block_cursor(); while let Some(item) = stream.next().await { @@ -544,7 +500,7 @@ impl ImageLayerInner { let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; range.start.write_to_byte_slice(&mut search_key); - let index_stream = tree_reader.get_stream_from(&search_key, &ctx); + let index_stream = tree_reader.clone().into_stream(&search_key, &ctx); let mut index_stream = std::pin::pin!(index_stream); while let Some(index_entry) = index_stream.next().await { @@ -689,6 +645,23 @@ impl ImageLayerInner { }; } } + + pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> ImageLayerIterator<'a> { + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let tree_reader = + DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader); + ImageLayerIterator { + image_layer: self, + ctx, + index_iter: 
tree_reader.iter(&[0; KEY_SIZE], ctx), + key_values_batch: VecDeque::new(), + is_end: false, + planner: StreamingVectoredReadPlanner::new( + 1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer. + 1024, // The default value. Unit tests might use a different value + ), + } + } } /// A builder object for constructing a new image layer. @@ -710,11 +683,32 @@ struct ImageLayerWriterInner { key_range: Range, lsn: Lsn, + // Total uncompressed bytes passed into put_image + uncompressed_bytes: u64, + + // Like `uncompressed_bytes`, + // but only of images we might consider for compression + uncompressed_bytes_eligible: u64, + + // Like `uncompressed_bytes`, but only of images + // where we have chosen their compressed form + uncompressed_bytes_chosen: u64, + + // Number of keys in the layer. + num_keys: usize, + blob_writer: BlobWriter, tree: DiskBtreeBuilder, + + #[cfg_attr(not(feature = "testing"), allow(dead_code))] + last_written_key: Key, } impl ImageLayerWriterInner { + fn size(&self) -> u64 { + self.tree.borrow_writer().size() + self.blob_writer.size() + } + /// /// Start building a new image layer. 
/// @@ -765,6 +759,11 @@ impl ImageLayerWriterInner { lsn, tree: tree_builder, blob_writer, + uncompressed_bytes: 0, + uncompressed_bytes_eligible: 0, + uncompressed_bytes_chosen: 0, + num_keys: 0, + last_written_key: Key::MIN, }; Ok(writer) @@ -782,14 +781,34 @@ impl ImageLayerWriterInner { ctx: &RequestContext, ) -> anyhow::Result<()> { ensure!(self.key_range.contains(&key)); - let (_img, res) = self.blob_writer.write_blob(img, ctx).await; + let compression = self.conf.image_compression; + let uncompressed_len = img.len() as u64; + self.uncompressed_bytes += uncompressed_len; + self.num_keys += 1; + let (_img, res) = self + .blob_writer + .write_blob_maybe_compressed(img.slice_len(), ctx, compression) + .await; // TODO: re-use the buffer for `img` further upstack - let off = res?; + let (off, compression_info) = res?; + if compression_info.compressed_size.is_some() { + // The image has been considered for compression at least + self.uncompressed_bytes_eligible += uncompressed_len; + } + if compression_info.written_compressed { + // The image has been compressed + self.uncompressed_bytes_chosen += uncompressed_len; + } let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; key.write_to_byte_slice(&mut keybuf); self.tree.append(&keybuf, off)?; + #[cfg(feature = "testing")] + { + self.last_written_key = key; + } + Ok(()) } @@ -800,10 +819,19 @@ impl ImageLayerWriterInner { self, timeline: &Arc, ctx: &RequestContext, + end_key: Option, ) -> anyhow::Result { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; + // Calculate compression ratio + let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header + crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES.inc_by(self.uncompressed_bytes); + crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED + .inc_by(self.uncompressed_bytes_eligible); + crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN.inc_by(self.uncompressed_bytes_chosen); + 
crate::metrics::COMPRESSION_IMAGE_OUTPUT_BYTES.inc_by(compressed_size); + let mut file = self.blob_writer.into_inner(); // Write out the index @@ -811,7 +839,7 @@ impl ImageLayerWriterInner { .await?; let (index_root_blk, block_buf) = self.tree.finish()?; for buf in block_buf.blocks { - let (_buf, res) = file.write_all(buf, ctx).await; + let (_buf, res) = file.write_all(buf.slice_len(), ctx).await; res?; } @@ -831,7 +859,7 @@ impl ImageLayerWriterInner { // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&summary, &mut buf)?; file.seek(SeekFrom::Start(0)).await?; - let (_buf, res) = file.write_all(buf, ctx).await; + let (_buf, res) = file.write_all(buf.slice_len(), ctx).await; res?; let metadata = file @@ -842,11 +870,23 @@ impl ImageLayerWriterInner { let desc = PersistentLayerDesc::new_img( self.tenant_shard_id, self.timeline_id, - self.key_range.clone(), + if let Some(end_key) = end_key { + self.key_range.start..end_key + } else { + self.key_range.clone() + }, self.lsn, metadata.len(), ); + #[cfg(feature = "testing")] + if let Some(end_key) = end_key { + assert!( + self.last_written_key < end_key, + "written key violates end_key range" + ); + } + // Note: Because we open the file in write-only mode, we cannot // reuse the same VirtualFile for reading later. That's why we don't // set inner.file here. The first read will have to re-open it. @@ -923,6 +963,18 @@ impl ImageLayerWriter { self.inner.as_mut().unwrap().put_image(key, img, ctx).await } + #[cfg(test)] + /// Estimated size of the image layer. + pub(crate) fn estimated_size(&self) -> u64 { + let inner = self.inner.as_ref().unwrap(); + inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64 + } + + #[cfg(test)] + pub(crate) fn num_keys(&self) -> usize { + self.inner.as_ref().unwrap().num_keys + } + /// /// Finish writing the image layer. 
/// @@ -931,7 +983,26 @@ impl ImageLayerWriter { timeline: &Arc, ctx: &RequestContext, ) -> anyhow::Result { - self.inner.take().unwrap().finish(timeline, ctx).await + self.inner.take().unwrap().finish(timeline, ctx, None).await + } + + #[cfg(test)] + /// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive. + pub(super) async fn finish_with_end_key( + mut self, + timeline: &Arc, + end_key: Key, + ctx: &RequestContext, + ) -> anyhow::Result { + self.inner + .take() + .unwrap() + .finish(timeline, ctx, Some(end_key)) + .await + } + + pub(crate) fn size(&self) -> u64 { + self.inner.as_ref().unwrap().size() } } @@ -943,11 +1014,78 @@ impl Drop for ImageLayerWriter { } } +pub struct ImageLayerIterator<'a> { + image_layer: &'a ImageLayerInner, + ctx: &'a RequestContext, + planner: StreamingVectoredReadPlanner, + index_iter: DiskBtreeIterator<'a>, + key_values_batch: VecDeque<(Key, Lsn, Value)>, + is_end: bool, +} + +impl<'a> ImageLayerIterator<'a> { + /// Retrieve a batch of key-value pairs into the iterator buffer. 
+ async fn next_batch(&mut self) -> anyhow::Result<()> { + assert!(self.key_values_batch.is_empty()); + assert!(!self.is_end); + + let plan = loop { + if let Some(res) = self.index_iter.next().await { + let (raw_key, offset) = res?; + if let Some(batch_plan) = self.planner.handle( + Key::from_slice(&raw_key[..KEY_SIZE]), + self.image_layer.lsn, + offset, + ) { + break batch_plan; + } + } else { + self.is_end = true; + let payload_end = self.image_layer.index_start_blk as u64 * PAGE_SZ as u64; + if let Some(item) = self.planner.handle_range_end(payload_end) { + break item; + } else { + return Ok(()); // TODO: a test case on empty iterator + } + } + }; + let vectored_blob_reader = VectoredBlobReader::new(&self.image_layer.file); + let mut next_batch = std::collections::VecDeque::new(); + let buf_size = plan.size(); + let buf = BytesMut::with_capacity(buf_size); + let blobs_buf = vectored_blob_reader + .read_blobs(&plan, buf, self.ctx) + .await?; + let frozen_buf: Bytes = blobs_buf.buf.freeze(); + for meta in blobs_buf.blobs.iter() { + let img_buf = frozen_buf.slice(meta.start..meta.end); + next_batch.push_back((meta.meta.key, self.image_layer.lsn, Value::Image(img_buf))); + } + self.key_values_batch = next_batch; + Ok(()) + } + + pub async fn next(&mut self) -> anyhow::Result> { + if self.key_values_batch.is_empty() { + if self.is_end { + return Ok(None); + } + self.next_batch().await?; + } + Ok(Some( + self.key_values_batch + .pop_front() + .expect("should not be empty"), + )) + } +} + #[cfg(test)] mod test { - use std::time::Duration; + use std::{sync::Arc, time::Duration}; use bytes::Bytes; + use itertools::Itertools; use pageserver_api::{ key::Key, shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}, @@ -959,11 +1097,19 @@ mod test { }; use crate::{ - tenant::{config::TenantConf, harness::TenantHarness}, + context::RequestContext, + repository::Value, + tenant::{ + config::TenantConf, + harness::{TenantHarness, TIMELINE_ID}, + 
storage_layer::ResidentLayer, + vectored_blob_io::StreamingVectoredReadPlanner, + Tenant, Timeline, + }, DEFAULT_PG_VERSION, }; - use super::ImageLayerWriter; + use super::{ImageLayerIterator, ImageLayerWriter}; #[tokio::test] async fn image_layer_rewrite() { @@ -994,6 +1140,7 @@ mod test { ShardIdentity::unsharded(), get_next_gen(), ) + .await .unwrap(); let (tenant, ctx) = harness.load().await; let timeline = tenant @@ -1060,6 +1207,7 @@ mod test { // But here, all we care about is that the gen number is unique. get_next_gen(), ) + .await .unwrap(); let (tenant, ctx) = harness.load().await; let timeline = tenant @@ -1134,4 +1282,111 @@ mod test { } } } + + async fn produce_image_layer( + tenant: &Tenant, + tline: &Arc, + mut images: Vec<(Key, Bytes)>, + lsn: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result { + images.sort(); + let (key_start, _) = images.first().unwrap(); + let (key_last, _) = images.last().unwrap(); + let key_end = key_last.next(); + let key_range = *key_start..key_end; + let mut writer = ImageLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + &key_range, + lsn, + ctx, + ) + .await?; + + for (key, img) in images { + writer.put_image(key, img, ctx).await?; + } + let img_layer = writer.finish(tline, ctx).await?; + + Ok::<_, anyhow::Error>(img_layer) + } + + async fn assert_img_iter_equal( + img_iter: &mut ImageLayerIterator<'_>, + expect: &[(Key, Bytes)], + expect_lsn: Lsn, + ) { + let mut expect_iter = expect.iter(); + loop { + let o1 = img_iter.next().await.unwrap(); + let o2 = expect_iter.next(); + match (o1, o2) { + (None, None) => break, + (Some((k1, l1, v1)), Some((k2, i2))) => { + let Value::Image(i1) = v1 else { + panic!("expect Value::Image") + }; + assert_eq!(&k1, k2); + assert_eq!(l1, expect_lsn); + assert_eq!(&i1, i2); + } + (o1, o2) => panic!("iterators length mismatch: {:?}, {:?}", o1, o2), + } + } + } + + #[tokio::test] + async fn image_layer_iterator() { + let harness = 
TenantHarness::create("image_layer_iterator").await.unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + const N: usize = 1000; + let test_imgs = (0..N) + .map(|idx| (get_key(idx as u32), Bytes::from(format!("img{idx:05}")))) + .collect_vec(); + let resident_layer = + produce_image_layer(&tenant, &tline, test_imgs.clone(), Lsn(0x10), &ctx) + .await + .unwrap(); + let img_layer = resident_layer.get_as_image(&ctx).await.unwrap(); + for max_read_size in [1, 1024] { + for batch_size in [1, 2, 4, 8, 3, 7, 13] { + println!("running with batch_size={batch_size} max_read_size={max_read_size}"); + // Test if the batch size is correctly determined + let mut iter = img_layer.iter(&ctx); + iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size); + let mut num_items = 0; + for _ in 0..3 { + iter.next_batch().await.unwrap(); + num_items += iter.key_values_batch.len(); + if max_read_size == 1 { + // every key should be a batch b/c the value is larger than max_read_size + assert_eq!(iter.key_values_batch.len(), 1); + } else { + assert_eq!(iter.key_values_batch.len(), batch_size); + } + if num_items >= N { + break; + } + iter.key_values_batch.clear(); + } + // Test if the result is correct + let mut iter = img_layer.iter(&ctx); + iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size); + assert_img_iter_equal(&mut iter, &test_imgs, Lsn(0x10)).await; + } + } + } } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 1ecc56ce99..a71b4dd83b 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -6,18 +6,21 @@ //! 
use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; +use crate::page_cache::PAGE_SZ; use crate::repository::{Key, Value}; -use crate::tenant::block_io::BlockReader; +use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef}; use crate::tenant::ephemeral_file::EphemeralFile; -use crate::tenant::storage_layer::ValueReconstructResult; use crate::tenant::timeline::GetVectoredError; -use crate::tenant::{PageReconstructError, Timeline}; -use crate::{page_cache, walrecord}; -use anyhow::{anyhow, ensure, Result}; +use crate::tenant::PageReconstructError; +use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; +use crate::{l0_flush, page_cache}; +use anyhow::{anyhow, Result}; +use camino::Utf8PathBuf; +use pageserver_api::key::CompactKey; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; -use std::collections::{BTreeMap, BinaryHeap, HashSet}; +use std::collections::BTreeMap; use std::sync::{Arc, OnceLock}; use std::time::Instant; use tracing::*; @@ -30,11 +33,10 @@ use std::fmt::Write; use std::ops::Range; use std::sync::atomic::Ordering as AtomicOrdering; use std::sync::atomic::{AtomicU64, AtomicUsize}; -use tokio::sync::{RwLock, RwLockWriteGuard}; +use tokio::sync::RwLock; use super::{ - DeltaLayerWriter, ResidentLayer, ValueReconstructSituation, ValueReconstructState, - ValuesReconstructState, + DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState, }; #[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)] @@ -54,9 +56,6 @@ pub struct InMemoryLayer { /// Writes are only allowed when this is `None`. pub(crate) end_lsn: OnceLock, - /// Used for traversal path. Cached representation of the in-memory layer before frozen. - local_path_str: Arc, - /// Used for traversal path. Cached representation of the in-memory layer after frozen. 
frozen_local_path_str: OnceLock>, @@ -81,7 +80,7 @@ pub struct InMemoryLayerInner { /// All versions of all pages in the layer are kept here. Indexed /// by block number and LSN. The value is an offset into the /// ephemeral file where the page version is stored. - index: BTreeMap>, + index: BTreeMap>, /// The values are stored in a serialized format in this file. /// Each serialized Value is preceded by a 'u32' length field. @@ -247,18 +246,10 @@ impl InMemoryLayer { self.start_lsn..self.end_lsn_or_max() } - pub(crate) fn local_path_str(&self) -> &Arc { - self.frozen_local_path_str - .get() - .unwrap_or(&self.local_path_str) - } - /// debugging function to print out the contents of the layer /// /// this is likely completly unused - pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> { - let inner = self.inner.read().await; - + pub async fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> { let end_str = self.end_lsn_or_max(); println!( @@ -266,96 +257,9 @@ impl InMemoryLayer { self.timeline_id, self.start_lsn, end_str, ); - if !verbose { - return Ok(()); - } - - let cursor = inner.file.block_cursor(); - let mut buf = Vec::new(); - for (key, vec_map) in inner.index.iter() { - for (lsn, pos) in vec_map.as_slice() { - let mut desc = String::new(); - cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?; - let val = Value::des(&buf); - match val { - Ok(Value::Image(img)) => { - write!(&mut desc, " img {} bytes", img.len())?; - } - Ok(Value::WalRecord(rec)) => { - let wal_desc = walrecord::describe_wal_record(&rec).unwrap(); - write!( - &mut desc, - " rec {} bytes will_init: {} {}", - buf.len(), - rec.will_init(), - wal_desc - )?; - } - Err(err) => { - write!(&mut desc, " DESERIALIZATION ERROR: {}", err)?; - } - } - println!(" key {} at {}: {}", key, lsn, desc); - } - } - Ok(()) } - /// Look up given value in the layer. 
- pub(crate) async fn get_value_reconstruct_data( - &self, - key: Key, - lsn_range: Range, - reconstruct_state: &mut ValueReconstructState, - ctx: &RequestContext, - ) -> anyhow::Result { - ensure!(lsn_range.start >= self.start_lsn); - let mut need_image = true; - - let ctx = RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::InMemoryLayer) - .build(); - - let inner = self.inner.read().await; - - let reader = inner.file.block_cursor(); - - // Scan the page versions backwards, starting from `lsn`. - if let Some(vec_map) = inner.index.get(&key) { - let slice = vec_map.slice_range(lsn_range); - for (entry_lsn, pos) in slice.iter().rev() { - let buf = reader.read_blob(*pos, &ctx).await?; - let value = Value::des(&buf)?; - match value { - Value::Image(img) => { - reconstruct_state.img = Some((*entry_lsn, img)); - return Ok(ValueReconstructResult::Complete); - } - Value::WalRecord(rec) => { - let will_init = rec.will_init(); - reconstruct_state.records.push((*entry_lsn, rec)); - if will_init { - // This WAL record initializes the page, so no need to go further back - need_image = false; - break; - } - } - } - } - } - - // release lock on 'inner' - - // If an older page image is needed to reconstruct the page, let the - // caller know. - if need_image { - Ok(ValueReconstructResult::Continue) - } else { - Ok(ValueReconstructResult::Complete) - } - } - // Look up the keys in the provided keyspace and update // the reconstruct state with whatever is found. 
// @@ -374,71 +278,124 @@ impl InMemoryLayer { let inner = self.inner.read().await; let reader = inner.file.block_cursor(); - #[derive(Eq, PartialEq, Ord, PartialOrd)] - struct BlockRead { - key: Key, - lsn: Lsn, - block_offset: u64, - } - - let mut planned_block_reads = BinaryHeap::new(); - for range in keyspace.ranges.iter() { - for (key, vec_map) in inner.index.range(range.start..range.end) { - let lsn_range = match reconstruct_state.get_cached_lsn(key) { + for (key, vec_map) in inner + .index + .range(range.start.to_compact()..range.end.to_compact()) + { + let key = Key::from_compact(*key); + let lsn_range = match reconstruct_state.get_cached_lsn(&key) { Some(cached_lsn) => (cached_lsn + 1)..end_lsn, None => self.start_lsn..end_lsn, }; let slice = vec_map.slice_range(lsn_range); + for (entry_lsn, pos) in slice.iter().rev() { - planned_block_reads.push(BlockRead { - key: *key, - lsn: *entry_lsn, - block_offset: *pos, - }); + // TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183 + let buf = reader.read_blob(*pos, &ctx).await; + if let Err(e) = buf { + reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e))); + break; + } + + let value = Value::des(&buf.unwrap()); + if let Err(e) = value { + reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e))); + break; + } + + let key_situation = + reconstruct_state.update_key(&key, *entry_lsn, value.unwrap()); + if key_situation == ValueReconstructSituation::Complete { + break; + } } } } - let keyspace_size = keyspace.total_raw_size(); - - let mut completed_keys = HashSet::new(); - while completed_keys.len() < keyspace_size && !planned_block_reads.is_empty() { - let block_read = planned_block_reads.pop().unwrap(); - if completed_keys.contains(&block_read.key) { - continue; - } - - let buf = reader.read_blob(block_read.block_offset, &ctx).await; - if let Err(e) = buf { - reconstruct_state - .on_key_error(block_read.key, 
PageReconstructError::from(anyhow!(e))); - completed_keys.insert(block_read.key); - continue; - } - - let value = Value::des(&buf.unwrap()); - if let Err(e) = value { - reconstruct_state - .on_key_error(block_read.key, PageReconstructError::from(anyhow!(e))); - completed_keys.insert(block_read.key); - continue; - } - - let key_situation = - reconstruct_state.update_key(&block_read.key, block_read.lsn, value.unwrap()); - if key_situation == ValueReconstructSituation::Complete { - completed_keys.insert(block_read.key); - } - } - reconstruct_state.on_lsn_advanced(&keyspace, self.start_lsn); Ok(()) } } +/// Offset of a particular Value within a serialized batch. +struct SerializedBatchOffset { + key: CompactKey, + lsn: Lsn, + /// offset in bytes from the start of the batch's buffer to the Value's serialized size header. + offset: u64, +} + +pub struct SerializedBatch { + /// Blobs serialized in EphemeralFile's native format, ready for passing to [`EphemeralFile::write_raw`]. + pub(crate) raw: Vec, + + /// Index of values in [`Self::raw`], using offsets relative to the start of the buffer. + offsets: Vec, + + /// The highest LSN of any value in the batch + pub(crate) max_lsn: Lsn, +} + +impl SerializedBatch { + /// Write a blob length in the internal format of the EphemeralFile + pub(crate) fn write_blob_length(len: usize, cursor: &mut std::io::Cursor>) { + use std::io::Write; + + if len < 0x80 { + // short one-byte length header + let len_buf = [len as u8]; + + cursor + .write_all(&len_buf) + .expect("Writing to Vec is infallible"); + } else { + let mut len_buf = u32::to_be_bytes(len as u32); + len_buf[0] |= 0x80; + cursor + .write_all(&len_buf) + .expect("Writing to Vec is infallible"); + } + } + + pub fn from_values(batch: Vec<(CompactKey, Lsn, usize, Value)>) -> Self { + // Pre-allocate a big flat buffer to write into. 
This should be large but not huge: it is soft-limited in practice by + // [`crate::pgdatadir_mapping::DatadirModification::MAX_PENDING_BYTES`] + let buffer_size = batch.iter().map(|i| i.2).sum::() + 4 * batch.len(); + let mut cursor = std::io::Cursor::new(Vec::::with_capacity(buffer_size)); + + let mut offsets: Vec = Vec::with_capacity(batch.len()); + let mut max_lsn: Lsn = Lsn(0); + for (key, lsn, val_ser_size, val) in batch { + let relative_off = cursor.position(); + + Self::write_blob_length(val_ser_size, &mut cursor); + val.ser_into(&mut cursor) + .expect("Writing into in-memory buffer is infallible"); + + offsets.push(SerializedBatchOffset { + key, + lsn, + offset: relative_off, + }); + max_lsn = std::cmp::max(max_lsn, lsn); + } + + let buffer = cursor.into_inner(); + + // Assert that we didn't do any extra allocations while building buffer. + debug_assert!(buffer.len() <= buffer_size); + + Self { + raw: buffer, + offsets, + max_lsn, + } + } +} + fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result { write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0) } @@ -473,20 +430,17 @@ impl InMemoryLayer { timeline_id: TimelineId, tenant_shard_id: TenantShardId, start_lsn: Lsn, + gate_guard: utils::sync::gate::GateGuard, ctx: &RequestContext, ) -> Result { trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); - let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, ctx).await?; + let file = + EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate_guard, ctx).await?; let key = InMemoryLayerFileId(file.page_cache_file_id()); Ok(InMemoryLayer { file_id: key, - local_path_str: { - let mut buf = String::new(); - inmem_layer_log_display(&mut buf, timeline_id, start_lsn, Lsn::MAX).unwrap(); - buf.into() - }, frozen_local_path_str: OnceLock::new(), conf, timeline_id, @@ -502,38 +456,20 @@ impl InMemoryLayer { }) } - // Write operations - - /// Common subroutine of 
the public put_wal_record() and put_page_image() functions. - /// Adds the page version to the in-memory tree - - pub(crate) async fn put_value( + // Write path. + pub async fn put_batch( &self, - key: Key, - lsn: Lsn, - buf: &[u8], + serialized_batch: SerializedBatch, ctx: &RequestContext, ) -> Result<()> { let mut inner = self.inner.write().await; self.assert_writable(); - self.put_value_locked(&mut inner, key, lsn, buf, ctx).await - } - async fn put_value_locked( - &self, - locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>, - key: Key, - lsn: Lsn, - buf: &[u8], - ctx: &RequestContext, - ) -> Result<()> { - trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn); - - let off = { - locked_inner + let base_off = { + inner .file - .write_blob( - buf, + .write_raw( + &serialized_batch.raw, &RequestContextBuilder::extend(ctx) .page_content_kind(PageContentKind::InMemoryLayer) .build(), @@ -541,15 +477,23 @@ impl InMemoryLayer { .await? }; - let vec_map = locked_inner.index.entry(key).or_default(); - let old = vec_map.append_or_update_last(lsn, off).unwrap().0; - if old.is_some() { - // We already had an entry for this LSN. That's odd.. - warn!("Key {} at {} already exists", key, lsn); + for SerializedBatchOffset { + key, + lsn, + offset: relative_off, + } in serialized_batch.offsets + { + let off = base_off + relative_off; + let vec_map = inner.index.entry(key).or_default(); + let old = vec_map.append_or_update_last(lsn, off).unwrap().0; + if old.is_some() { + // We already had an entry for this LSN. That's odd.. + warn!("Key {} at {} already exists", key, lsn); + } } - let size = locked_inner.file.len(); - locked_inner.resource_units.maybe_publish_size(size); + let size = inner.file.len(); + inner.resource_units.maybe_publish_size(size); Ok(()) } @@ -572,8 +516,6 @@ impl InMemoryLayer { /// Records the end_lsn for non-dropped layers. 
/// `end_lsn` is exclusive pub async fn freeze(&self, end_lsn: Lsn) { - let inner = self.inner.write().await; - assert!( self.start_lsn < end_lsn, "{} >= {}", @@ -591,9 +533,13 @@ impl InMemoryLayer { }) .expect("frozen_local_path_str set only once"); - for vec_map in inner.index.values() { - for (lsn, _pos) in vec_map.as_slice() { - assert!(*lsn < end_lsn); + #[cfg(debug_assertions)] + { + let inner = self.inner.write().await; + for vec_map in inner.index.values() { + for (lsn, _pos) in vec_map.as_slice() { + assert!(*lsn < end_lsn); + } } } } @@ -603,12 +549,12 @@ impl InMemoryLayer { /// if there are no matching keys. /// /// Returns a new delta layer with all the same data as this in-memory layer - pub(crate) async fn write_to_disk( + pub async fn write_to_disk( &self, - timeline: &Arc, ctx: &RequestContext, key_range: Option>, - ) -> Result> { + l0_flush_global_state: &l0_flush::Inner, + ) -> Result> { // Grab the lock in read-mode. We hold it over the I/O, but because this // layer is not writeable anymore, no one should be trying to acquire the // write lock on it, so we shouldn't block anyone. There's one exception @@ -620,20 +566,25 @@ impl InMemoryLayer { // rare though, so we just accept the potential latency hit for now. let inner = self.inner.read().await; + use l0_flush::Inner; + let _concurrency_permit = match l0_flush_global_state { + Inner::Direct { semaphore, .. 
} => Some(semaphore.acquire().await), + }; + let end_lsn = *self.end_lsn.get().unwrap(); - let keys: Vec<_> = if let Some(key_range) = key_range { + let key_count = if let Some(key_range) = key_range { + let key_range = key_range.start.to_compact()..key_range.end.to_compact(); + inner .index .iter() .filter(|(k, _)| key_range.contains(k)) - .map(|(k, m)| (k.to_i128(), m)) - .collect() + .count() } else { - inner.index.iter().map(|(k, m)| (k.to_i128(), m)).collect() + inner.index.len() }; - - if keys.is_empty() { + if key_count == 0 { return Ok(None); } @@ -647,28 +598,67 @@ impl InMemoryLayer { ) .await?; - let mut buf = Vec::new(); + match l0_flush_global_state { + l0_flush::Inner::Direct { .. } => { + let file_contents: Vec = inner.file.load_to_vec(ctx).await?; + assert_eq!( + file_contents.len() % PAGE_SZ, + 0, + "needed by BlockReaderRef::Slice" + ); + assert_eq!(file_contents.len(), { + let written = usize::try_from(inner.file.len()).unwrap(); + if written % PAGE_SZ == 0 { + written + } else { + written.checked_add(PAGE_SZ - (written % PAGE_SZ)).unwrap() + } + }); - let cursor = inner.file.block_cursor(); + let cursor = BlockCursor::new(BlockReaderRef::Slice(&file_contents)); - let ctx = RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::InMemoryLayer) - .build(); - for (key, vec_map) in inner.index.iter() { - // Write all page versions - for (lsn, pos) in vec_map.as_slice() { - cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?; - let will_init = Value::des(&buf)?.will_init(); - let res; - (buf, res) = delta_layer_writer - .put_value_bytes(*key, *lsn, buf, will_init, &ctx) - .await; - res?; + let mut buf = Vec::new(); + + for (key, vec_map) in inner.index.iter() { + // Write all page versions + for (lsn, pos) in vec_map.as_slice() { + // TODO: once we have blob lengths in the in-memory index, we can + // 1. get rid of the blob_io / BlockReaderRef::Slice business and + // 2. load the file contents into a Bytes and + // 3. 
then use `Bytes::slice` to get the `buf` that is our blob + // 4. pass that `buf` into `put_value_bytes` + // => https://github.com/neondatabase/neon/issues/8183 + cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?; + let will_init = Value::des(&buf)?.will_init(); + let (tmp, res) = delta_layer_writer + .put_value_bytes( + Key::from_compact(*key), + *lsn, + buf.slice_len(), + will_init, + ctx, + ) + .await; + res?; + buf = tmp.into_raw_slice().into_inner(); + } + } } } // MAX is used here because we identify L0 layers by full key range - let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, &ctx).await?; - Ok(Some(delta_layer)) + let (desc, path) = delta_layer_writer.finish(Key::MAX, ctx).await?; + + // Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()`. + // + // If we didn't and our caller drops this future, tokio-epoll-uring would extend the lifetime of + // the `file_contents: Vec` until the IO is done, but not the permit's lifetime. + // Thus, we'd have more concurrent `Vec` in existence than the semaphore allows. + // + // We hold across the fsync so that on ext4 mounted with data=ordered, all the kernel page cache pages + // we dirtied when writing to the filesystem have been flushed and marked !dirty.
+ drop(_concurrency_permit); + + Ok(Some((desc, path))) } } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 32acb3f0cd..774f97e1d9 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1,9 +1,7 @@ use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::keyspace::KeySpace; -use pageserver_api::models::{ - HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus, -}; +use pageserver_api::models::HistoricLayerInfo; use pageserver_api::shard::{ShardIdentity, ShardIndex, TenantShardId}; use std::ops::Range; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; @@ -19,14 +17,14 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::repository::Key; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::task_mgr::TaskKind; -use crate::tenant::timeline::GetVectoredError; +use crate::tenant::timeline::{CompactionError, GetVectoredError}; use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline}; use super::delta_layer::{self, DeltaEntry}; use super::image_layer::{self}; use super::{ AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName, - PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, ValuesReconstructState, + LayerVisibilityHint, PersistentLayerDesc, ValuesReconstructState, }; use utils::generation::Generation; @@ -93,16 +91,12 @@ pub(crate) struct Layer(Arc); impl std::fmt::Display for Layer { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - if matches!(self.0.generation, Generation::Broken) { - write!(f, "{}-broken", self.layer_desc().short_id()) - } else { - write!( - f, - "{}{}", - self.layer_desc().short_id(), - self.0.generation.get_suffix() - ) - } + write!( + f, + "{}{}", + self.layer_desc().short_id(), + self.0.generation.get_suffix() + ) } } @@ -164,13 +158,10 @@ 
impl Layer { metadata.file_size, ); - let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted); - let owner = Layer(Arc::new(LayerInner::new( conf, timeline, local_path, - access_stats, desc, None, metadata.generation, @@ -197,8 +188,6 @@ impl Layer { metadata.file_size, ); - let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident); - let mut resident = None; let owner = Layer(Arc::new_cyclic(|owner| { @@ -213,7 +202,6 @@ impl Layer { conf, timeline, local_path, - access_stats, desc, Some(inner), metadata.generation, @@ -249,11 +237,6 @@ impl Layer { version: 0, }); resident = Some(inner.clone()); - let access_stats = LayerAccessStats::empty_will_record_residence_event_later(); - access_stats.record_residence_event( - LayerResidenceStatus::Resident, - LayerResidenceEventReason::LayerCreate, - ); let local_path = local_layer_path( conf, @@ -267,7 +250,6 @@ impl Layer { conf, timeline, local_path, - access_stats, desc, Some(inner), timeline.generation, @@ -318,44 +300,6 @@ impl Layer { self.0.delete_on_drop(); } - /// Return data needed to reconstruct given page at LSN. - /// - /// It is up to the caller to collect more data from the previous layer and - /// perform WAL redo, if necessary. - /// - /// # Cancellation-Safety - /// - /// This method is cancellation-safe. 
- pub(crate) async fn get_value_reconstruct_data( - &self, - key: Key, - lsn_range: Range, - reconstruct_data: &mut ValueReconstructState, - ctx: &RequestContext, - ) -> anyhow::Result { - use anyhow::ensure; - - let layer = self.0.get_or_maybe_download(true, Some(ctx)).await?; - self.0 - .access_stats - .record_access(LayerAccessKind::GetValueReconstructData, ctx); - - if self.layer_desc().is_delta { - ensure!(lsn_range.start >= self.layer_desc().lsn_range.start); - ensure!(self.layer_desc().key_range.contains(&key)); - } else { - ensure!(self.layer_desc().key_range.contains(&key)); - ensure!(lsn_range.start >= self.layer_desc().image_layer_lsn()); - ensure!(lsn_range.end >= self.layer_desc().image_layer_lsn()); - } - - layer - .get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx) - .instrument(tracing::debug_span!("get_value_reconstruct_data", layer=%self)) - .await - .with_context(|| format!("get_value_reconstruct_data for layer {self}")) - } - pub(crate) async fn get_values_reconstruct_data( &self, keyspace: KeySpace, @@ -368,13 +312,13 @@ impl Layer { .get_or_maybe_download(true, Some(ctx)) .await .map_err(|err| match err { - DownloadError::DownloadCancelled => GetVectoredError::Cancelled, + DownloadError::TimelineShutdown | DownloadError::DownloadCancelled => { + GetVectoredError::Cancelled + } other => GetVectoredError::Other(anyhow::anyhow!(other)), })?; - self.0 - .access_stats - .record_access(LayerAccessKind::GetValueReconstructData, ctx); + self.record_access(ctx); layer .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx) @@ -389,7 +333,7 @@ impl Layer { } /// Get all key/values in the layer. Should be replaced with an iterator-based API in the future. - #[cfg(test)] + #[allow(dead_code)] pub(crate) async fn load_key_values( &self, ctx: &RequestContext, @@ -441,7 +385,7 @@ impl Layer { } /// Downloads if necessary and creates a guard, which will keep this layer from being evicted. 
- pub(crate) async fn download_and_keep_resident(&self) -> anyhow::Result { + pub(crate) async fn download_and_keep_resident(&self) -> Result { let downloaded = self.0.get_or_maybe_download(true, None).await?; Ok(ResidentLayer { @@ -454,18 +398,18 @@ impl Layer { self.0.info(reset) } - pub(crate) fn access_stats(&self) -> &LayerAccessStats { - &self.0.access_stats + pub(crate) fn latest_activity(&self) -> SystemTime { + self.0.access_stats.latest_activity() + } + + pub(crate) fn visibility(&self) -> LayerVisibilityHint { + self.0.access_stats.visibility() } pub(crate) fn local_path(&self) -> &Utf8Path { &self.0.path } - pub(crate) fn debug_str(&self) -> &Arc { - &self.0.debug_str - } - pub(crate) fn metadata(&self) -> LayerFileMetadata { self.0.metadata() } @@ -508,13 +452,57 @@ impl Layer { } } } + + fn record_access(&self, ctx: &RequestContext) { + if self.0.access_stats.record_access(ctx) { + // Visibility was modified to Visible + tracing::info!( + "Layer {} became visible as a result of access", + self.0.desc.key() + ); + if let Some(tl) = self.0.timeline.upgrade() { + tl.metrics + .visible_physical_size_gauge + .add(self.0.desc.file_size) + } + } + } + + pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) { + let old_visibility = self.0.access_stats.set_visibility(visibility.clone()); + use LayerVisibilityHint::*; + match (old_visibility, visibility) { + (Visible, Covered) => { + // Subtract this layer's contribution to the visible size metric + if let Some(tl) = self.0.timeline.upgrade() { + debug_assert!( + tl.metrics.visible_physical_size_gauge.get() >= self.0.desc.file_size + ); + tl.metrics + .visible_physical_size_gauge + .sub(self.0.desc.file_size) + } + } + (Covered, Visible) => { + // Add this layer's contribution to the visible size metric + if let Some(tl) = self.0.timeline.upgrade() { + tl.metrics + .visible_physical_size_gauge + .add(self.0.desc.file_size) + } + } + (Covered, Covered) | (Visible, Visible) => { + // no change + } 
+ } + } } /// The download-ness ([`DownloadedLayer`]) can be either resident or wanted evicted. /// /// However when we want something evicted, we cannot evict it right away as there might be current /// reads happening on it. For example: it has been searched from [`LayerMap::search`] but not yet -/// read with [`Layer::get_value_reconstruct_data`]. +/// read with [`Layer::get_values_reconstruct_data`]. /// /// [`LayerMap::search`]: crate::tenant::layer_map::LayerMap::search #[derive(Debug)] @@ -595,9 +583,6 @@ struct LayerInner { /// Full path to the file; unclear if this should exist anymore. path: Utf8PathBuf, - /// String representation of the layer, used for traversal id. - debug_str: Arc, - desc: PersistentLayerDesc, /// Timeline access is needed for remote timeline client and metrics. @@ -698,6 +683,28 @@ impl Drop for LayerInner { // and we could be delaying shutdown for nothing. } + if let Some(timeline) = self.timeline.upgrade() { + // Only need to decrement metrics if the timeline still exists: otherwise + // it will have already de-registered these metrics via TimelineMetrics::shutdown + if self.desc.is_delta() { + timeline.metrics.layer_count_delta.dec(); + timeline.metrics.layer_size_delta.sub(self.desc.file_size); + } else { + timeline.metrics.layer_count_image.dec(); + timeline.metrics.layer_size_image.sub(self.desc.file_size); + } + + if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) { + debug_assert!( + timeline.metrics.visible_physical_size_gauge.get() >= self.desc.file_size + ); + timeline + .metrics + .visible_physical_size_gauge + .sub(self.desc.file_size); + } + } + if !*self.wanted_deleted.get_mut() { return; } @@ -778,7 +785,6 @@ impl LayerInner { conf: &'static PageServerConf, timeline: &Arc, local_path: Utf8PathBuf, - access_stats: LayerAccessStats, desc: PersistentLayerDesc, downloaded: Option>, generation: Generation, @@ -796,15 +802,27 @@ impl LayerInner { (heavier_once_cell::OnceCell::default(), 0, 
Status::Evicted) }; + // This object acts as a RAII guard on these metrics: increment on construction + if desc.is_delta() { + timeline.metrics.layer_count_delta.inc(); + timeline.metrics.layer_size_delta.add(desc.file_size); + } else { + timeline.metrics.layer_count_image.inc(); + timeline.metrics.layer_size_image.add(desc.file_size); + } + + // New layers are visible by default. This metric is later updated on drop or in set_visibility + timeline + .metrics + .visible_physical_size_gauge + .add(desc.file_size); + LayerInner { conf, - debug_str: { - format!("timelines/{}/{}", timeline.timeline_id, desc.layer_name()).into() - }, path: local_path, desc, timeline: Arc::downgrade(timeline), - access_stats, + access_stats: Default::default(), wanted_deleted: AtomicBool::new(false), inner, version: AtomicUsize::new(version), @@ -1101,19 +1119,10 @@ impl LayerInner { match rx.await { Ok(Ok(res)) => Ok(res), - Ok(Err(e)) => { - // sleep already happened in the spawned task, if it was not cancelled - match e.downcast_ref::() { - // If the download failed due to its cancellation token, - // propagate the cancellation error upstream. - Some(remote_storage::DownloadError::Cancelled) => { - Err(DownloadError::DownloadCancelled) - } - // FIXME: this is not embedding the error because historically it would had - // been output to compute, however that is no longer the case. 
- _ => Err(DownloadError::DownloadFailed), - } + Ok(Err(remote_storage::DownloadError::Cancelled)) => { + Err(DownloadError::DownloadCancelled) } + Ok(Err(_)) => Err(DownloadError::DownloadFailed), Err(_gone) => Err(DownloadError::DownloadCancelled), } } @@ -1123,7 +1132,7 @@ impl LayerInner { timeline: Arc, permit: heavier_once_cell::InitPermit, ctx: &RequestContext, - ) -> anyhow::Result> { + ) -> Result, remote_storage::DownloadError> { let result = timeline .remote_client .download_layer_file( @@ -1168,10 +1177,7 @@ impl LayerInner { LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction); } - self.access_stats.record_residence_event( - LayerResidenceStatus::Resident, - LayerResidenceEventReason::ResidenceChange, - ); + self.access_stats.record_residence_event(); Ok(self.initialize_after_layer_is_on_disk(permit)) } @@ -1290,7 +1296,7 @@ impl LayerInner { lsn_end: lsn_range.end, remote: !resident, access_stats, - l0: crate::tenant::layer_map::LayerMap::is_l0(self.layer_desc()), + l0: crate::tenant::layer_map::LayerMap::is_l0(&self.layer_desc().key_range), } } else { let lsn = self.desc.image_layer_lsn(); @@ -1483,14 +1489,22 @@ impl LayerInner { let duration = SystemTime::now().duration_since(local_layer_mtime); match duration { Ok(elapsed) => { - timeline - .metrics - .evictions_with_low_residence_duration - .read() - .unwrap() - .observe(elapsed); + let accessed = self.access_stats.accessed(); + if accessed { + // Only layers used for reads contribute to our "low residence" metric that is used + // to detect thrashing. Layers promoted for other reasons (e.g. compaction) are allowed + // to be rapidly evicted without contributing to this metric. 
+ timeline + .metrics + .evictions_with_low_residence_duration + .read() + .unwrap() + .observe(elapsed); + } + tracing::info!( residence_millis = elapsed.as_millis(), + accessed, "evicted layer after known residence period" ); } @@ -1517,10 +1531,7 @@ impl LayerInner { } } - self.access_stats.record_residence_event( - LayerResidenceStatus::Evicted, - LayerResidenceEventReason::ResidenceChange, - ); + self.access_stats.record_residence_event(); self.status.as_ref().unwrap().send_replace(Status::Evicted); @@ -1603,6 +1614,12 @@ pub(crate) enum DownloadError { Failpoint(failpoints::FailpointKind), } +impl DownloadError { + pub(crate) fn is_cancelled(&self) -> bool { + matches!(self, DownloadError::DownloadCancelled) + } +} + #[derive(Debug, PartialEq)] pub(crate) enum NeedsDownload { NotFound, @@ -1653,8 +1670,9 @@ impl Drop for DownloadedLayer { } impl DownloadedLayer { - /// Initializes the `DeltaLayerInner` or `ImageLayerInner` within [`LayerKind`], or fails to - /// initialize it permanently. + /// Initializes the `DeltaLayerInner` or `ImageLayerInner` within [`LayerKind`]. + /// Failure to load the layer is sticky, i.e., future `get()` calls will return + /// the initial load failure immediately. /// /// `owner` parameter is a strong reference at the same `LayerInner` as the /// `DownloadedLayer::owner` would be when upgraded. 
Given how this method ends up called, @@ -1685,7 +1703,7 @@ impl DownloadedLayer { ctx, ) .await - .map(|res| res.map(LayerKind::Delta)) + .map(LayerKind::Delta) } else { let lsn = owner.desc.image_layer_lsn(); let summary = Some(image_layer::Summary::expected( @@ -1702,54 +1720,29 @@ impl DownloadedLayer { ctx, ) .await - .map(|res| res.map(LayerKind::Image)) + .map(LayerKind::Image) }; match res { - Ok(Ok(layer)) => Ok(Ok(layer)), - Ok(Err(transient)) => Err(transient), - Err(permanent) => { + Ok(layer) => Ok(layer), + Err(err) => { LAYER_IMPL_METRICS.inc_permanent_loading_failures(); - // TODO(#5815): we are not logging all errors, so temporarily log them **once** - // here as well - let permanent = permanent.context("load layer"); - tracing::error!("layer loading failed permanently: {permanent:#}"); - Ok(Err(permanent)) + // We log this message once over the lifetime of `Self` + // => Ok and good to log backtrace and path here. + tracing::error!( + "layer load failed, assuming permanent failure: {}: {err:?}", + owner.path + ); + Err(err) } } }; self.kind - .get_or_try_init(init) - // return transient errors using `?` - .await? + .get_or_init(init) + .await .as_ref() - .map_err(|e| { - // errors are not clonabled, cannot but stringify - // test_broken_timeline matches this string - anyhow::anyhow!("layer loading failed: {e:#}") - }) - } - - async fn get_value_reconstruct_data( - &self, - key: Key, - lsn_range: Range, - reconstruct_data: &mut ValueReconstructState, - owner: &Arc, - ctx: &RequestContext, - ) -> anyhow::Result { - use LayerKind::*; - - match self.get(owner, ctx).await? { - Delta(d) => { - d.get_value_reconstruct_data(key, lsn_range, reconstruct_data, ctx) - .await - } - Image(i) => { - i.get_value_reconstruct_data(key, reconstruct_data, ctx) - .await - } - } + // We already logged the full backtrace above, once. Don't repeat that here. 
+ .map_err(|e| anyhow::anyhow!("layer load failed earlier: {e}")) } async fn get_values_reconstruct_data( @@ -1762,7 +1755,11 @@ impl DownloadedLayer { ) -> Result<(), GetVectoredError> { use LayerKind::*; - match self.get(owner, ctx).await.map_err(GetVectoredError::from)? { + match self + .get(owner, ctx) + .await + .map_err(GetVectoredError::Other)? + { Delta(d) => { d.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, ctx) .await @@ -1774,7 +1771,6 @@ impl DownloadedLayer { } } - #[cfg(test)] async fn load_key_values( &self, owner: &Arc, @@ -1847,9 +1843,7 @@ impl ResidentLayer { // this is valid because the DownloadedLayer::kind is a OnceCell, not a // Mutex, so we cannot go and deinitialize the value with OnceCell::take // while it's being held. - owner - .access_stats - .record_access(LayerAccessKind::KeyIter, ctx); + self.owner.record_access(ctx); delta_layer::DeltaLayerInner::load_keys(d, ctx) .await @@ -1862,17 +1856,29 @@ impl ResidentLayer { /// Read all they keys in this layer which match the ShardIdentity, and write them all to /// the provided writer. Return the number of keys written. #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))] - pub(crate) async fn filter<'a>( - &'a self, + pub(crate) async fn filter( + &self, shard_identity: &ShardIdentity, writer: &mut ImageLayerWriter, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { use LayerKind::*; - match self.downloaded.get(&self.owner.0, ctx).await? { - Delta(_) => anyhow::bail!(format!("cannot filter() on a delta layer {self}")), - Image(i) => i.filter(shard_identity, writer, ctx).await, + match self + .downloaded + .get(&self.owner.0, ctx) + .await + .map_err(CompactionError::Other)? 
+ { + Delta(_) => { + return Err(CompactionError::Other(anyhow::anyhow!(format!( + "cannot filter() on a delta layer {self}" + )))); + } + Image(i) => i + .filter(shard_identity, writer, ctx) + .await + .map_err(CompactionError::Other), } } @@ -1904,8 +1910,8 @@ impl ResidentLayer { self.owner.metadata() } - #[cfg(test)] - pub(crate) async fn as_delta( + /// Cast the layer to a delta, return an error if it is an image layer. + pub(crate) async fn get_as_delta( &self, ctx: &RequestContext, ) -> anyhow::Result<&delta_layer::DeltaLayerInner> { @@ -1915,6 +1921,18 @@ impl ResidentLayer { Image(_) => Err(anyhow::anyhow!("image layer")), } } + + /// Cast the layer to an image, return an error if it is a delta layer. + pub(crate) async fn get_as_image( + &self, + ctx: &RequestContext, + ) -> anyhow::Result<&image_layer::ImageLayerInner> { + use LayerKind::*; + match self.downloaded.get(&self.owner.0, ctx).await? { + Image(ref d) => Ok(d), + Delta(_) => Err(anyhow::anyhow!("delta layer")), + } + } } impl AsLayerDesc for ResidentLayer { diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 3a7aca7a6c..bffd2db800 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -1,3 +1,5 @@ +use std::time::UNIX_EPOCH; + use pageserver_api::key::CONTROLFILE_KEY; use tokio::task::JoinSet; use utils::{ @@ -7,7 +9,7 @@ use utils::{ use super::failpoints::{Failpoint, FailpointKind}; use super::*; -use crate::context::DownloadBehavior; +use crate::{context::DownloadBehavior, tenant::storage_layer::LayerVisibilityHint}; use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness}; /// Used in tests to advance a future to wanted await point, and not futher. 
@@ -22,7 +24,7 @@ const FOREVER: std::time::Duration = std::time::Duration::from_secs(ADVANCE.as_s async fn smoke_test() { let handle = tokio::runtime::Handle::current(); - let h = TenantHarness::create("smoke_test").unwrap(); + let h = TenantHarness::create("smoke_test").await.unwrap(); let span = h.span(); let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); let (tenant, _) = h.load().await; @@ -37,7 +39,7 @@ async fn smoke_test() { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.likely_resident_layers().collect::>() + layers.likely_resident_layers().cloned().collect::>() }; assert_eq!(layers.len(), 1); @@ -48,13 +50,26 @@ async fn smoke_test() { // all layers created at pageserver are like `layer`, initialized with strong // Arc. + let controlfile_keyspace = KeySpace { + ranges: vec![CONTROLFILE_KEY..CONTROLFILE_KEY.next()], + }; + let img_before = { - let mut data = ValueReconstructState::default(); + let mut data = ValuesReconstructState::default(); layer - .get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx) + .get_values_reconstruct_data( + controlfile_keyspace.clone(), + Lsn(0x10)..Lsn(0x11), + &mut data, + &ctx, + ) .await .unwrap(); - data.img + data.keys + .remove(&CONTROLFILE_KEY) + .expect("must be present") + .expect("should not error") + .img .take() .expect("tenant harness writes the control file") }; @@ -72,13 +87,24 @@ async fn smoke_test() { // on accesses when the layer is evicted, it will automatically be downloaded. 
let img_after = { - let mut data = ValueReconstructState::default(); + let mut data = ValuesReconstructState::default(); layer - .get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx) + .get_values_reconstruct_data( + controlfile_keyspace.clone(), + Lsn(0x10)..Lsn(0x11), + &mut data, + &ctx, + ) .instrument(download_span.clone()) .await .unwrap(); - data.img.take().unwrap() + data.keys + .remove(&CONTROLFILE_KEY) + .expect("must be present") + .expect("should not error") + .img + .take() + .expect("tenant harness writes the control file") }; assert_eq!(img_before, img_after); @@ -150,7 +176,7 @@ async fn smoke_test() { { let layers = &[layer]; let mut g = timeline.layers.write().await; - g.finish_gc_timeline(layers); + g.open_mut().unwrap().finish_gc_timeline(layers); // this just updates the remote_physical_size for demonstration purposes rtc.schedule_gc_update(layers).unwrap(); } @@ -176,7 +202,9 @@ async fn evict_and_wait_on_wanted_deleted() { // this is the runtime on which Layer spawns the blocking tasks on let handle = tokio::runtime::Handle::current(); - let h = TenantHarness::create("evict_and_wait_on_wanted_deleted").unwrap(); + let h = TenantHarness::create("evict_and_wait_on_wanted_deleted") + .await + .unwrap(); utils::logging::replace_panic_hook_with_tracing_panic_hook().forget(); let (tenant, ctx) = h.load().await; @@ -188,7 +216,7 @@ async fn evict_and_wait_on_wanted_deleted() { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.likely_resident_layers().collect::>() + layers.likely_resident_layers().cloned().collect::>() }; assert_eq!(layers.len(), 1); @@ -232,7 +260,7 @@ async fn evict_and_wait_on_wanted_deleted() { // the deletion of the layer in remote_storage happens. 
{ let mut layers = timeline.layers.write().await; - layers.finish_gc_timeline(&[layer]); + layers.open_mut().unwrap().finish_gc_timeline(&[layer]); } SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await; @@ -258,7 +286,9 @@ fn read_wins_pending_eviction() { rt.block_on(async move { // this is the runtime on which Layer spawns the blocking tasks on let handle = tokio::runtime::Handle::current(); - let h = TenantHarness::create("read_wins_pending_eviction").unwrap(); + let h = TenantHarness::create("read_wins_pending_eviction") + .await + .unwrap(); let (tenant, ctx) = h.load().await; let span = h.span(); let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); @@ -271,7 +301,7 @@ fn read_wins_pending_eviction() { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.likely_resident_layers().collect::>() + layers.likely_resident_layers().cloned().collect::>() }; assert_eq!(layers.len(), 1); @@ -390,7 +420,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) { rt.block_on(async move { // this is the runtime on which Layer spawns the blocking tasks on let handle = tokio::runtime::Handle::current(); - let h = TenantHarness::create(name).unwrap(); + let h = TenantHarness::create(name).await.unwrap(); let (tenant, ctx) = h.load().await; let span = h.span(); let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); @@ -403,7 +433,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.likely_resident_layers().collect::>() + layers.likely_resident_layers().cloned().collect::>() }; assert_eq!(layers.len(), 1); @@ -559,8 +589,9 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) { #[tokio::test(start_paused = true)] async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() 
{ let handle = tokio::runtime::Handle::current(); - let h = - TenantHarness::create("cancelled_get_or_maybe_download_does_not_cancel_eviction").unwrap(); + let h = TenantHarness::create("cancelled_get_or_maybe_download_does_not_cancel_eviction") + .await + .unwrap(); let (tenant, ctx) = h.load().await; let timeline = tenant @@ -571,7 +602,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.likely_resident_layers().collect::>() + layers.likely_resident_layers().cloned().collect::>() }; assert_eq!(layers.len(), 1); @@ -636,7 +667,9 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() { #[tokio::test(start_paused = true)] async fn evict_and_wait_does_not_wait_for_download() { // let handle = tokio::runtime::Handle::current(); - let h = TenantHarness::create("evict_and_wait_does_not_wait_for_download").unwrap(); + let h = TenantHarness::create("evict_and_wait_does_not_wait_for_download") + .await + .unwrap(); let (tenant, ctx) = h.load().await; let span = h.span(); let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); @@ -649,7 +682,7 @@ async fn evict_and_wait_does_not_wait_for_download() { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.likely_resident_layers().collect::>() + layers.likely_resident_layers().cloned().collect::>() }; assert_eq!(layers.len(), 1); @@ -733,7 +766,9 @@ async fn eviction_cancellation_on_drop() { // this is the runtime on which Layer spawns the blocking tasks on let handle = tokio::runtime::Handle::current(); - let h = TenantHarness::create("eviction_cancellation_on_drop").unwrap(); + let h = TenantHarness::create("eviction_cancellation_on_drop") + .await + .unwrap(); utils::logging::replace_panic_hook_with_tracing_panic_hook().forget(); let (tenant, ctx) = h.load().await; @@ -766,9 +801,9 @@ async fn eviction_cancellation_on_drop() { let 
(evicted_layer, not_evicted) = { let mut layers = { let mut guard = timeline.layers.write().await; - let layers = guard.likely_resident_layers().collect::>(); + let layers = guard.likely_resident_layers().cloned().collect::>(); // remove the layers from layermap - guard.finish_gc_timeline(&layers); + guard.open_mut().unwrap().finish_gc_timeline(&layers); layers }; @@ -817,9 +852,9 @@ async fn eviction_cancellation_on_drop() { #[test] #[cfg(target_arch = "x86_64")] fn layer_size() { - assert_eq!(std::mem::size_of::(), 2040); - assert_eq!(std::mem::size_of::(), 104); - assert_eq!(std::mem::size_of::(), 2344); + assert_eq!(size_of::(), 8); + assert_eq!(size_of::(), 104); + assert_eq!(size_of::(), 296); // it also has the utf8 path } @@ -959,3 +994,46 @@ fn spawn_blocking_pool_helper_actually_works() { println!("joined"); }); } + +/// Drop the low bits from a time, to emulate the precision loss in LayerAccessStats +fn lowres_time(hires: SystemTime) -> SystemTime { + let ts = hires.duration_since(UNIX_EPOCH).unwrap().as_secs(); + UNIX_EPOCH + Duration::from_secs(ts) +} + +#[test] +fn access_stats() { + let access_stats = LayerAccessStats::default(); + // Default is visible + assert_eq!(access_stats.visibility(), LayerVisibilityHint::Visible); + + access_stats.set_visibility(LayerVisibilityHint::Covered); + assert_eq!(access_stats.visibility(), LayerVisibilityHint::Covered); + access_stats.set_visibility(LayerVisibilityHint::Visible); + assert_eq!(access_stats.visibility(), LayerVisibilityHint::Visible); + + let rtime = UNIX_EPOCH + Duration::from_secs(2000000000); + access_stats.record_residence_event_at(rtime); + assert_eq!(access_stats.latest_activity(), lowres_time(rtime)); + + let atime = UNIX_EPOCH + Duration::from_secs(2100000000); + access_stats.record_access_at(atime); + assert_eq!(access_stats.latest_activity(), lowres_time(atime)); + + // Setting visibility doesn't clobber access time + access_stats.set_visibility(LayerVisibilityHint::Covered); + 
assert_eq!(access_stats.latest_activity(), lowres_time(atime)); + access_stats.set_visibility(LayerVisibilityHint::Visible); + assert_eq!(access_stats.latest_activity(), lowres_time(atime)); +} + +#[test] +fn access_stats_2038() { + // The access stats structure uses a timestamp representation that will run out + // of bits in 2038. One year before that, this unit test will start failing. + + let one_year_from_now = SystemTime::now().duration_since(UNIX_EPOCH).unwrap() + + Duration::from_secs(3600 * 24 * 365); + + assert!(one_year_from_now.as_secs() < (2 << 31)); +} diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs index a89b66e4a1..cbd18e650f 100644 --- a/pageserver/src/tenant/storage_layer/layer_desc.rs +++ b/pageserver/src/tenant/storage_layer/layer_desc.rs @@ -25,7 +25,7 @@ pub struct PersistentLayerDesc { /// /// - For an open in-memory layer, the end bound is MAX_LSN /// - For a frozen in-memory layer or a delta layer, the end bound is a valid lsn after the - /// range start + /// range start /// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1 pub lsn_range: Range, /// Whether this is a delta layer, and also, is this incremental. 
@@ -41,6 +41,20 @@ pub struct PersistentLayerKey { pub is_delta: bool, } +impl std::fmt::Display for PersistentLayerKey { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}..{} {}..{} is_delta={}", + self.key_range.start, + self.key_range.end, + self.lsn_range.start, + self.lsn_range.end, + self.is_delta + ) + } +} + impl PersistentLayerDesc { pub fn key(&self) -> PersistentLayerKey { PersistentLayerKey { diff --git a/pageserver/src/tenant/storage_layer/layer_name.rs b/pageserver/src/tenant/storage_layer/layer_name.rs index da26e1eeb7..f33ca076ab 100644 --- a/pageserver/src/tenant/storage_layer/layer_name.rs +++ b/pageserver/src/tenant/storage_layer/layer_name.rs @@ -248,6 +248,14 @@ impl LayerName { Image(_) => "image", } } + + /// Gets the key range encoded in the layer name. + pub fn key_range(&self) -> &Range { + match &self { + LayerName::Image(layer) => &layer.key_range, + LayerName::Delta(layer) => &layer.key_range, + } + } } impl fmt::Display for LayerName { diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs new file mode 100644 index 0000000000..b4bd976033 --- /dev/null +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -0,0 +1,563 @@ +use std::{ + cmp::Ordering, + collections::{binary_heap, BinaryHeap}, +}; + +use pageserver_api::key::Key; +use utils::lsn::Lsn; + +use crate::{context::RequestContext, repository::Value}; + +use super::{ + delta_layer::{DeltaLayerInner, DeltaLayerIterator}, + image_layer::{ImageLayerInner, ImageLayerIterator}, +}; + +#[derive(Clone, Copy)] +enum LayerRef<'a> { + Image(&'a ImageLayerInner), + Delta(&'a DeltaLayerInner), +} + +impl<'a> LayerRef<'a> { + fn iter(self, ctx: &'a RequestContext) -> LayerIterRef<'a> { + match self { + Self::Image(x) => LayerIterRef::Image(x.iter(ctx)), + Self::Delta(x) => LayerIterRef::Delta(x.iter(ctx)), + } + } +} + +enum LayerIterRef<'a> { + Image(ImageLayerIterator<'a>), 
+ Delta(DeltaLayerIterator<'a>), +} + +impl LayerIterRef<'_> { + async fn next(&mut self) -> anyhow::Result> { + match self { + Self::Delta(x) => x.next().await, + Self::Image(x) => x.next().await, + } + } +} + +/// This type plays several roles at once +/// 1. Unified iterator for image and delta layers. +/// 2. `Ord` for use in [`MergeIterator::heap`] (for the k-merge). +/// 3. Lazy creation of the real delta/image iterator. +enum IteratorWrapper<'a> { + NotLoaded { + ctx: &'a RequestContext, + first_key_lower_bound: (Key, Lsn), + layer: LayerRef<'a>, + }, + Loaded { + iter: PeekableLayerIterRef<'a>, + }, +} + +struct PeekableLayerIterRef<'a> { + iter: LayerIterRef<'a>, + peeked: Option<(Key, Lsn, Value)>, // None == end +} + +impl<'a> PeekableLayerIterRef<'a> { + async fn create(mut iter: LayerIterRef<'a>) -> anyhow::Result { + let peeked = iter.next().await?; + Ok(Self { iter, peeked }) + } + + fn peek(&self) -> &Option<(Key, Lsn, Value)> { + &self.peeked + } + + async fn next(&mut self) -> anyhow::Result> { + let result = self.peeked.take(); + self.peeked = self.iter.next().await?; + Ok(result) + } +} + +impl<'a> std::cmp::PartialEq for IteratorWrapper<'a> { + fn eq(&self, other: &Self) -> bool { + self.cmp(other) == Ordering::Equal + } +} + +impl<'a> std::cmp::Eq for IteratorWrapper<'a> {} + +impl<'a> std::cmp::PartialOrd for IteratorWrapper<'a> { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl<'a> std::cmp::Ord for IteratorWrapper<'a> { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + use std::cmp::Ordering; + let a = self.peek_next_key_lsn_value(); + let b = other.peek_next_key_lsn_value(); + match (a, b) { + (Some((k1, l1, v1)), Some((k2, l2, v2))) => { + fn map_value_to_num(val: &Option<&Value>) -> usize { + match val { + None => 0, + Some(Value::Image(_)) => 1, + Some(Value::WalRecord(_)) => 2, + } + } + let order_1 = map_value_to_num(&v1); + let order_2 = map_value_to_num(&v2); + // When key_lsn are the 
same, the unloaded iter will always appear before the loaded one. + // And note that we do a reverse at the end of the comparison, so it works with the max heap. + (k1, l1, order_1).cmp(&(k2, l2, order_2)) + } + (Some(_), None) => Ordering::Less, + (None, Some(_)) => Ordering::Greater, + (None, None) => Ordering::Equal, + } + .reverse() + } +} + +impl<'a> IteratorWrapper<'a> { + pub fn create_from_image_layer( + image_layer: &'a ImageLayerInner, + ctx: &'a RequestContext, + ) -> Self { + Self::NotLoaded { + layer: LayerRef::Image(image_layer), + first_key_lower_bound: (image_layer.key_range().start, image_layer.lsn()), + ctx, + } + } + + pub fn create_from_delta_layer( + delta_layer: &'a DeltaLayerInner, + ctx: &'a RequestContext, + ) -> Self { + Self::NotLoaded { + layer: LayerRef::Delta(delta_layer), + first_key_lower_bound: (delta_layer.key_range().start, delta_layer.lsn_range().start), + ctx, + } + } + + fn peek_next_key_lsn_value(&self) -> Option<(&Key, Lsn, Option<&Value>)> { + match self { + Self::Loaded { iter } => iter + .peek() + .as_ref() + .map(|(key, lsn, val)| (key, *lsn, Some(val))), + Self::NotLoaded { + first_key_lower_bound: (key, lsn), + .. + } => Some((key, *lsn, None)), + } + } + + // CORRECTNESS: this function must always take `&mut self`, never `&self`. + // + // The reason is that `impl Ord for Self` evaluates differently after this function + // returns. We're called through a `PeekMut::deref_mut`, which causes heap repair when + // the PeekMut gets returned. 
So, it's critical that we actually run through `PeekMut::deref_mut` + // and not just `PeekMut::deref` + // If we don't take `&mut self` + async fn load(&mut self) -> anyhow::Result<()> { + assert!(!self.is_loaded()); + let Self::NotLoaded { + ctx, + first_key_lower_bound, + layer, + } = self + else { + unreachable!() + }; + let iter = layer.iter(ctx); + let iter = PeekableLayerIterRef::create(iter).await?; + if let Some((k1, l1, _)) = iter.peek() { + let (k2, l2) = first_key_lower_bound; + debug_assert!((k1, l1) >= (k2, l2)); + } + *self = Self::Loaded { iter }; + Ok(()) + } + + fn is_loaded(&self) -> bool { + matches!(self, Self::Loaded { .. }) + } + + /// Correctness: must load the iterator before using. + /// + /// Given this iterator wrapper is private to the merge iterator, users won't be able to mis-use it. + /// The public interfaces to use are [`crate::tenant::storage_layer::delta_layer::DeltaLayerIterator`] and + /// [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. + async fn next(&mut self) -> anyhow::Result> { + let Self::Loaded { iter } = self else { + panic!("must load the iterator before using") + }; + iter.next().await + } +} + +/// A merge iterator over delta/image layer iterators. When duplicated records are +/// found, the iterator will not perform any deduplication, and the caller should handle +/// these situation. By saying duplicated records, there are many possibilities: +/// +/// * Two same delta at the same LSN. +/// * Two same image at the same LSN. +/// * Delta/image at the same LSN where the image has already applied the delta. +/// +/// The iterator will always put the image before the delta. 
+pub struct MergeIterator<'a> { + heap: BinaryHeap>, +} + +impl<'a> MergeIterator<'a> { + pub fn create( + deltas: &[&'a DeltaLayerInner], + images: &[&'a ImageLayerInner], + ctx: &'a RequestContext, + ) -> Self { + let mut heap = Vec::with_capacity(images.len() + deltas.len()); + for image in images { + heap.push(IteratorWrapper::create_from_image_layer(image, ctx)); + } + for delta in deltas { + heap.push(IteratorWrapper::create_from_delta_layer(delta, ctx)); + } + Self { + heap: BinaryHeap::from(heap), + } + } + + pub async fn next(&mut self) -> anyhow::Result> { + while let Some(mut iter) = self.heap.peek_mut() { + if !iter.is_loaded() { + // Once we load the iterator, we can know the real first key-value pair in the iterator. + // We put it back into the heap so that a potentially unloaded layer may have a key between + // [potential_first_key, loaded_first_key). + iter.load().await?; + continue; + } + let Some(item) = iter.next().await? else { + // If the iterator returns None, we pop this iterator. Actually, in the current implementation, + // we order None > Some, and all the rest of the iterators should return None. 
+ binary_heap::PeekMut::pop(iter); + continue; + }; + return Ok(Some(item)); + } + Ok(None) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use itertools::Itertools; + use pageserver_api::key::Key; + use utils::lsn::Lsn; + + use crate::{ + tenant::{ + harness::{TenantHarness, TIMELINE_ID}, + storage_layer::delta_layer::test::{produce_delta_layer, sort_delta, sort_delta_value}, + }, + walrecord::NeonWalRecord, + DEFAULT_PG_VERSION, + }; + + async fn assert_merge_iter_equal( + merge_iter: &mut MergeIterator<'_>, + expect: &[(Key, Lsn, Value)], + ) { + let mut expect_iter = expect.iter(); + loop { + let o1 = merge_iter.next().await.unwrap(); + let o2 = expect_iter.next(); + assert_eq!(o1.is_some(), o2.is_some()); + if o1.is_none() && o2.is_none() { + break; + } + let (k1, l1, v1) = o1.unwrap(); + let (k2, l2, v2) = o2.unwrap(); + assert_eq!(&k1, k2); + assert_eq!(l1, *l2); + assert_eq!(&v1, v2); + } + } + + #[tokio::test] + async fn merge_in_between() { + use crate::repository::Value; + use bytes::Bytes; + + let harness = TenantHarness::create("merge_iterator_merge_in_between") + .await + .unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + let test_deltas1 = vec![ + ( + get_key(0), + Lsn(0x10), + Value::Image(Bytes::copy_from_slice(b"test")), + ), + ( + get_key(5), + Lsn(0x10), + Value::Image(Bytes::copy_from_slice(b"test")), + ), + ]; + let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx) + .await + .unwrap(); + let test_deltas2 = vec![ + ( + get_key(3), + Lsn(0x10), + Value::Image(Bytes::copy_from_slice(b"test")), + ), + ( + get_key(4), + Lsn(0x10), + Value::Image(Bytes::copy_from_slice(b"test")), + ), + ]; + let resident_layer_2 = produce_delta_layer(&tenant, 
&tline, test_deltas2.clone(), &ctx) + .await + .unwrap(); + let mut merge_iter = MergeIterator::create( + &[ + resident_layer_2.get_as_delta(&ctx).await.unwrap(), + resident_layer_1.get_as_delta(&ctx).await.unwrap(), + ], + &[], + &ctx, + ); + let mut expect = Vec::new(); + expect.extend(test_deltas1); + expect.extend(test_deltas2); + expect.sort_by(sort_delta); + assert_merge_iter_equal(&mut merge_iter, &expect).await; + } + + #[tokio::test] + async fn delta_merge() { + use crate::repository::Value; + use bytes::Bytes; + + let harness = TenantHarness::create("merge_iterator_delta_merge") + .await + .unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + const N: usize = 1000; + let test_deltas1 = (0..N) + .map(|idx| { + ( + get_key(idx as u32 / 10), + Lsn(0x20 * ((idx as u64) % 10 + 1)), + Value::Image(Bytes::from(format!("img{idx:05}"))), + ) + }) + .collect_vec(); + let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx) + .await + .unwrap(); + let test_deltas2 = (0..N) + .map(|idx| { + ( + get_key(idx as u32 / 10), + Lsn(0x20 * ((idx as u64) % 10 + 1) + 0x10), + Value::Image(Bytes::from(format!("img{idx:05}"))), + ) + }) + .collect_vec(); + let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx) + .await + .unwrap(); + let test_deltas3 = (0..N) + .map(|idx| { + ( + get_key(idx as u32 / 10 + N as u32), + Lsn(0x10 * ((idx as u64) % 10 + 1)), + Value::Image(Bytes::from(format!("img{idx:05}"))), + ) + }) + .collect_vec(); + let resident_layer_3 = produce_delta_layer(&tenant, &tline, test_deltas3.clone(), &ctx) + .await + .unwrap(); + let mut merge_iter = MergeIterator::create( + &[ + resident_layer_1.get_as_delta(&ctx).await.unwrap(), + 
resident_layer_2.get_as_delta(&ctx).await.unwrap(), + resident_layer_3.get_as_delta(&ctx).await.unwrap(), + ], + &[], + &ctx, + ); + let mut expect = Vec::new(); + expect.extend(test_deltas1); + expect.extend(test_deltas2); + expect.extend(test_deltas3); + expect.sort_by(sort_delta); + assert_merge_iter_equal(&mut merge_iter, &expect).await; + + // TODO: test layers are loaded only when needed, reducing num of active iterators in k-merge + } + + #[tokio::test] + async fn delta_image_mixed_merge() { + use crate::repository::Value; + use bytes::Bytes; + + let harness = TenantHarness::create("merge_iterator_delta_image_mixed_merge") + .await + .unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + // In this test case, we want to test if the iterator still works correctly with multiple copies + // of a delta+image at the same LSN, for example, the following sequence a@10=+a, a@10=+a, a@10=ab, a@10=ab. + // Duplicated deltas/images are possible for old tenants before the full L0 compaction file name fix. + // An incomplete compaction could produce multiple exactly-the-same delta layers. Force image generation + // could produce overlapping images. Apart from duplicated deltas/images, in the current storage implementation + // one key-lsn could have a delta in the delta layer and one image in the image layer. The iterator should + // correctly process these situations and return everything as-is, and the upper layer of the system + // will handle duplicated LSNs. 
+ let test_deltas1 = vec![ + ( + get_key(0), + Lsn(0x10), + Value::WalRecord(NeonWalRecord::wal_init()), + ), + ( + get_key(0), + Lsn(0x18), + Value::WalRecord(NeonWalRecord::wal_append("a")), + ), + ( + get_key(5), + Lsn(0x10), + Value::WalRecord(NeonWalRecord::wal_init()), + ), + ( + get_key(5), + Lsn(0x18), + Value::WalRecord(NeonWalRecord::wal_append("b")), + ), + ]; + let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx) + .await + .unwrap(); + let mut test_deltas2 = test_deltas1.clone(); + test_deltas2.push(( + get_key(10), + Lsn(0x20), + Value::Image(Bytes::copy_from_slice(b"test")), + )); + let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx) + .await + .unwrap(); + let test_deltas3 = vec![ + ( + get_key(0), + Lsn(0x10), + Value::Image(Bytes::copy_from_slice(b"")), + ), + ( + get_key(5), + Lsn(0x18), + Value::Image(Bytes::copy_from_slice(b"b")), + ), + ( + get_key(15), + Lsn(0x20), + Value::Image(Bytes::copy_from_slice(b"test")), + ), + ]; + let resident_layer_3 = produce_delta_layer(&tenant, &tline, test_deltas3.clone(), &ctx) + .await + .unwrap(); + let mut test_deltas4 = test_deltas3.clone(); + test_deltas4.push(( + get_key(20), + Lsn(0x20), + Value::Image(Bytes::copy_from_slice(b"test")), + )); + let resident_layer_4 = produce_delta_layer(&tenant, &tline, test_deltas4.clone(), &ctx) + .await + .unwrap(); + let mut expect = Vec::new(); + expect.extend(test_deltas1); + expect.extend(test_deltas2); + expect.extend(test_deltas3); + expect.extend(test_deltas4); + expect.sort_by(sort_delta_value); + + // Test with different layer order for MergeIterator::create to ensure the order + // is stable. 
+ + let mut merge_iter = MergeIterator::create( + &[ + resident_layer_4.get_as_delta(&ctx).await.unwrap(), + resident_layer_1.get_as_delta(&ctx).await.unwrap(), + resident_layer_3.get_as_delta(&ctx).await.unwrap(), + resident_layer_2.get_as_delta(&ctx).await.unwrap(), + ], + &[], + &ctx, + ); + assert_merge_iter_equal(&mut merge_iter, &expect).await; + + let mut merge_iter = MergeIterator::create( + &[ + resident_layer_1.get_as_delta(&ctx).await.unwrap(), + resident_layer_4.get_as_delta(&ctx).await.unwrap(), + resident_layer_3.get_as_delta(&ctx).await.unwrap(), + resident_layer_2.get_as_delta(&ctx).await.unwrap(), + ], + &[], + &ctx, + ); + assert_merge_iter_equal(&mut merge_iter, &expect).await; + + is_send(merge_iter); + } + + fn is_send(_: impl Send) {} +} diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs new file mode 100644 index 0000000000..e12e29cd45 --- /dev/null +++ b/pageserver/src/tenant/storage_layer/split_writer.rs @@ -0,0 +1,459 @@ +use std::{ops::Range, sync::Arc}; + +use bytes::Bytes; +use pageserver_api::key::{Key, KEY_SIZE}; +use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId}; + +use crate::tenant::storage_layer::Layer; +use crate::{config::PageServerConf, context::RequestContext, repository::Value, tenant::Timeline}; + +use super::{DeltaLayerWriter, ImageLayerWriter, ResidentLayer}; + +/// An image writer that takes images and produces multiple image layers. 
The interface does not +/// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files +/// to be cleaned up) +#[must_use] +pub struct SplitImageLayerWriter { + inner: ImageLayerWriter, + target_layer_size: u64, + generated_layers: Vec, + conf: &'static PageServerConf, + timeline_id: TimelineId, + tenant_shard_id: TenantShardId, + lsn: Lsn, +} + +impl SplitImageLayerWriter { + pub async fn new( + conf: &'static PageServerConf, + timeline_id: TimelineId, + tenant_shard_id: TenantShardId, + start_key: Key, + lsn: Lsn, + target_layer_size: u64, + ctx: &RequestContext, + ) -> anyhow::Result { + Ok(Self { + target_layer_size, + inner: ImageLayerWriter::new( + conf, + timeline_id, + tenant_shard_id, + &(start_key..Key::MAX), + lsn, + ctx, + ) + .await?, + generated_layers: Vec::new(), + conf, + timeline_id, + tenant_shard_id, + lsn, + }) + } + + pub async fn put_image( + &mut self, + key: Key, + img: Bytes, + tline: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + // The current estimation is an upper bound of the space that the key/image could take + // because we did not consider compression in this estimation. The resulting image layer + // could be smaller than the target size. 
+ let addition_size_estimation = KEY_SIZE as u64 + img.len() as u64; + if self.inner.num_keys() >= 1 + && self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size + { + let next_image_writer = ImageLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + &(key..Key::MAX), + self.lsn, + ctx, + ) + .await?; + let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer); + self.generated_layers.push( + prev_image_writer + .finish_with_end_key(tline, key, ctx) + .await?, + ); + } + self.inner.put_image(key, img, ctx).await + } + + pub(crate) async fn finish( + self, + tline: &Arc, + ctx: &RequestContext, + end_key: Key, + ) -> anyhow::Result> { + let Self { + mut generated_layers, + inner, + .. + } = self; + generated_layers.push(inner.finish_with_end_key(tline, end_key, ctx).await?); + Ok(generated_layers) + } + + /// When split writer fails, the caller should call this function and handle partially generated layers. + #[allow(dead_code)] + pub(crate) async fn take(self) -> anyhow::Result<(Vec, ImageLayerWriter)> { + Ok((self.generated_layers, self.inner)) + } +} + +/// A delta writer that takes key-lsn-values and produces multiple delta layers. The interface does not +/// guarantee atomicity (i.e., if the delta layer generation fails, there might be leftover files +/// to be cleaned up). 
+#[must_use] +pub struct SplitDeltaLayerWriter { + inner: DeltaLayerWriter, + target_layer_size: u64, + generated_layers: Vec, + conf: &'static PageServerConf, + timeline_id: TimelineId, + tenant_shard_id: TenantShardId, + lsn_range: Range, +} + +impl SplitDeltaLayerWriter { + pub async fn new( + conf: &'static PageServerConf, + timeline_id: TimelineId, + tenant_shard_id: TenantShardId, + start_key: Key, + lsn_range: Range, + target_layer_size: u64, + ctx: &RequestContext, + ) -> anyhow::Result { + Ok(Self { + target_layer_size, + inner: DeltaLayerWriter::new( + conf, + timeline_id, + tenant_shard_id, + start_key, + lsn_range.clone(), + ctx, + ) + .await?, + generated_layers: Vec::new(), + conf, + timeline_id, + tenant_shard_id, + lsn_range, + }) + } + + pub async fn put_value( + &mut self, + key: Key, + lsn: Lsn, + val: Value, + tline: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + // The current estimation is key size plus LSN size plus value size estimation. This is not an accurate + // number, and therefore the final layer size could be a little bit larger or smaller than the target. + let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */; + if self.inner.num_keys() >= 1 + && self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size + { + let next_delta_writer = DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + key, + self.lsn_range.clone(), + ctx, + ) + .await?; + let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer); + let (desc, path) = prev_delta_writer.finish(key, ctx).await?; + let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?; + self.generated_layers.push(delta_layer); + } + self.inner.put_value(key, lsn, val, ctx).await + } + + pub(crate) async fn finish( + self, + tline: &Arc, + ctx: &RequestContext, + end_key: Key, + ) -> anyhow::Result> { + let Self { + mut generated_layers, + inner, + .. 
+ } = self; + + let (desc, path) = inner.finish(end_key, ctx).await?; + let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?; + generated_layers.push(delta_layer); + Ok(generated_layers) + } + + /// When split writer fails, the caller should call this function and handle partially generated layers. + #[allow(dead_code)] + pub(crate) async fn take(self) -> anyhow::Result<(Vec, DeltaLayerWriter)> { + Ok((self.generated_layers, self.inner)) + } +} + +#[cfg(test)] +mod tests { + use rand::{RngCore, SeedableRng}; + + use crate::{ + tenant::{ + harness::{TenantHarness, TIMELINE_ID}, + storage_layer::AsLayerDesc, + }, + DEFAULT_PG_VERSION, + }; + + use super::*; + + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + fn get_img(id: u32) -> Bytes { + format!("{id:064}").into() + } + + fn get_large_img() -> Bytes { + let mut rng = rand::rngs::SmallRng::seed_from_u64(42); + let mut data = vec![0; 8192]; + rng.fill_bytes(&mut data); + data.into() + } + + #[tokio::test] + async fn write_one_image() { + let harness = TenantHarness::create("split_writer_write_one_image") + .await + .unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + let mut image_writer = SplitImageLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + get_key(0), + Lsn(0x18), + 4 * 1024 * 1024, + &ctx, + ) + .await + .unwrap(); + + let mut delta_writer = SplitDeltaLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + get_key(0), + Lsn(0x18)..Lsn(0x20), + 4 * 1024 * 1024, + &ctx, + ) + .await + .unwrap(); + + image_writer + .put_image(get_key(0), get_img(0), &tline, &ctx) + .await + .unwrap(); + let layers = image_writer + .finish(&tline, &ctx, get_key(10)) + .await + .unwrap(); + assert_eq!(layers.len(), 1); + + delta_writer 
+ .put_value( + get_key(0), + Lsn(0x18), + Value::Image(get_img(0)), + &tline, + &ctx, + ) + .await + .unwrap(); + let layers = delta_writer + .finish(&tline, &ctx, get_key(10)) + .await + .unwrap(); + assert_eq!(layers.len(), 1); + } + + #[tokio::test] + async fn write_split() { + let harness = TenantHarness::create("split_writer_write_split") + .await + .unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + let mut image_writer = SplitImageLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + get_key(0), + Lsn(0x18), + 4 * 1024 * 1024, + &ctx, + ) + .await + .unwrap(); + let mut delta_writer = SplitDeltaLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + get_key(0), + Lsn(0x18)..Lsn(0x20), + 4 * 1024 * 1024, + &ctx, + ) + .await + .unwrap(); + const N: usize = 2000; + for i in 0..N { + let i = i as u32; + image_writer + .put_image(get_key(i), get_large_img(), &tline, &ctx) + .await + .unwrap(); + delta_writer + .put_value( + get_key(i), + Lsn(0x20), + Value::Image(get_large_img()), + &tline, + &ctx, + ) + .await + .unwrap(); + } + let image_layers = image_writer + .finish(&tline, &ctx, get_key(N as u32)) + .await + .unwrap(); + let delta_layers = delta_writer + .finish(&tline, &ctx, get_key(N as u32)) + .await + .unwrap(); + assert_eq!(image_layers.len(), N / 512 + 1); + assert_eq!(delta_layers.len(), N / 512 + 1); + for idx in 0..image_layers.len() { + assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN); + assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX); + assert_ne!(delta_layers[idx].layer_desc().key_range.start, Key::MIN); + assert_ne!(delta_layers[idx].layer_desc().key_range.end, Key::MAX); + if idx > 0 { + assert_eq!( + image_layers[idx - 1].layer_desc().key_range.end, + image_layers[idx].layer_desc().key_range.start + ); + assert_eq!( + 
delta_layers[idx - 1].layer_desc().key_range.end, + delta_layers[idx].layer_desc().key_range.start + ); + } + } + } + + #[tokio::test] + async fn write_large_img() { + let harness = TenantHarness::create("split_writer_write_large_img") + .await + .unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + let mut image_writer = SplitImageLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + get_key(0), + Lsn(0x18), + 4 * 1024, + &ctx, + ) + .await + .unwrap(); + + let mut delta_writer = SplitDeltaLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + get_key(0), + Lsn(0x18)..Lsn(0x20), + 4 * 1024, + &ctx, + ) + .await + .unwrap(); + + image_writer + .put_image(get_key(0), get_img(0), &tline, &ctx) + .await + .unwrap(); + image_writer + .put_image(get_key(1), get_large_img(), &tline, &ctx) + .await + .unwrap(); + let layers = image_writer + .finish(&tline, &ctx, get_key(10)) + .await + .unwrap(); + assert_eq!(layers.len(), 2); + + delta_writer + .put_value( + get_key(0), + Lsn(0x18), + Value::Image(get_img(0)), + &tline, + &ctx, + ) + .await + .unwrap(); + delta_writer + .put_value( + get_key(1), + Lsn(0x1A), + Value::Image(get_large_img()), + &tline, + &ctx, + ) + .await + .unwrap(); + let layers = delta_writer + .finish(&tline, &ctx, get_key(10)) + .await + .unwrap(); + assert_eq!(layers.len(), 2); + } +} diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index d679b78f32..12f080f3c1 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -61,21 +61,12 @@ impl BackgroundLoopKind { } } -static PERMIT_GAUGES: once_cell::sync::Lazy< - enum_map::EnumMap, -> = once_cell::sync::Lazy::new(|| { - enum_map::EnumMap::from_array(std::array::from_fn(|i| { - let kind = ::from_usize(i); - 
crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE.with_label_values(&[kind.into()]) - })) -}); - /// Cancellation safe. pub(crate) async fn concurrent_background_tasks_rate_limit_permit( loop_kind: BackgroundLoopKind, _ctx: &RequestContext, ) -> tokio::sync::SemaphorePermit<'static> { - let _guard = PERMIT_GAUGES[loop_kind].guard(); + let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE.measure_acquisition(loop_kind); pausable_failpoint!( "initial-size-calculation-permit-pause", @@ -98,10 +89,9 @@ pub fn start_background_loops( task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::Compaction, - Some(tenant_shard_id), + tenant_shard_id, None, &format!("compactor for tenant {tenant_shard_id}"), - false, { let tenant = Arc::clone(tenant); let background_jobs_can_start = background_jobs_can_start.cloned(); @@ -122,10 +112,9 @@ pub fn start_background_loops( task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::GarbageCollector, - Some(tenant_shard_id), + tenant_shard_id, None, &format!("garbage collector for tenant {tenant_shard_id}"), - false, { let tenant = Arc::clone(tenant); let background_jobs_can_start = background_jobs_can_start.cloned(); @@ -146,10 +135,9 @@ pub fn start_background_loops( task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::IngestHousekeeping, - Some(tenant_shard_id), + tenant_shard_id, None, &format!("ingest housekeeping for tenant {tenant_shard_id}"), - false, { let tenant = Arc::clone(tenant); let background_jobs_can_start = background_jobs_can_start.cloned(); @@ -213,24 +201,28 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { Duration::from_secs(10) } else { // Run compaction - if let Err(e) = tenant.compaction_iteration(&cancel, &ctx).await { - let wait_duration = backoff::exponential_backoff_duration_seconds( - error_run_count + 1, - 1.0, - MAX_BACKOFF_SECS, - ); - error_run_count += 1; - let wait_duration = Duration::from_secs_f64(wait_duration); - log_compaction_error( - &e, - error_run_count, - 
&wait_duration, - cancel.is_cancelled(), - ); - wait_duration - } else { - error_run_count = 0; - period + match tenant.compaction_iteration(&cancel, &ctx).await { + Ok(has_pending_task) => { + error_run_count = 0; + // schedule the next compaction immediately in case there is a pending compaction task + if has_pending_task { Duration::ZERO } else { period } + } + Err(e) => { + let wait_duration = backoff::exponential_backoff_duration_seconds( + error_run_count + 1, + 1.0, + MAX_BACKOFF_SECS, + ); + error_run_count += 1; + let wait_duration = Duration::from_secs_f64(wait_duration); + log_compaction_error( + &e, + error_run_count, + &wait_duration, + cancel.is_cancelled(), + ); + wait_duration + } } }; @@ -264,7 +256,8 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { count_throttled, sum_throttled_usecs, allowed_rps=%format_args!("{allowed_rps:.0}"), - "shard was throttled in the last n_seconds") + "shard was throttled in the last n_seconds" + ); }); // Sleep @@ -364,14 +357,13 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { if first { first = false; - if delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel) - .await - .is_err() - { - break; - } + let delays = async { + delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel).await?; + random_init_delay(period, &cancel).await?; + Ok::<_, Cancelled>(()) + }; - if random_init_delay(period, &cancel).await.is_err() { + if delays.await.is_err() { break; } } @@ -406,9 +398,16 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { error_run_count += 1; let wait_duration = Duration::from_secs_f64(wait_duration); - error!( - "Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}", - ); + if matches!(e, crate::tenant::GcError::TimelineCancelled) { + // Timeline was cancelled during gc. 
We might either be in an event + // that affects the entire tenant (tenant deletion, pageserver shutdown), + // or in one that affects the timeline only (timeline deletion). + // Therefore, don't exit the loop. + info!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}"); + } else { + error!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}"); + } + wait_duration } } @@ -416,7 +415,6 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc); - // Sleep if tokio::time::timeout(sleep_duration, cancel.cancelled()) .await .is_ok() diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index a4f1108635..e90f65942f 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1,8 +1,9 @@ pub(crate) mod analysis; -mod compaction; +pub(crate) mod compaction; pub mod delete; pub(crate) mod detach_ancestor; mod eviction_task; +pub(crate) mod handle; mod init; pub mod layer_manager; pub(crate) mod logical_size; @@ -14,12 +15,14 @@ use anyhow::{anyhow, bail, ensure, Context, Result}; use arc_swap::ArcSwap; use bytes::Bytes; use camino::Utf8Path; +use chrono::{DateTime, Utc}; use enumset::EnumSet; use fail::fail_point; +use handle::ShardTimelineId; use once_cell::sync::Lazy; use pageserver_api::{ key::{ - AUX_FILES_KEY, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, + CompactKey, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE, }, keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}, @@ -41,10 +44,8 @@ use tokio::{ use tokio_util::sync::CancellationToken; use tracing::*; use utils::{ - bin_ser::BeSer, fs_ext, pausable_failpoint, sync::gate::{Gate, GateGuard}, - vec_map::VecMap, }; use std::pin::pin; @@ -56,23 +57,21 @@ use std::{ collections::{BTreeMap, HashMap, HashSet}, sync::atomic::AtomicU64, }; -use 
std::{ - cmp::{max, min, Ordering}, - ops::ControlFlow, -}; +use std::{cmp::min, ops::ControlFlow}; use std::{ collections::btree_map::Entry, ops::{Deref, Range}, }; -use crate::metrics::GetKind; -use crate::pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS; use crate::{ aux_file::AuxFileSizeEstimator, tenant::{ + config::defaults::DEFAULT_PITR_INTERVAL, layer_map::{LayerMap, SearchResult}, metadata::TimelineMetadata, + storage_layer::PersistentLayerDesc, }, + walredo, }; use crate::{ context::{DownloadBehavior, RequestContext}, @@ -83,17 +82,22 @@ use crate::{ disk_usage_eviction_task::finite_f32, tenant::storage_layer::{ AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer, - LayerAccessStatsReset, LayerName, ResidentLayer, ValueReconstructResult, - ValueReconstructState, ValuesReconstructState, + LayerAccessStatsReset, LayerName, ResidentLayer, ValueReconstructState, + ValuesReconstructState, }, }; use crate::{ disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry, }; +use crate::{ + l0_flush::{self, L0FlushGlobalState}, + metrics::GetKind, +}; use crate::{ metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize, }; use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; +use crate::{pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::storage_layer::PersistentLayerKey}; use crate::{ pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}, virtual_file::{MaybeFatalIo, VirtualFile}, @@ -101,9 +105,7 @@ use crate::{ use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; -use crate::metrics::{ - TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT, -}; +use crate::metrics::TimelineMetrics; use crate::pgdatadir_mapping::CalculateLogicalSizeError; use crate::tenant::config::TenantConfOpt; use pageserver_api::reltag::RelTag; @@ -120,7 +122,6 @@ use utils::{ simple_rcu::{Rcu, RcuReadGuard}, }; 
-use crate::page_cache; use crate::repository::GcResult; use crate::repository::{Key, Value}; use crate::task_mgr; @@ -134,10 +135,16 @@ use self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; -use super::{config::TenantConf, storage_layer::VectoredValueReconstructState}; +use super::{ + config::TenantConf, storage_layer::inmemory_layer, storage_layer::LayerVisibilityHint, + upload_queue::NotInitialized, +}; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; -use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer}; +use super::{ + remote_timeline_client::RemoteTimelineClient, remote_timeline_client::WaitCompletionError, + storage_layer::ReadableLayer, +}; use super::{ secondary::heatmap::{HeatMapLayer, HeatMapTimeline}, GcError, @@ -174,28 +181,9 @@ impl std::fmt::Display for ImageLayerCreationMode { } } -/// Wrapper for key range to provide reverse ordering by range length for BinaryHeap -#[derive(Debug, Clone, PartialEq, Eq)] -pub(crate) struct Hole { - key_range: Range, - coverage_size: usize, -} - -impl Ord for Hole { - fn cmp(&self, other: &Self) -> Ordering { - other.coverage_size.cmp(&self.coverage_size) // inverse order - } -} - -impl PartialOrd for Hole { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - /// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things. /// Can be removed after all refactors are done. 
-fn drop_rlock(rlock: tokio::sync::OwnedRwLockReadGuard) { +fn drop_rlock(rlock: tokio::sync::RwLockReadGuard) { drop(rlock) } @@ -211,6 +199,7 @@ pub struct TimelineResources { pub timeline_get_throttle: Arc< crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>, >, + pub l0_flush_global_state: l0_flush::L0FlushGlobalState, } pub(crate) struct AuxFilesState { @@ -267,7 +256,7 @@ pub struct Timeline { /// /// In the future, we'll be able to split up the tuple of LayerMap and `LayerFileManager`, /// so that e.g. on-demand-download/eviction, and layer spreading, can operate just on `LayerFileManager`. - pub(crate) layers: Arc>, + pub(crate) layers: tokio::sync::RwLock, last_freeze_at: AtomicLsn, // Atomic would be more appropriate here. @@ -363,6 +352,7 @@ pub struct Timeline { repartition_threshold: u64, last_image_layer_creation_check_at: AtomicLsn, + last_image_layer_creation_check_instant: std::sync::Mutex>, /// Current logical size of the "datadir", at the last LSN. current_logical_size: LogicalSize, @@ -436,6 +426,10 @@ pub struct Timeline { /// in the future, add `extra_test_sparse_keyspace` if necessary. #[cfg(test)] pub(crate) extra_test_dense_keyspace: ArcSwap, + + pub(crate) l0_flush_global_state: L0FlushGlobalState, + + pub(crate) handles: handle::PerTimelineState, } pub struct WalReceiverInfo { @@ -453,52 +447,59 @@ pub(crate) struct GcInfo { /// Currently, this includes all points where child branches have /// been forked off from. In the future, could also include /// explicit user-defined snapshot points. - pub(crate) retain_lsns: Vec, + pub(crate) retain_lsns: Vec<(Lsn, TimelineId)>, /// The cutoff coordinates, which are combined by selecting the minimum. pub(crate) cutoffs: GcCutoffs, /// Leases granted to particular LSNs. 
pub(crate) leases: BTreeMap, + + /// Whether our branch point is within our ancestor's PITR interval (for cost estimation) + pub(crate) within_ancestor_pitr: bool, } impl GcInfo { pub(crate) fn min_cutoff(&self) -> Lsn { self.cutoffs.select_min() } + + pub(super) fn insert_child(&mut self, child_id: TimelineId, child_lsn: Lsn) { + self.retain_lsns.push((child_lsn, child_id)); + self.retain_lsns.sort_by_key(|i| i.0); + } + + pub(super) fn remove_child(&mut self, child_id: TimelineId) { + self.retain_lsns.retain(|i| i.1 != child_id); + } } -/// The `GcInfo` component describing which Lsns need to be retained. -#[derive(Debug)] +/// The `GcInfo` component describing which Lsns need to be retained. Functionally, this +/// is a single number (the oldest LSN which we must retain), but it internally distinguishes +/// between time-based and space-based retention for observability and consumption metrics purposes. +#[derive(Debug, Clone)] pub(crate) struct GcCutoffs { - /// Keep everything newer than this point. - /// - /// This is calculated by subtracting 'gc_horizon' setting from - /// last-record LSN - /// - /// FIXME: is this inclusive or exclusive? - pub(crate) horizon: Lsn, + /// Calculated from the [`TenantConf::gc_horizon`], this LSN indicates how much + /// history we must keep to retain a specified number of bytes of WAL. + pub(crate) space: Lsn, - /// In addition to 'retain_lsns' and 'horizon_cutoff', keep everything newer than this - /// point. - /// - /// This is calculated by finding a number such that a record is needed for PITR - /// if only if its LSN is larger than 'pitr_cutoff'. - pub(crate) pitr: Lsn, + /// Calculated from [`TenantConf::pitr_interval`], this LSN indicates how much + /// history we must keep to enable reading back at least the PITR interval duration. 
+ pub(crate) time: Lsn, } impl Default for GcCutoffs { fn default() -> Self { Self { - horizon: Lsn::INVALID, - pitr: Lsn::INVALID, + space: Lsn::INVALID, + time: Lsn::INVALID, } } } impl GcCutoffs { fn select_min(&self) -> Lsn { - std::cmp::min(self.horizon, self.pitr) + std::cmp::min(self.space, self.time) } } @@ -511,7 +512,7 @@ pub(crate) struct TimelineVisitOutcome { #[derive(thiserror::Error, Debug)] pub(crate) enum PageReconstructError { #[error(transparent)] - Other(#[from] anyhow::Error), + Other(anyhow::Error), #[error("Ancestor LSN wait error: {0}")] AncestorLsnTimeout(WaitLsnError), @@ -527,6 +528,28 @@ pub(crate) enum PageReconstructError { MissingKey(MissingKeyError), } +impl From for PageReconstructError { + fn from(value: anyhow::Error) -> Self { + // with walingest.rs many PageReconstructError are wrapped in as anyhow::Error + match value.downcast::() { + Ok(pre) => pre, + Err(other) => PageReconstructError::Other(other), + } + } +} + +impl From for PageReconstructError { + fn from(value: utils::bin_ser::DeserializeError) -> Self { + PageReconstructError::Other(anyhow::Error::new(value).context("deserialization failure")) + } +} + +impl From for PageReconstructError { + fn from(_: layer_manager::Shutdown) -> Self { + PageReconstructError::Cancelled + } +} + impl GetVectoredError { #[cfg(test)] pub(crate) fn is_missing_key_error(&self) -> bool { @@ -534,17 +557,28 @@ impl GetVectoredError { } } -#[derive(Debug)] +impl From for GetVectoredError { + fn from(_: layer_manager::Shutdown) -> Self { + GetVectoredError::Cancelled + } +} + +#[derive(thiserror::Error)] pub struct MissingKeyError { key: Key, shard: ShardNumber, cont_lsn: Lsn, request_lsn: Lsn, ancestor_lsn: Option, - traversal_path: Vec, backtrace: Option, } +impl std::fmt::Debug for MissingKeyError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self) + } +} + impl std::fmt::Display for MissingKeyError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) 
-> std::fmt::Result { write!( @@ -556,18 +590,6 @@ impl std::fmt::Display for MissingKeyError { write!(f, ", ancestor {}", ancestor_lsn)?; } - if !self.traversal_path.is_empty() { - writeln!(f)?; - } - - for (r, c, l) in &self.traversal_path { - writeln!( - f, - "layer traversal: result {:?}, cont_lsn {}, layer: {}", - r, c, l, - )?; - } - if let Some(ref backtrace) = self.backtrace { write!(f, "\n{}", backtrace)?; } @@ -581,11 +603,8 @@ impl PageReconstructError { pub(crate) fn is_stopping(&self) -> bool { use PageReconstructError::*; match self { - Other(_) => false, - AncestorLsnTimeout(_) => false, Cancelled => true, - WalRedo(_) => false, - MissingKey { .. } => false, + Other(_) | AncestorLsnTimeout(_) | WalRedo(_) | MissingKey(_) => false, } } } @@ -595,16 +614,22 @@ pub(crate) enum CreateImageLayersError { #[error("timeline shutting down")] Cancelled, - #[error(transparent)] - GetVectoredError(GetVectoredError), + #[error("read failed")] + GetVectoredError(#[source] GetVectoredError), - #[error(transparent)] - PageReconstructError(PageReconstructError), + #[error("reconstruction failed")] + PageReconstructError(#[source] PageReconstructError), #[error(transparent)] Other(#[from] anyhow::Error), } +impl From for CreateImageLayersError { + fn from(_: layer_manager::Shutdown) -> Self { + CreateImageLayersError::Cancelled + } +} + #[derive(thiserror::Error, Debug, Clone)] pub(crate) enum FlushLayerError { /// Timeline cancellation token was cancelled @@ -617,10 +642,10 @@ pub(crate) enum FlushLayerError { // Arc<> the following non-clonable error types: we must be Clone-able because the flush error is propagated from the flush // loop via a watch channel, where we can only borrow it. 
- #[error(transparent)] + #[error("create image layers (shared)")] CreateImageLayersError(Arc), - #[error(transparent)] + #[error("other (shared)")] Other(#[from] Arc), } @@ -628,7 +653,13 @@ impl FlushLayerError { // When crossing from generic anyhow errors to this error type, we explicitly check // for timeline cancellation to avoid logging inoffensive shutdown errors as warn/err. fn from_anyhow(timeline: &Timeline, err: anyhow::Error) -> Self { - if timeline.cancel.is_cancelled() { + let cancelled = timeline.cancel.is_cancelled() + // The upload queue might have been shut down before the official cancellation of the timeline. + || err + .downcast_ref::() + .map(NotInitialized::is_stopping) + .unwrap_or_default(); + if cancelled { Self::Cancelled } else { Self::Other(Arc::new(err)) @@ -636,39 +667,57 @@ impl FlushLayerError { } } +impl From for FlushLayerError { + fn from(_: layer_manager::Shutdown) -> Self { + FlushLayerError::Cancelled + } +} + #[derive(thiserror::Error, Debug)] pub(crate) enum GetVectoredError { #[error("timeline shutting down")] Cancelled, - #[error("Requested too many keys: {0} > {}", Timeline::MAX_GET_VECTORED_KEYS)] + #[error("requested too many keys: {0} > {}", Timeline::MAX_GET_VECTORED_KEYS)] Oversized(u64), - #[error("Requested at invalid LSN: {0}")] + #[error("requested at invalid LSN: {0}")] InvalidLsn(Lsn), - #[error("Requested key not found: {0}")] + #[error("requested key not found: {0}")] MissingKey(MissingKeyError), - #[error(transparent)] - GetReadyAncestorError(GetReadyAncestorError), + #[error("ancestry walk")] + GetReadyAncestorError(#[source] GetReadyAncestorError), #[error(transparent)] Other(#[from] anyhow::Error), } +impl From for GetVectoredError { + fn from(value: GetReadyAncestorError) -> Self { + use GetReadyAncestorError::*; + match value { + Cancelled => GetVectoredError::Cancelled, + AncestorLsnTimeout(_) | BadState { .. 
} => { + GetVectoredError::GetReadyAncestorError(value) + } + } + } +} + #[derive(thiserror::Error, Debug)] pub(crate) enum GetReadyAncestorError { - #[error("Ancestor LSN wait error: {0}")] + #[error("ancestor LSN wait error")] AncestorLsnTimeout(#[from] WaitLsnError), - #[error("Bad state on timeline {timeline_id}: {state:?}")] + #[error("bad state on timeline {timeline_id}: {state:?}")] BadState { timeline_id: TimelineId, state: TimelineState, }, - #[error("Cancelled")] + #[error("cancelled")] Cancelled, } @@ -689,6 +738,8 @@ pub enum GetLogicalSizePriority { pub(crate) enum CompactFlags { ForceRepartition, ForceImageLayerCreation, + EnhancedGcBottomMostCompaction, + DryRun, } impl std::fmt::Debug for Timeline { @@ -719,6 +770,9 @@ impl From for CompactionError { fn from(e: CreateImageLayersError) -> Self { match e { CreateImageLayersError::Cancelled => CompactionError::ShuttingDown, + CreateImageLayersError::Other(e) => { + CompactionError::Other(e.context("create image layers")) + } _ => CompactionError::Other(e.into()), } } @@ -775,40 +829,6 @@ impl From for PageReconstructError { } } -#[derive( - Eq, - PartialEq, - Debug, - Copy, - Clone, - strum_macros::EnumString, - strum_macros::Display, - serde_with::DeserializeFromStr, - serde_with::SerializeDisplay, -)] -#[strum(serialize_all = "kebab-case")] -pub enum GetVectoredImpl { - Sequential, - Vectored, -} - -#[derive( - Eq, - PartialEq, - Debug, - Copy, - Clone, - strum_macros::EnumString, - strum_macros::Display, - serde_with::DeserializeFromStr, - serde_with::SerializeDisplay, -)] -#[strum(serialize_all = "kebab-case")] -pub enum GetImpl { - Legacy, - Vectored, -} - pub(crate) enum WaitLsnWaiter<'a> { Timeline(&'a Timeline), Tenant, @@ -847,6 +867,18 @@ impl Timeline { .map(|ancestor| ancestor.timeline_id) } + /// Get the bytes written since the PITR cutoff on this branch, and + /// whether this branch's ancestor_lsn is within its parent's PITR. 
+ pub(crate) fn get_pitr_history_stats(&self) -> (u64, bool) { + let gc_info = self.gc_info.read().unwrap(); + let history = self + .get_last_record_lsn() + .checked_sub(gc_info.cutoffs.time) + .unwrap_or(Lsn(0)) + .0; + (history, gc_info.within_ancestor_pitr) + } + /// Lock and get timeline's GC cutoff pub(crate) fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard { self.latest_gc_cutoff_lsn.read() @@ -887,145 +919,46 @@ impl Timeline { self.timeline_get_throttle.throttle(ctx, 1).await; - // Check the page cache. We will get back the most recent page with lsn <= `lsn`. - // The cached image can be returned directly if there is no WAL between the cached image - // and requested LSN. The cached image can also be used to reduce the amount of WAL needed - // for redo. - let cached_page_img = match self.lookup_cached_page(&key, lsn, ctx).await { - Some((cached_lsn, cached_img)) => { - match cached_lsn.cmp(&lsn) { - Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check - Ordering::Equal => { - MATERIALIZED_PAGE_CACHE_HIT_DIRECT.inc(); - return Ok(cached_img); // exact LSN match, return the image - } - Ordering::Greater => { - unreachable!("the returned lsn should never be after the requested lsn") - } - } - Some((cached_lsn, cached_img)) - } - None => None, + let keyspace = KeySpace { + ranges: vec![key..key.next()], }; - match self.conf.get_impl { - GetImpl::Legacy => { - let reconstruct_state = ValueReconstructState { - records: Vec::new(), - img: cached_page_img, - }; + // Initialise the reconstruct state for the key with the cache + // entry returned above. 
+ let mut reconstruct_state = ValuesReconstructState::new(); - self.get_impl(key, lsn, reconstruct_state, ctx).await - } - GetImpl::Vectored => { - let keyspace = KeySpace { - ranges: vec![key..key.next()], - }; + let vectored_res = self + .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx) + .await; - // Initialise the reconstruct state for the key with the cache - // entry returned above. - let mut reconstruct_state = ValuesReconstructState::new(); - - // Only add the cached image to the reconstruct state when it exists. - if cached_page_img.is_some() { - let mut key_state = VectoredValueReconstructState::default(); - key_state.img = cached_page_img; - reconstruct_state.keys.insert(key, Ok(key_state)); - } - - let vectored_res = self - .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx) - .await; - - if self.conf.validate_vectored_get { - self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx) - .await; - } - - let key_value = vectored_res?.pop_first(); - match key_value { - Some((got_key, value)) => { - if got_key != key { - error!( - "Expected {}, but singular vectored get returned {}", - key, got_key - ); - Err(PageReconstructError::Other(anyhow!( - "Singular vectored get returned wrong key" - ))) - } else { - value - } - } - None => Err(PageReconstructError::MissingKey(MissingKeyError { - key, - shard: self.shard_identity.get_shard_number(&key), - cont_lsn: Lsn(0), - request_lsn: lsn, - ancestor_lsn: None, - traversal_path: Vec::new(), - backtrace: None, - })), + let key_value = vectored_res?.pop_first(); + match key_value { + Some((got_key, value)) => { + if got_key != key { + error!( + "Expected {}, but singular vectored get returned {}", + key, got_key + ); + Err(PageReconstructError::Other(anyhow!( + "Singular vectored get returned wrong key" + ))) + } else { + value } } + None => Err(PageReconstructError::MissingKey(MissingKeyError { + key, + shard: self.shard_identity.get_shard_number(&key), + cont_lsn: 
Lsn(0), + request_lsn: lsn, + ancestor_lsn: None, + backtrace: None, + })), } } - /// Not subject to [`Self::timeline_get_throttle`]. - async fn get_impl( - &self, - key: Key, - lsn: Lsn, - mut reconstruct_state: ValueReconstructState, - ctx: &RequestContext, - ) -> Result { - // XXX: structured stats collection for layer eviction here. - trace!( - "get page request for {}@{} from task kind {:?}", - key, - lsn, - ctx.task_kind() - ); - - let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME - .for_get_kind(GetKind::Singular) - .start_timer(); - let path = self - .get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx) - .await?; - timer.stop_and_record(); - - let start = Instant::now(); - let res = self.reconstruct_value(key, lsn, reconstruct_state).await; - let elapsed = start.elapsed(); - crate::metrics::RECONSTRUCT_TIME - .for_get_kind(GetKind::Singular) - .observe(elapsed.as_secs_f64()); - - if cfg!(feature = "testing") && res.is_err() { - // it can only be walredo issue - use std::fmt::Write; - - let mut msg = String::new(); - - path.into_iter().for_each(|(res, cont_lsn, layer)| { - writeln!( - msg, - "- layer traversal: result {res:?}, cont_lsn {cont_lsn}, layer: {}", - layer, - ) - .expect("string grows") - }); - - // this is to rule out or provide evidence that we could in some cases read a duplicate - // walrecord - tracing::info!("walredo failed, path:\n{msg}"); - } - - res - } - pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32; + pub(crate) const VEC_GET_LAYERS_VISITED_WARN_THRESH: f64 = 512.0; /// Look up multiple page versions at a given LSN /// @@ -1055,11 +988,10 @@ impl Timeline { } trace!( - "get vectored request for {:?}@{} from task kind {:?} will use {} implementation", + "get vectored request for {:?}@{} from task kind {:?}", keyspace, lsn, ctx.task_kind(), - self.conf.get_vectored_impl ); let start = crate::metrics::GET_VECTORED_LATENCY @@ -1073,28 +1005,14 @@ impl Timeline { .throttle(ctx, key_count as usize) .await; - let res = match 
self.conf.get_vectored_impl { - GetVectoredImpl::Sequential => { - self.get_vectored_sequential_impl(keyspace, lsn, ctx).await - } - GetVectoredImpl::Vectored => { - let vectored_res = self - .get_vectored_impl( - keyspace.clone(), - lsn, - &mut ValuesReconstructState::new(), - ctx, - ) - .await; - - if self.conf.validate_vectored_get { - self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx) - .await; - } - - vectored_res - } - }; + let res = self + .get_vectored_impl( + keyspace.clone(), + lsn, + &mut ValuesReconstructState::new(), + ctx, + ) + .await; if let Some((metric, start)) = start { let elapsed = start.elapsed(); @@ -1127,7 +1045,6 @@ impl Timeline { /// scan iterator interface. We could optimize this interface later to avoid some checks in the vectored /// get path to maintain and split the probing and to-be-probe keyspace. We also need to ensure that /// the scan operation will not cause OOM in the future. - #[allow(dead_code)] pub(crate) async fn scan( &self, keyspace: KeySpace, @@ -1184,65 +1101,6 @@ impl Timeline { vectored_res } - /// Not subject to [`Self::timeline_get_throttle`]. - pub(super) async fn get_vectored_sequential_impl( - &self, - keyspace: KeySpace, - lsn: Lsn, - ctx: &RequestContext, - ) -> Result>, GetVectoredError> { - let mut values = BTreeMap::new(); - - for range in keyspace.ranges { - let mut key = range.start; - while key != range.end { - let block = self - .get_impl(key, lsn, ValueReconstructState::default(), ctx) - .await; - - use PageReconstructError::*; - match block { - Err(Cancelled) => return Err(GetVectoredError::Cancelled), - Err(MissingKey(_)) - if NON_INHERITED_RANGE.contains(&key) - || NON_INHERITED_SPARSE_RANGE.contains(&key) => - { - // Ignore missing key error for aux key range. TODO: currently, we assume non_inherited_range == aux_key_range. - // When we add more types of keys into the page server, we should revisit this part of code and throw errors - // accordingly. 
- key = key.next(); - } - Err(MissingKey(err)) => { - return Err(GetVectoredError::MissingKey(err)); - } - Err(Other(err)) - if err - .to_string() - .contains("downloading evicted layer file failed") => - { - return Err(GetVectoredError::Other(err)) - } - Err(Other(err)) - if err - .chain() - .any(|cause| cause.to_string().contains("layer loading failed")) => - { - // The intent here is to achieve error parity with the vectored read path. - // When vectored read fails to load a layer it fails the whole read, hence - // we mimic this behaviour here to keep the validation happy. - return Err(GetVectoredError::Other(err)); - } - _ => { - values.insert(key, block); - key = key.next(); - } - } - } - } - - Ok(values) - } - pub(super) async fn get_vectored_impl( &self, keyspace: KeySpace, @@ -1259,7 +1117,7 @@ impl Timeline { let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME .for_get_kind(get_kind) .start_timer(); - self.get_vectored_reconstruct_data(keyspace, lsn, reconstruct_state, ctx) + self.get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, ctx) .await?; get_data_timer.stop_and_record(); @@ -1289,123 +1147,30 @@ impl Timeline { // (this is a requirement, not a bug). Skip updating the metric in these cases // to avoid infinite results. if !results.is_empty() { + let avg = layers_visited as f64 / results.len() as f64; + if avg >= Self::VEC_GET_LAYERS_VISITED_WARN_THRESH { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60)))); + let mut rate_limit = LOGGED.lock().unwrap(); + rate_limit.call(|| { + tracing::info!( + shard_id = %self.tenant_shard_id.shard_slug(), + lsn = %lsn, + "Vectored read for {} visited {} layers on average per key and {} in total. {}/{} pages were returned", + keyspace, avg, layers_visited, results.len(), keyspace.total_raw_size()); + }); + } + // Note that this is an approximation. 
Tracking the exact number of layers visited // per key requires virtually unbounded memory usage and is inefficient // (i.e. segment tree tracking each range queried from a layer) - crate::metrics::VEC_READ_NUM_LAYERS_VISITED - .observe(layers_visited as f64 / results.len() as f64); + crate::metrics::VEC_READ_NUM_LAYERS_VISITED.observe(avg); } Ok(results) } - /// Not subject to [`Self::timeline_get_throttle`]. - pub(super) async fn validate_get_vectored_impl( - &self, - vectored_res: &Result>, GetVectoredError>, - keyspace: KeySpace, - lsn: Lsn, - ctx: &RequestContext, - ) { - if keyspace.overlaps(&Key::metadata_key_range()) { - // skip validation for metadata key range - return; - } - - let sequential_res = self - .get_vectored_sequential_impl(keyspace.clone(), lsn, ctx) - .await; - - fn errors_match(lhs: &GetVectoredError, rhs: &GetVectoredError) -> bool { - use GetVectoredError::*; - match (lhs, rhs) { - (Oversized(l), Oversized(r)) => l == r, - (InvalidLsn(l), InvalidLsn(r)) => l == r, - (MissingKey(l), MissingKey(r)) => l.key == r.key, - (GetReadyAncestorError(_), GetReadyAncestorError(_)) => true, - (Other(_), Other(_)) => true, - _ => false, - } - } - - match (&sequential_res, vectored_res) { - (Err(GetVectoredError::Cancelled), _) => {}, - (_, Err(GetVectoredError::Cancelled)) => {}, - (Err(seq_err), Ok(_)) => { - panic!(concat!("Sequential get failed with {}, but vectored get did not", - " - keyspace={:?} lsn={}"), - seq_err, keyspace, lsn) }, - (Ok(_), Err(GetVectoredError::GetReadyAncestorError(GetReadyAncestorError::AncestorLsnTimeout(_)))) => { - // Sequential get runs after vectored get, so it is possible for the later - // to time out while waiting for its ancestor's Lsn to become ready and for the - // former to succeed (it essentially has a doubled wait time). 
- }, - (Ok(_), Err(vec_err)) => { - panic!(concat!("Vectored get failed with {}, but sequential get did not", - " - keyspace={:?} lsn={}"), - vec_err, keyspace, lsn) }, - (Err(seq_err), Err(vec_err)) => { - assert!(errors_match(seq_err, vec_err), - "Mismatched errors: {seq_err} != {vec_err} - keyspace={keyspace:?} lsn={lsn}")}, - (Ok(seq_values), Ok(vec_values)) => { - seq_values.iter().zip(vec_values.iter()).for_each(|((seq_key, seq_res), (vec_key, vec_res))| { - assert_eq!(seq_key, vec_key); - match (seq_res, vec_res) { - (Ok(seq_blob), Ok(vec_blob)) => { - Self::validate_key_equivalence(seq_key, &keyspace, lsn, seq_blob, vec_blob); - }, - (Err(err), Ok(_)) => { - panic!( - concat!("Sequential get failed with {} for key {}, but vectored get did not", - " - keyspace={:?} lsn={}"), - err, seq_key, keyspace, lsn) }, - (Ok(_), Err(err)) => { - panic!( - concat!("Vectored get failed with {} for key {}, but sequential get did not", - " - keyspace={:?} lsn={}"), - err, seq_key, keyspace, lsn) }, - (Err(_), Err(_)) => {} - } - }) - } - } - } - - fn validate_key_equivalence( - key: &Key, - keyspace: &KeySpace, - lsn: Lsn, - seq: &Bytes, - vec: &Bytes, - ) { - if *key == AUX_FILES_KEY { - // The value reconstruct of AUX_FILES_KEY from records is not deterministic - // since it uses a hash map under the hood. Hence, deserialise both results - // before comparing. - let seq_aux_dir_res = AuxFilesDirectory::des(seq); - let vec_aux_dir_res = AuxFilesDirectory::des(vec); - match (&seq_aux_dir_res, &vec_aux_dir_res) { - (Ok(seq_aux_dir), Ok(vec_aux_dir)) => { - assert_eq!( - seq_aux_dir, vec_aux_dir, - "Mismatch for key {} - keyspace={:?} lsn={}", - key, keyspace, lsn - ); - } - (Err(_), Err(_)) => {} - _ => { - panic!("Mismatch for {key}: {seq_aux_dir_res:?} != {vec_aux_dir_res:?}"); - } - } - } else { - // All other keys should reconstruct deterministically, so we simply compare the blobs. 
- assert_eq!( - seq, vec, - "Image mismatch for key {key} - keyspace={keyspace:?} lsn={lsn}" - ); - } - } - /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev. pub(crate) fn get_last_record_lsn(&self) -> Lsn { self.last_record_lsn.load().last @@ -1449,12 +1214,7 @@ impl Timeline { /// Hence, the result **does not represent local filesystem usage**. pub(crate) async fn layer_size_sum(&self) -> u64 { let guard = self.layers.read().await; - let layer_map = guard.layer_map(); - let mut size = 0; - for l in layer_map.iter_historic_layers() { - size += l.file_size; - } - size + guard.layer_size_sum() } pub(crate) fn resident_physical_size(&self) -> u64 { @@ -1554,7 +1314,7 @@ impl Timeline { ) -> anyhow::Result<()> { ensure!( lsn >= **latest_gc_cutoff_lsn, - "LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)", + "LSN {} is earlier than latest GC cutoff {} (we might've already garbage collected needed data)", lsn, **latest_gc_cutoff_lsn, ); @@ -1585,7 +1345,13 @@ impl Timeline { let existing_lease = occupied.get_mut(); if valid_until > existing_lease.valid_until { existing_lease.valid_until = valid_until; + let dt: DateTime = valid_until.into(); + info!("lease extended to {}", dt); + } else { + let dt: DateTime = existing_lease.valid_until.into(); + info!("existing lease covers greater length, valid until {}", dt); } + existing_lease.clone() } else { // Reject already GC-ed LSN (lsn < latest_gc_cutoff) @@ -1594,6 +1360,8 @@ impl Timeline { bail!("tried to request a page version that was garbage collected. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn); } + let dt: DateTime = valid_until.into(); + info!("lease created, valid until {}", dt); entry.or_insert(LsnLease { valid_until }).clone() } }; @@ -1613,16 +1381,15 @@ impl Timeline { // This exists to provide a non-span creating version of `freeze_and_flush` we can call without // polluting the span hierarchy. 
pub(crate) async fn freeze_and_flush0(&self) -> Result<(), FlushLayerError> { - let to_lsn = { + let token = { // Freeze the current open in-memory layer. It will be written to disk on next // iteration. let mut g = self.write_lock.lock().await; let to_lsn = self.get_last_record_lsn(); - self.freeze_inmem_layer_at(to_lsn, &mut g).await; - to_lsn + self.freeze_inmem_layer_at(to_lsn, &mut g).await? }; - self.flush_frozen_layers_and_wait(to_lsn).await + self.wait_flush_completion(token).await } // Check if an open ephemeral layer should be closed: this provides @@ -1636,12 +1403,20 @@ impl Timeline { return; }; + // FIXME: why not early exit? because before #7927 the state would have been cleared every + // time, and this was missed. + // if write_guard.is_none() { return; } + let Ok(layers_guard) = self.layers.try_read() else { // Don't block if the layer lock is busy return; }; - let Some(open_layer) = &layers_guard.layer_map().open_layer else { + let Ok(lm) = layers_guard.layer_map() else { + return; + }; + + let Some(open_layer) = &lm.open_layer else { // If there is no open layer, we have no layer freezing to do. However, we might need to generate // some updates to disk_consistent_lsn and remote_consistent_lsn, in case we ingested some WAL regions // that didn't result in writes to this shard. @@ -1667,9 +1442,16 @@ impl Timeline { ); // The flush loop will update remote consistent LSN as well as disk consistent LSN. - self.flush_frozen_layers_and_wait(last_record_lsn) - .await - .ok(); + // We know there is no open layer, so we can request freezing without actually + // freezing anything. This is true even if we have dropped the layers_guard, we + // still hold the write_guard.
+ let _ = async { + let token = self + .freeze_inmem_layer_at(last_record_lsn, &mut write_guard) + .await?; + self.wait_flush_completion(token).await + } + .await; } } @@ -1707,45 +1489,39 @@ impl Timeline { self.last_freeze_at.load(), open_layer.get_opened_at(), ) { - let at_lsn = match open_layer.info() { + match open_layer.info() { InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => { // We may reach this point if the layer was already frozen by not yet flushed: flushing // happens asynchronously in the background. tracing::debug!( "Not freezing open layer, it's already frozen ({lsn_start}..{lsn_end})" ); - None } InMemoryLayerInfo::Open { .. } => { // Upgrade to a write lock and freeze the layer drop(layers_guard); - let mut layers_guard = self.layers.write().await; - let froze = layers_guard - .try_freeze_in_memory_layer( - current_lsn, - &self.last_freeze_at, - &mut write_guard, - ) + let res = self + .freeze_inmem_layer_at(current_lsn, &mut write_guard) .await; - Some(current_lsn).filter(|_| froze) - } - }; - if let Some(lsn) = at_lsn { - let res: Result = self.flush_frozen_layers(lsn); - if let Err(e) = res { - tracing::info!("failed to flush frozen layer after background freeze: {e:#}"); + + if let Err(e) = res { + tracing::info!( + "failed to flush frozen layer after background freeze: {e:#}" + ); + } } } } } - /// Outermost timeline compaction operation; downloads needed layers. + /// Outermost timeline compaction operation; downloads needed layers. Returns whether we have pending + /// compaction tasks. pub(crate) async fn compact( self: &Arc, cancel: &CancellationToken, flags: EnumSet, ctx: &RequestContext, - ) -> Result<(), CompactionError> { + ) -> Result { // most likely the cancellation token is from background task, but in tests it could be the // request task as well. @@ -1765,8 +1541,8 @@ impl Timeline { // compaction task goes over it's period (20s) which is quite often in production. let (_guard, _permit) = tokio::select! 
{ tuple = prepare => { tuple }, - _ = self.cancel.cancelled() => return Ok(()), - _ = cancel.cancelled() => return Ok(()), + _ = self.cancel.cancelled() => return Ok(false), + _ = cancel.cancelled() => return Ok(false), }; let last_record_lsn = self.get_last_record_lsn(); @@ -1774,11 +1550,14 @@ impl Timeline { // Last record Lsn could be zero in case the timeline was just created if !last_record_lsn.is_valid() { warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}"); - return Ok(()); + return Ok(false); } match self.get_compaction_algorithm_settings().kind { - CompactionAlgorithm::Tiered => self.compact_tiered(cancel, ctx).await, + CompactionAlgorithm::Tiered => { + self.compact_tiered(cancel, ctx).await?; + Ok(false) + } CompactionAlgorithm::Legacy => self.compact_legacy(cancel, flags, ctx).await, } } @@ -1867,6 +1646,20 @@ impl Timeline { self.last_record_lsn.shutdown(); if try_freeze_and_flush { + if let Some((open, frozen)) = self + .layers + .read() + .await + .layer_map() + .map(|lm| (lm.open_layer.is_some(), lm.frozen_layers.len())) + .ok() + .filter(|(open, frozen)| *open || *frozen > 0) + { + tracing::info!(?open, frozen, "flushing and freezing on shutdown"); + } else { + // this is double-shutdown, ignore it + } + // we shut down walreceiver above, so, we won't add anything more // to the InMemoryLayer; freeze it and wait for all frozen layers // to reach the disk & upload queue, then shut the upload queue and @@ -1883,6 +1676,11 @@ impl Timeline { // about corner cases like s3 suddenly hanging up? self.remote_client.shutdown().await; } + Err(FlushLayerError::Cancelled) => { + // this is likely the second shutdown, ignore silently. + // TODO: this can be removed once https://github.com/neondatabase/neon/issues/5080 + debug_assert!(self.cancel.is_cancelled()); + } Err(e) => { // Non-fatal. Shutdown is infallible. 
Failures to flush just mean that // we have some extra WAL replay to do next time the timeline starts. @@ -1895,9 +1693,13 @@ impl Timeline { tracing::debug!("Cancelling CancellationToken"); self.cancel.cancel(); + // Ensure Prevent new page service requests from starting. + self.handles.shutdown(); + // Transition the remote_client into a state where it's only useful for timeline deletion. // (The deletion use case is why we can't just hook up remote_client to Self::cancel).) self.remote_client.stop(); + // As documented in remote_client.stop()'s doc comment, it's our responsibility // to shut down the upload queue tasks. // TODO: fix that, task management should be encapsulated inside remote_client. @@ -1908,10 +1710,17 @@ impl Timeline { ) .await; - // TODO: work toward making this a no-op. See this funciton's doc comment for more context. + // TODO: work toward making this a no-op. See this function's doc comment for more context. tracing::debug!("Waiting for tasks..."); task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), Some(self.timeline_id)).await; + { + // Allow any remaining in-memory layers to do cleanup -- until that, they hold the gate + // open. + let mut write_guard = self.write_lock.lock().await; + self.layers.write().await.shutdown(&mut write_guard); + } + // Finally wait until any gate-holders are complete. 
// // TODO: once above shutdown_tasks is a no-op, we can close the gate before calling shutdown_tasks @@ -1967,6 +1776,11 @@ impl Timeline { self.current_state() == TimelineState::Active } + #[allow(unused)] + pub(crate) fn is_archived(&self) -> Option { + self.remote_client.is_archived() + } + pub(crate) fn is_stopping(&self) -> bool { self.current_state() == TimelineState::Stopping } @@ -2000,9 +1814,12 @@ impl Timeline { } } - pub(crate) async fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo { + pub(crate) async fn layer_map_info( + &self, + reset: LayerAccessStatsReset, + ) -> Result { let guard = self.layers.read().await; - let layer_map = guard.layer_map(); + let layer_map = guard.layer_map()?; let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1); if let Some(open_layer) = &layer_map.open_layer { in_memory_layers.push(open_layer.info()); @@ -2011,16 +1828,15 @@ impl Timeline { in_memory_layers.push(frozen_layer.info()); } - let mut historic_layers = Vec::new(); - for historic_layer in layer_map.iter_historic_layers() { - let historic_layer = guard.get_from_desc(&historic_layer); - historic_layers.push(historic_layer.info(reset)); - } + let historic_layers = layer_map + .iter_historic_layers() + .map(|desc| guard.get_from_desc(&desc).info(reset)) + .collect(); - LayerMapInfo { + Ok(LayerMapInfo { in_memory_layers, historic_layers, - } + }) } #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))] @@ -2028,7 +1844,7 @@ impl Timeline { &self, layer_file_name: &LayerName, ) -> anyhow::Result> { - let Some(layer) = self.find_layer(layer_file_name).await else { + let Some(layer) = self.find_layer(layer_file_name).await? 
else { return Ok(None); }; @@ -2049,7 +1865,7 @@ impl Timeline { .enter() .map_err(|_| anyhow::anyhow!("Shutting down"))?; - let Some(local_layer) = self.find_layer(layer_file_name).await else { + let Some(local_layer) = self.find_layer(layer_file_name).await? else { return Ok(None); }; @@ -2292,6 +2108,11 @@ impl Timeline { ) }; + if let Some(ancestor) = &ancestor { + let mut ancestor_gc_info = ancestor.gc_info.write().unwrap(); + ancestor_gc_info.insert_child(timeline_id, metadata.ancestor_lsn()); + } + Arc::new_cyclic(|myself| { let metrics = TimelineMetrics::new( &tenant_shard_id, @@ -2370,6 +2191,7 @@ impl Timeline { )), repartition_threshold: 0, last_image_layer_creation_check_at: AtomicLsn::new(0), + last_image_layer_creation_check_instant: Mutex::new(None), last_received_wal: Mutex::new(None), rel_size_cache: RwLock::new(RelSizeCache { @@ -2407,6 +2229,10 @@ impl Timeline { #[cfg(test)] extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())), + + l0_flush_global_state: resources.l0_flush_global_state, + + handles: Default::default(), }; result.repartition_threshold = result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE; @@ -2456,10 +2282,9 @@ impl Timeline { task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::LayerFlushTask, - Some(self.tenant_shard_id), + self.tenant_shard_id, Some(self.timeline_id), "layer flush task", - false, async move { let _guard = guard; let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error); @@ -2526,7 +2351,10 @@ impl Timeline { let mut layers = self.layers.try_write().expect( "in the context where we call this function, no other task has access to the object", ); - layers.initialize_empty(Lsn(start_lsn.0)); + layers + .open_mut() + .expect("in this context the LayerManager must still be open") + .initialize_empty(Lsn(start_lsn.0)); } /// Scan the timeline directory, cleanup, populate the layer map, and schedule uploads for 
local-only @@ -2658,7 +2486,10 @@ impl Timeline { let num_layers = loaded_layers.len(); - guard.initialize_local_layers(loaded_layers, disk_consistent_lsn + 1); + guard + .open_mut() + .expect("layermanager must be open during init") + .initialize_local_layers(loaded_layers, disk_consistent_lsn + 1); self.remote_client .schedule_layer_file_deletion(&needs_cleanup)?; @@ -2691,6 +2522,10 @@ impl Timeline { // Tenant::create_timeline will wait for these uploads to happen before returning, or // on retry. + // Now that we have the full layer map, we may calculate the visibility of layers within it (a global scan) + drop(guard); // drop write lock, update_layer_visibility will take a read lock. + self.update_layer_visibility().await?; + info!( "loaded layer map with {} layers at {}, total physical size: {}", num_layers, disk_consistent_lsn, total_physical_size @@ -2801,10 +2636,9 @@ impl Timeline { task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::InitialLogicalSizeCalculation, - Some(self.tenant_shard_id), + self.tenant_shard_id, Some(self.timeline_id), "initial size calculation", - false, // NB: don't log errors here, task_mgr will do that. 
async move { let cancel = task_mgr::shutdown_token(); @@ -2970,10 +2804,9 @@ impl Timeline { task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::OndemandLogicalSizeCalculation, - Some(self.tenant_shard_id), + self.tenant_shard_id, Some(self.timeline_id), "ondemand logical size calculation", - false, async move { let res = self_clone .logical_size_calculation_task(lsn, cause, &ctx) @@ -3113,16 +2946,17 @@ impl Timeline { } } - async fn find_layer(&self, layer_name: &LayerName) -> Option { + async fn find_layer( + &self, + layer_name: &LayerName, + ) -> Result, layer_manager::Shutdown> { let guard = self.layers.read().await; - for historic_layer in guard.layer_map().iter_historic_layers() { - let historic_layer_name = historic_layer.layer_name(); - if layer_name == &historic_layer_name { - return Some(guard.get_from_desc(&historic_layer)); - } - } - - None + let layer = guard + .layer_map()? + .iter_historic_layers() + .find(|l| &l.layer_name() == layer_name) + .map(|found| guard.get_from_desc(&found)); + Ok(layer) } /// The timeline heatmap is a hint to secondary locations from the primary location, @@ -3139,17 +2973,37 @@ impl Timeline { let guard = self.layers.read().await; - let resident = guard.likely_resident_layers().map(|layer| { - let last_activity_ts = layer.access_stats().latest_activity_or_now(); - - HeatMapLayer::new( - layer.layer_desc().layer_name(), - layer.metadata(), - last_activity_ts, - ) + let resident = guard.likely_resident_layers().filter_map(|layer| { + match layer.visibility() { + LayerVisibilityHint::Visible => { + // Layer is visible to one or more read LSNs: elegible for inclusion in layer map + let last_activity_ts = layer.latest_activity(); + Some((layer.layer_desc(), layer.metadata(), last_activity_ts)) + } + LayerVisibilityHint::Covered => { + // Layer is resident but unlikely to be read: not elegible for inclusion in heatmap. 
+ None + } + } }); - let layers = resident.collect(); + let mut layers = resident.collect::>(); + + // Sort layers in order of which to download first. For a large set of layers to download, we + // want to prioritize those layers which are most likely to still be in the resident many minutes + // or hours later: + // - Download L0s last, because they churn the fastest: L0s on a fast-writing tenant might + // only exist for a few minutes before being compacted into L1s. + // - For L1 & image layers, download most recent LSNs first: the older the LSN, the sooner + // the layer is likely to be covered by an image layer during compaction. + layers.sort_by_key(|(desc, _meta, _atime)| { + std::cmp::Reverse((!LayerMap::is_l0(&desc.key_range), desc.lsn_range.end)) + }); + + let layers = layers + .into_iter() + .map(|(desc, meta, atime)| HeatMapLayer::new(desc.layer_name(), meta, atime)) + .collect(); Some(HeatMapTimeline::new(self.timeline_id, layers)) } @@ -3164,229 +3018,9 @@ impl Timeline { } } -type TraversalId = Arc; - -trait TraversalLayerExt { - fn traversal_id(&self) -> TraversalId; -} - -impl TraversalLayerExt for Layer { - fn traversal_id(&self) -> TraversalId { - Arc::clone(self.debug_str()) - } -} - -impl TraversalLayerExt for Arc { - fn traversal_id(&self) -> TraversalId { - Arc::clone(self.local_path_str()) - } -} - impl Timeline { - /// - /// Get a handle to a Layer for reading. - /// - /// The returned Layer might be from an ancestor timeline, if the - /// segment hasn't been updated on this timeline yet. - /// - /// This function takes the current timeline's locked LayerMap as an argument, - /// so callers can avoid potential race conditions. - /// - /// # Cancel-Safety - /// - /// This method is cancellation-safe. - async fn get_reconstruct_data( - &self, - key: Key, - request_lsn: Lsn, - reconstruct_state: &mut ValueReconstructState, - ctx: &RequestContext, - ) -> Result, PageReconstructError> { - // Start from the current timeline. 
- let mut timeline_owned; - let mut timeline = self; - - let mut read_count = scopeguard::guard(0, |cnt| { - crate::metrics::READ_NUM_LAYERS_VISITED.observe(cnt as f64) - }); - - // For debugging purposes, collect the path of layers that we traversed - // through. It's included in the error message if we fail to find the key. - let mut traversal_path = Vec::::new(); - - let cached_lsn = if let Some((cached_lsn, _)) = &reconstruct_state.img { - *cached_lsn - } else { - Lsn(0) - }; - - // 'prev_lsn' tracks the last LSN that we were at in our search. It's used - // to check that each iteration make some progress, to break infinite - // looping if something goes wrong. - let mut prev_lsn = None; - - let mut result = ValueReconstructResult::Continue; - let mut cont_lsn = Lsn(request_lsn.0 + 1); - - 'outer: loop { - if self.cancel.is_cancelled() { - return Err(PageReconstructError::Cancelled); - } - - // The function should have updated 'state' - //info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn); - match result { - ValueReconstructResult::Complete => return Ok(traversal_path), - ValueReconstructResult::Continue => { - // If we reached an earlier cached page image, we're done. - if cont_lsn == cached_lsn + 1 { - MATERIALIZED_PAGE_CACHE_HIT.inc_by(1); - return Ok(traversal_path); - } - if let Some(prev) = prev_lsn { - if prev <= cont_lsn { - // Didn't make any progress in last iteration. Error out to avoid - // getting stuck in the loop. 
- return Err(PageReconstructError::MissingKey(MissingKeyError { - key, - shard: self.shard_identity.get_shard_number(&key), - cont_lsn: Lsn(cont_lsn.0 - 1), - request_lsn, - ancestor_lsn: Some(timeline.ancestor_lsn), - traversal_path, - backtrace: None, - })); - } - } - prev_lsn = Some(cont_lsn); - } - ValueReconstructResult::Missing => { - return Err(PageReconstructError::MissingKey(MissingKeyError { - key, - shard: self.shard_identity.get_shard_number(&key), - cont_lsn, - request_lsn, - ancestor_lsn: None, - traversal_path, - backtrace: if cfg!(test) { - Some(std::backtrace::Backtrace::force_capture()) - } else { - None - }, - })); - } - } - - // Recurse into ancestor if needed - if let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() { - if key.is_inherited_key() && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn { - trace!( - "going into ancestor {}, cont_lsn is {}", - timeline.ancestor_lsn, - cont_lsn - ); - - timeline_owned = timeline - .get_ready_ancestor_timeline(ancestor_timeline, ctx) - .await?; - timeline = &*timeline_owned; - prev_lsn = None; - continue 'outer; - } - } - - let guard = timeline.layers.read().await; - let layers = guard.layer_map(); - - // Check the open and frozen in-memory layers first, in order from newest - // to oldest. - if let Some(open_layer) = &layers.open_layer { - let start_lsn = open_layer.get_lsn_range().start; - if cont_lsn > start_lsn { - //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.layer_name().display()); - // Get all the data needed to reconstruct the page version from this layer. - // But if we have an older cached page image, no need to go past that. 
- let lsn_floor = max(cached_lsn + 1, start_lsn); - - let open_layer = open_layer.clone(); - drop(guard); - - result = match open_layer - .get_value_reconstruct_data( - key, - lsn_floor..cont_lsn, - reconstruct_state, - ctx, - ) - .await - { - Ok(result) => result, - Err(e) => return Err(PageReconstructError::from(e)), - }; - cont_lsn = lsn_floor; - *read_count += 1; - traversal_path.push((result, cont_lsn, open_layer.traversal_id())); - continue 'outer; - } - } - for frozen_layer in layers.frozen_layers.iter().rev() { - let start_lsn = frozen_layer.get_lsn_range().start; - if cont_lsn > start_lsn { - //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.layer_name().display()); - let lsn_floor = max(cached_lsn + 1, start_lsn); - - let frozen_layer = frozen_layer.clone(); - drop(guard); - - result = match frozen_layer - .get_value_reconstruct_data( - key, - lsn_floor..cont_lsn, - reconstruct_state, - ctx, - ) - .await - { - Ok(result) => result, - Err(e) => return Err(PageReconstructError::from(e)), - }; - cont_lsn = lsn_floor; - *read_count += 1; - traversal_path.push((result, cont_lsn, frozen_layer.traversal_id())); - continue 'outer; - } - } - - if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) { - let layer = guard.get_from_desc(&layer); - drop(guard); - // Get all the data needed to reconstruct the page version from this layer. - // But if we have an older cached page image, no need to go past that. - let lsn_floor = max(cached_lsn + 1, lsn_floor); - result = match layer - .get_value_reconstruct_data(key, lsn_floor..cont_lsn, reconstruct_state, ctx) - .await - { - Ok(result) => result, - Err(e) => return Err(PageReconstructError::from(e)), - }; - cont_lsn = lsn_floor; - *read_count += 1; - traversal_path.push((result, cont_lsn, layer.traversal_id())); - continue 'outer; - } else if timeline.ancestor_timeline.is_some() { - // Nothing on this timeline. 
Traverse to parent - result = ValueReconstructResult::Continue; - cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1); - continue 'outer; - } else { - // Nothing found - result = ValueReconstructResult::Missing; - continue 'outer; - } - } - } - + #[allow(unknown_lints)] // doc_lazy_continuation is still a new lint + #[allow(clippy::doc_lazy_continuation)] /// Get the data needed to reconstruct all keys in the provided keyspace /// /// The algorithm is as follows: @@ -3465,8 +3099,7 @@ impl Timeline { cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1)); timeline_owned = timeline .get_ready_ancestor_timeline(ancestor_timeline, ctx) - .await - .map_err(GetVectoredError::GetReadyAncestorError)?; + .await?; timeline = &*timeline_owned; }; @@ -3479,7 +3112,6 @@ impl Timeline { cont_lsn, request_lsn, ancestor_lsn: Some(timeline.ancestor_lsn), - traversal_path: vec![], backtrace: None, })); } @@ -3538,7 +3170,7 @@ impl Timeline { // which turns out to be a perf bottleneck in some cases. if !unmapped_keyspace.is_empty() { let guard = timeline.layers.read().await; - let layers = guard.layer_map(); + let layers = guard.layer_map()?; let in_memory_layer = layers.find_in_memory_layer(|l| { let start_lsn = l.get_lsn_range().start; @@ -3614,26 +3246,6 @@ impl Timeline { }) } - /// # Cancel-safety - /// - /// This method is cancellation-safe. - async fn lookup_cached_page( - &self, - key: &Key, - lsn: Lsn, - ctx: &RequestContext, - ) -> Option<(Lsn, Bytes)> { - let cache = page_cache::get(); - - // FIXME: It's pointless to check the cache for things that are not 8kB pages. 
- // We should look at the key to determine if it's a cacheable object - let (lsn, read_guard) = cache - .lookup_materialized_page(self.tenant_shard_id, self.timeline_id, key, lsn, ctx) - .await?; - let img = Bytes::from(read_guard.to_vec()); - Some((lsn, img)) - } - async fn get_ready_ancestor_timeline( &self, ancestor: &Arc, @@ -3691,30 +3303,50 @@ impl Timeline { Ok(ancestor.clone()) } - pub(crate) fn get_ancestor_timeline(&self) -> Option> { - self.ancestor_timeline.clone() - } - pub(crate) fn get_shard_identity(&self) -> &ShardIdentity { &self.shard_identity } + #[inline(always)] + pub(crate) fn shard_timeline_id(&self) -> ShardTimelineId { + ShardTimelineId { + shard_index: ShardIndex { + shard_number: self.shard_identity.number, + shard_count: self.shard_identity.count, + }, + timeline_id: self.timeline_id, + } + } + + /// Returns a non-frozen open in-memory layer for ingestion. /// - /// Get a handle to the latest layer for appending. - /// + /// Takes a witness of timeline writer state lock being held, because it makes no sense to call + /// this function without holding the mutex. async fn get_layer_for_write( &self, lsn: Lsn, + _guard: &tokio::sync::MutexGuard<'_, Option>, ctx: &RequestContext, ) -> anyhow::Result> { let mut guard = self.layers.write().await; + let gate_guard = self.gate.enter().context("enter gate for inmem layer")?; + + let last_record_lsn = self.get_last_record_lsn(); + ensure!( + lsn > last_record_lsn, + "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})", + lsn, + last_record_lsn, + ); + let layer = guard + .open_mut()? .get_layer_for_write( lsn, - self.get_last_record_lsn(), self.conf, self.timeline_id, self.tenant_shard_id, + gate_guard, ctx, ) .await?; @@ -3728,21 +3360,48 @@ impl Timeline { self.last_record_lsn.advance(new_lsn); } + /// Freeze any existing open in-memory layer and unconditionally notify the flush loop. 
+ /// + /// Unconditional flush loop notification is given because in sharded cases we will want to + /// leave an Lsn gap. Unsharded tenants do not have Lsn gaps. async fn freeze_inmem_layer_at( &self, at: Lsn, write_lock: &mut tokio::sync::MutexGuard<'_, Option>, - ) { + ) -> Result { let frozen = { let mut guard = self.layers.write().await; guard + .open_mut()? .try_freeze_in_memory_layer(at, &self.last_freeze_at, write_lock) .await }; + if frozen { let now = Instant::now(); *(self.last_freeze_ts.write().unwrap()) = now; } + + // Increment the flush cycle counter and wake up the flush task. + // Remember the new value, so that when we listen for the flush + // to finish, we know when the flush that we initiated has + // finished, instead of some other flush that was started earlier. + let mut my_flush_request = 0; + + let flush_loop_state = { *self.flush_loop_state.lock().unwrap() }; + if !matches!(flush_loop_state, FlushLoopState::Running { .. }) { + return Err(FlushLayerError::NotRunning(flush_loop_state)); + } + + self.layer_flush_start_tx.send_modify(|(counter, lsn)| { + my_flush_request = *counter + 1; + *counter = my_flush_request; + *lsn = std::cmp::max(at, *lsn); + }); + + assert_ne!(my_flush_request, 0); + + Ok(my_flush_request) } /// Layer flusher task's main loop. @@ -3779,7 +3438,11 @@ impl Timeline { let layer_to_flush = { let guard = self.layers.read().await; - guard.layer_map().frozen_layers.front().cloned() + let Ok(lm) = guard.layer_map() else { + info!("dropping out of flush loop for timeline shutdown"); + return; + }; + lm.frozen_layers.front().cloned() // drop 'layers' lock to allow concurrent reads and writes }; let Some(layer_to_flush) = layer_to_flush else { @@ -3836,34 +3499,7 @@ impl Timeline { } } - /// Request the flush loop to write out all frozen layers up to `at_lsn` as Delta L0 files to disk. - /// The caller is responsible for the freezing, e.g., [`Self::freeze_inmem_layer_at`]. 
- /// - /// `at_lsn` may be higher than the highest LSN of a frozen layer: if this is the - /// case, it means no data will be written between the top of the highest frozen layer and - /// to_lsn, e.g. because this tenant shard has ingested up to to_lsn and not written any data - /// locally for that part of the WAL. - fn flush_frozen_layers(&self, at_lsn: Lsn) -> Result { - // Increment the flush cycle counter and wake up the flush task. - // Remember the new value, so that when we listen for the flush - // to finish, we know when the flush that we initiated has - // finished, instead of some other flush that was started earlier. - let mut my_flush_request = 0; - - let flush_loop_state = { *self.flush_loop_state.lock().unwrap() }; - if !matches!(flush_loop_state, FlushLoopState::Running { .. }) { - return Err(FlushLayerError::NotRunning(flush_loop_state)); - } - - self.layer_flush_start_tx.send_modify(|(counter, lsn)| { - my_flush_request = *counter + 1; - *counter = my_flush_request; - *lsn = std::cmp::max(at_lsn, *lsn); - }); - - Ok(my_flush_request) - } - + /// Waits any flush request created by [`Self::freeze_inmem_layer_at`] to complete. async fn wait_flush_completion(&self, request: u64) -> Result<(), FlushLayerError> { let mut rx = self.layer_flush_done_tx.subscribe(); loop { @@ -3896,11 +3532,6 @@ impl Timeline { } } - async fn flush_frozen_layers_and_wait(&self, at_lsn: Lsn) -> Result<(), FlushLayerError> { - let token = self.flush_frozen_layers(at_lsn)?; - self.wait_flush_completion(token).await - } - /// Flush one frozen in-memory layer to disk, as a new delta layer. /// /// Return value is the last lsn (inclusive) of the layer that was frozen. @@ -3959,34 +3590,6 @@ impl Timeline { return Err(FlushLayerError::Cancelled); } - // FIXME(auxfilesv2): support multiple metadata key partitions might need initdb support as well? - // This code path will not be hit during regression tests. After #7099 we have a single partition - // with two key ranges. 
If someone wants to fix initdb optimization in the future, this might need - // to be fixed. - - // For metadata, always create delta layers. - let delta_layer = if !metadata_partition.parts.is_empty() { - assert_eq!( - metadata_partition.parts.len(), - 1, - "currently sparse keyspace should only contain a single metadata keyspace" - ); - let metadata_keyspace = &metadata_partition.parts[0]; - self.create_delta_layer( - &frozen_layer, - Some( - metadata_keyspace.0.ranges.first().unwrap().start - ..metadata_keyspace.0.ranges.last().unwrap().end, - ), - ctx, - ) - .await - .map_err(|e| FlushLayerError::from_anyhow(self, e))? - } else { - None - }; - - // For image layers, we add them immediately into the layer map. let mut layers_to_upload = Vec::new(); layers_to_upload.extend( self.create_image_layers( @@ -3997,13 +3600,27 @@ impl Timeline { ) .await?, ); - - if let Some(delta_layer) = delta_layer { - layers_to_upload.push(delta_layer.clone()); - (layers_to_upload, Some(delta_layer)) - } else { - (layers_to_upload, None) + if !metadata_partition.parts.is_empty() { + assert_eq!( + metadata_partition.parts.len(), + 1, + "currently sparse keyspace should only contain a single metadata keyspace" + ); + layers_to_upload.extend( + self.create_image_layers( + // Safety: create_image_layers treat sparse keyspaces differently that it does not scan + // every single key within the keyspace, and therefore, it's safe to force converting it + // into a dense keyspace before calling this function. + &metadata_partition.into_dense(), + self.initdb_lsn, + ImageLayerCreationMode::Initial, + ctx, + ) + .await?, + ); } + + (layers_to_upload, None) } else { // Normal case, write out a L0 delta layer file. // `create_delta_layer` will not modify the layer map. 
@@ -4037,11 +3654,11 @@ impl Timeline { { let mut guard = self.layers.write().await; - if self.cancel.is_cancelled() { - return Err(FlushLayerError::Cancelled); - } - - guard.finish_flush_l0_layer(delta_layer_to_add.as_ref(), &frozen_layer, &self.metrics); + guard.open_mut()?.finish_flush_l0_layer( + delta_layer_to_add.as_ref(), + &frozen_layer, + &self.metrics, + ); if self.set_disk_consistent_lsn(disk_consistent_lsn) { // Schedule remote uploads that will reflect our new disk_consistent_lsn @@ -4051,6 +3668,21 @@ impl Timeline { // release lock on 'layers' }; + // Backpressure mechanism: wait with continuation of the flush loop until we have uploaded all layer files. + // This makes us refuse ingest until the new layers have been persisted to the remote. + self.remote_client + .wait_completion() + .await + .map_err(|e| match e { + WaitCompletionError::UploadQueueShutDownOrStopped + | WaitCompletionError::NotInitialized( + NotInitialized::ShuttingDown | NotInitialized::Stopped, + ) => FlushLayerError::Cancelled, + WaitCompletionError::NotInitialized(NotInitialized::Uninitialized) => { + FlushLayerError::Other(anyhow!(e).into()) + } + })?; + // FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`, // a compaction can delete the file and then it won't be available for uploads any more. // We still schedule the upload, resulting in an error, but ideally we'd somehow avoid this @@ -4066,17 +3698,11 @@ impl Timeline { /// Return true if the value changed /// - /// This function must only be used from the layer flush task, and may not be called concurrently. + /// This function must only be used from the layer flush task. fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool { - // We do a simple load/store cycle: that's why this function isn't safe for concurrent use. 
- let old_value = self.disk_consistent_lsn.load(); - if new_value != old_value { - assert!(new_value >= old_value); - self.disk_consistent_lsn.store(new_value); - true - } else { - false - } + let old_value = self.disk_consistent_lsn.fetch_max(new_value); + assert!(new_value >= old_value, "disk_consistent_lsn must be growing monotonously at runtime; current {old_value}, offered {new_value}"); + new_value != old_value } /// Update metadata file @@ -4143,12 +3769,14 @@ impl Timeline { let frozen_layer = Arc::clone(frozen_layer); let ctx = ctx.attached_child(); let work = async move { - let Some(new_delta) = frozen_layer - .write_to_disk(&self_clone, &ctx, key_range) + let Some((desc, path)) = frozen_layer + .write_to_disk(&ctx, key_range, self_clone.l0_flush_global_state.inner()) .await? else { return Ok(None); }; + let new_delta = Layer::finish_creating(self_clone.conf, &self_clone, desc, &path)?; + // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes. // We just need to fsync the directory in which these inodes are linked, // which we know to be the timeline directory. 
@@ -4238,7 +3866,9 @@ impl Timeline { let threshold = self.get_image_creation_threshold(); let guard = self.layers.read().await; - let layers = guard.layer_map(); + let Ok(layers) = guard.layer_map() else { + return false; + }; let mut max_deltas = 0; for part_range in &partition.ranges { @@ -4325,6 +3955,10 @@ impl Timeline { .get_vectored(key_request_accum.consume_keyspace(), lsn, ctx) .await?; + if self.cancel.is_cancelled() { + return Err(CreateImageLayersError::Cancelled); + } + for (img_key, img) in results { let img = match img { Ok(img) => img, @@ -4348,7 +3982,7 @@ impl Timeline { warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}"); ZERO_PAGE.clone() } else { - return Err(CreateImageLayersError::PageReconstructError(err)); + return Err(CreateImageLayersError::from(err)); } } }; @@ -4396,8 +4030,6 @@ impl Timeline { mode: ImageLayerCreationMode, start: Key, ) -> Result { - assert!(!matches!(mode, ImageLayerCreationMode::Initial)); - // Metadata keys image layer creation. let mut reconstruct_state = ValuesReconstructState::default(); let data = self @@ -4408,7 +4040,7 @@ impl Timeline { let mut total_kb_retrieved = 0; let mut total_keys_retrieved = 0; for (k, v) in data { - let v = v.map_err(CreateImageLayersError::PageReconstructError)?; + let v = v?; total_kb_retrieved += KEY_SIZE + v.len(); total_keys_retrieved += 1; new_data.insert(k, v); @@ -4432,6 +4064,9 @@ impl Timeline { next_start_key: img_range.end, }); } + if self.cancel.is_cancelled() { + return Err(CreateImageLayersError::Cancelled); + } let mut wrote_any_image = false; for (k, v) in data { if v.is_empty() { @@ -4469,6 +4104,58 @@ impl Timeline { } } + /// Predicate function which indicates whether we should check if new image layers + /// are required. Since checking if new image layers are required is expensive in + /// terms of CPU, we only do it in the following cases: + /// 1. If the timeline has ingested sufficient WAL to justify the cost + /// 2. 
If enough time has passed since the last check: + /// 1. For large tenants, we wish to perform the check more often since they + /// suffer from the lack of image layers + /// 2. For small tenants (that can mostly fit in RAM), we use a much longer interval + fn should_check_if_image_layers_required(self: &Arc, lsn: Lsn) -> bool { + const LARGE_TENANT_THRESHOLD: u64 = 2 * 1024 * 1024 * 1024; + + let last_checks_at = self.last_image_layer_creation_check_at.load(); + let distance = lsn + .checked_sub(last_checks_at) + .expect("Attempt to compact with LSN going backwards"); + let min_distance = + self.get_image_layer_creation_check_threshold() as u64 * self.get_checkpoint_distance(); + + let distance_based_decision = distance.0 >= min_distance; + + let mut time_based_decision = false; + let mut last_check_instant = self.last_image_layer_creation_check_instant.lock().unwrap(); + if let CurrentLogicalSize::Exact(logical_size) = self.current_logical_size.current_size() { + let check_required_after = if Into::::into(&logical_size) >= LARGE_TENANT_THRESHOLD + { + self.get_checkpoint_timeout() + } else { + Duration::from_secs(3600 * 48) + }; + + time_based_decision = match *last_check_instant { + Some(last_check) => { + let elapsed = last_check.elapsed(); + elapsed >= check_required_after + } + None => true, + }; + } + + // Do the expensive delta layer counting only if this timeline has ingested sufficient + // WAL since the last check or a checkpoint timeout interval has elapsed since the last + // check. + let decision = distance_based_decision || time_based_decision; + + if decision { + self.last_image_layer_creation_check_at.store(lsn); + *last_check_instant = Some(Instant::now()); + } + + decision + } + #[tracing::instrument(skip_all, fields(%lsn, %mode))] async fn create_image_layers( self: &Arc, @@ -4491,24 +4178,13 @@ impl Timeline { // image layers <100000000..100000099> and <200000000..200000199> are not completely covering it. 
let mut start = Key::MIN; - let check_for_image_layers = { - let last_checks_at = self.last_image_layer_creation_check_at.load(); - let distance = lsn - .checked_sub(last_checks_at) - .expect("Attempt to compact with LSN going backwards"); - let min_distance = self.get_image_layer_creation_check_threshold() as u64 - * self.get_checkpoint_distance(); - - // Skip the expensive delta layer counting if this timeline has not ingested sufficient - // WAL since the last check. - distance.0 >= min_distance - }; - - if check_for_image_layers { - self.last_image_layer_creation_check_at.store(lsn); - } + let check_for_image_layers = self.should_check_if_image_layers_required(lsn); for partition in partitioning.parts.iter() { + if self.cancel.is_cancelled() { + return Err(CreateImageLayersError::Cancelled); + } + let img_range = start..partition.ranges.last().unwrap().end; let compact_metadata = partition.overlaps(&Key::metadata_key_range()); if compact_metadata { @@ -4519,15 +4195,13 @@ impl Timeline { "metadata keys must be partitioned separately" ); } - if mode == ImageLayerCreationMode::Initial { - return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers"))); - } if mode == ImageLayerCreationMode::Try && !check_for_image_layers { // Skip compaction if there are not enough updates. Metadata compaction will do a scan and // might mess up with evictions. start = img_range.end; continue; } + // For initial and force modes, we always generate image layers for metadata keys. } else if let ImageLayerCreationMode::Try = mode { // check_for_image_layers = false -> skip // check_for_image_layers = true -> check time_for_new_image_layer -> skip/generate @@ -4536,6 +4210,24 @@ impl Timeline { continue; } } + if let ImageLayerCreationMode::Force = mode { + // When forced to create image layers, we might try and create them where they already + // exist. This mode is only used in tests/debug. 
+ let layers = self.layers.read().await; + if layers.contains_key(&PersistentLayerKey { + key_range: img_range.clone(), + lsn_range: PersistentLayerDesc::image_layer_lsn_range(lsn), + is_delta: false, + }) { + tracing::info!( + "Skipping image layer at {lsn} {}..{}, already exists", + img_range.start, + img_range.end + ); + start = img_range.end; + continue; + } + } let image_layer_writer = ImageLayerWriter::new( self.conf, @@ -4590,35 +4282,20 @@ impl Timeline { } } - // The writer.finish() above already did the fsync of the inodes. - // We just need to fsync the directory in which these inodes are linked, - // which we know to be the timeline directory. - if !image_layers.is_empty() { - // We use fatal_err() below because the after writer.finish() returns with success, - // the in-memory state of the filesystem already has the layer file in its final place, - // and subsequent pageserver code could think it's durable while it really isn't. - let timeline_dir = VirtualFile::open( - &self - .conf - .timeline_path(&self.tenant_shard_id, &self.timeline_id), - ctx, - ) - .await - .fatal_err("VirtualFile::open for timeline dir fsync"); - timeline_dir - .sync_all() - .await - .fatal_err("VirtualFile::sync_all timeline dir"); - } - let mut guard = self.layers.write().await; // FIXME: we could add the images to be uploaded *before* returning from here, but right - // now they are being scheduled outside of write lock - guard.track_new_image_layers(&image_layers, &self.metrics); + // now they are being scheduled outside of write lock; current way is inconsistent with + // compaction lock order. + guard + .open_mut()? 
+ .track_new_image_layers(&image_layers, &self.metrics); drop_wlock(guard); timer.stop_and_record(); + // Creating image layers may have caused some previously visible layers to be covered + self.update_layer_visibility().await?; + Ok(image_layers) } @@ -4636,6 +4313,12 @@ impl Timeline { return; } + if self.current_logical_size.current_size().is_exact() { + // root timelines are initialized with exact count, but never start the background + // calculation + return; + } + if let Some(await_bg_cancel) = self .current_logical_size .cancel_wait_for_background_loop_concurrency_limit_semaphore @@ -4665,7 +4348,7 @@ impl Timeline { /// Requires a timeline that: /// - has an ancestor to detach from /// - the ancestor does not have an ancestor -- follows from the original RFC limitations, not - /// a technical requirement + /// a technical requirement /// /// After the operation has been started, it cannot be canceled. Upon restart it needs to be /// polled again until completion. @@ -4677,28 +4360,38 @@ impl Timeline { tenant: &crate::tenant::Tenant, options: detach_ancestor::Options, ctx: &RequestContext, - ) -> Result< - ( - completion::Completion, - detach_ancestor::PreparedTimelineDetach, - ), - detach_ancestor::Error, - > { + ) -> Result { detach_ancestor::prepare(self, tenant, options, ctx).await } - /// Completes the ancestor detach. This method is to be called while holding the - /// TenantManager's tenant slot, so during this method we cannot be deleted nor can any - /// timeline be deleted. After this method returns successfully, tenant must be reloaded. + /// Second step of detach from ancestor; detaches the `self` from it's current ancestor and + /// reparents any reparentable children of previous ancestor. /// - /// Pageserver receiving a SIGKILL during this operation is not supported (yet). 
- pub(crate) async fn complete_detaching_timeline_ancestor( + /// This method is to be called while holding the TenantManager's tenant slot, so during this + /// method we cannot be deleted nor can any timeline be deleted. After this method returns + /// successfully, tenant must be reloaded. + /// + /// Final step will be to [`Self::complete_detaching_timeline_ancestor`] after optionally + /// resetting the tenant. + pub(crate) async fn detach_from_ancestor_and_reparent( self: &Arc, tenant: &crate::tenant::Tenant, prepared: detach_ancestor::PreparedTimelineDetach, ctx: &RequestContext, - ) -> Result, anyhow::Error> { - detach_ancestor::complete(self, tenant, prepared, ctx).await + ) -> Result { + detach_ancestor::detach_and_reparent(self, tenant, prepared, ctx).await + } + + /// Final step which unblocks the GC. + /// + /// The tenant must've been reset if ancestry was modified previously (in tenant manager). + pub(crate) async fn complete_detaching_timeline_ancestor( + self: &Arc, + tenant: &crate::tenant::Tenant, + attempt: detach_ancestor::Attempt, + ctx: &RequestContext, + ) -> Result<(), detach_ancestor::Error> { + detach_ancestor::complete(self, tenant, attempt, ctx).await } /// Switch aux file policy and schedule upload to the index part. @@ -4710,6 +4403,18 @@ impl Timeline { } } +impl Drop for Timeline { + fn drop(&mut self) { + if let Some(ancestor) = &self.ancestor_timeline { + // This lock should never be poisoned, but in case it is we do a .map() instead of + // an unwrap(), to avoid panicking in a destructor and thereby aborting the process. + if let Ok(mut gc_info) = ancestor.gc_info.write() { + gc_info.remove_child(self.timeline_id) + } + } + } +} + /// Top-level failure to compact. #[derive(Debug, thiserror::Error)] pub(crate) enum CompactionError { @@ -4717,7 +4422,7 @@ pub(crate) enum CompactionError { ShuttingDown, /// Compaction cannot be done right now; page reconstruction and so on. 
#[error(transparent)] - Other(#[from] anyhow::Error), + Other(anyhow::Error), } impl From for CompactionError { @@ -4732,6 +4437,46 @@ impl From for CompactionError { } } +impl From for CompactionError { + fn from(value: super::upload_queue::NotInitialized) -> Self { + match value { + super::upload_queue::NotInitialized::Uninitialized => { + CompactionError::Other(anyhow::anyhow!(value)) + } + super::upload_queue::NotInitialized::ShuttingDown + | super::upload_queue::NotInitialized::Stopped => CompactionError::ShuttingDown, + } + } +} + +impl From for CompactionError { + fn from(e: super::storage_layer::layer::DownloadError) -> Self { + match e { + super::storage_layer::layer::DownloadError::TimelineShutdown + | super::storage_layer::layer::DownloadError::DownloadCancelled => { + CompactionError::ShuttingDown + } + super::storage_layer::layer::DownloadError::ContextAndConfigReallyDeniesDownloads + | super::storage_layer::layer::DownloadError::DownloadRequired + | super::storage_layer::layer::DownloadError::NotFile(_) + | super::storage_layer::layer::DownloadError::DownloadFailed + | super::storage_layer::layer::DownloadError::PreStatFailed(_) => { + CompactionError::Other(anyhow::anyhow!(e)) + } + #[cfg(test)] + super::storage_layer::layer::DownloadError::Failpoint(_) => { + CompactionError::Other(anyhow::anyhow!(e)) + } + } + } +} + +impl From for CompactionError { + fn from(_: layer_manager::Shutdown) -> Self { + CompactionError::ShuttingDown + } +} + #[serde_as] #[derive(serde::Serialize)] struct RecordedDuration(#[serde_as(as = "serde_with::DurationMicroSeconds")] Duration); @@ -4763,14 +4508,63 @@ impl DurationRecorder { } } +/// Descriptor for a delta layer used in testing infra. The start/end key/lsn range of the +/// delta layer might be different from the min/max key/lsn in the delta layer. Therefore, +/// the layer descriptor requires the user to provide the ranges, which should cover all +/// keys specified in the `data` field. 
+#[cfg(test)] +#[derive(Clone)] +pub struct DeltaLayerTestDesc { + pub lsn_range: Range, + pub key_range: Range, + pub data: Vec<(Key, Lsn, Value)>, +} + +#[cfg(test)] +impl DeltaLayerTestDesc { + #[allow(dead_code)] + pub fn new(lsn_range: Range, key_range: Range, data: Vec<(Key, Lsn, Value)>) -> Self { + Self { + lsn_range, + key_range, + data, + } + } + + pub fn new_with_inferred_key_range( + lsn_range: Range, + data: Vec<(Key, Lsn, Value)>, + ) -> Self { + let key_min = data.iter().map(|(key, _, _)| key).min().unwrap(); + let key_max = data.iter().map(|(key, _, _)| key).max().unwrap(); + Self { + key_range: (*key_min)..(key_max.next()), + lsn_range, + data, + } + } + + pub(crate) fn layer_name(&self) -> LayerName { + LayerName::Delta(super::storage_layer::DeltaLayerName { + key_range: self.key_range.clone(), + lsn_range: self.lsn_range.clone(), + }) + } +} + impl Timeline { async fn finish_compact_batch( self: &Arc, new_deltas: &[ResidentLayer], new_images: &[ResidentLayer], layers_to_remove: &[Layer], - ) -> anyhow::Result<()> { - let mut guard = self.layers.write().await; + ) -> Result<(), CompactionError> { + let mut guard = tokio::select! { + guard = self.layers.write() => guard, + _ = self.cancel.cancelled() => { + return Err(CompactionError::ShuttingDown); + } + }; let mut duplicated_layers = HashSet::new(); @@ -4786,8 +4580,8 @@ impl Timeline { // for compact_level0_phase1 creating an L0, which does not happen in practice // because we have not implemented L0 => L0 compaction. 
duplicated_layers.insert(l.layer_desc().key()); - } else if LayerMap::is_l0(l.layer_desc()) { - bail!("compaction generates a L0 layer file as output, which will cause infinite compaction."); + } else if LayerMap::is_l0(&l.layer_desc().key_range) { + return Err(CompactionError::Other(anyhow::anyhow!("compaction generates a L0 layer file as output, which will cause infinite compaction."))); } else { insert_layers.push(l.clone()); } @@ -4801,11 +4595,14 @@ impl Timeline { .collect(); if !new_images.is_empty() { - guard.track_new_image_layers(new_images, &self.metrics); + guard + .open_mut()? + .track_new_image_layers(new_images, &self.metrics); } - // deletion will happen later, the layer file manager calls garbage_collect_on_drop - guard.finish_compact_l0(&remove_layers, &insert_layers, &self.metrics); + guard + .open_mut()? + .finish_compact_l0(&remove_layers, &insert_layers, &self.metrics); self.remote_client .schedule_compaction_update(&remove_layers, new_deltas)?; @@ -4819,7 +4616,7 @@ impl Timeline { self: &Arc, mut replace_layers: Vec<(Layer, ResidentLayer)>, mut drop_layers: Vec, - ) -> anyhow::Result<()> { + ) -> Result<(), CompactionError> { let mut guard = self.layers.write().await; // Trim our lists in case our caller (compaction) raced with someone else (GC) removing layers: we want @@ -4827,7 +4624,9 @@ impl Timeline { replace_layers.retain(|(l, _)| guard.contains(l)); drop_layers.retain(|l| guard.contains(l)); - guard.rewrite_layers(&replace_layers, &drop_layers, &self.metrics); + guard + .open_mut()? 
+ .rewrite_layers(&replace_layers, &drop_layers, &self.metrics); let upload_layers: Vec<_> = replace_layers.into_iter().map(|r| r.1).collect(); @@ -4841,7 +4640,7 @@ impl Timeline { fn upload_new_image_layers( self: &Arc, new_images: impl IntoIterator, - ) -> anyhow::Result<()> { + ) -> Result<(), super::upload_queue::NotInitialized> { for layer in new_images { self.remote_client.schedule_layer_file_upload(layer)?; } @@ -4854,24 +4653,21 @@ impl Timeline { } /// Find the Lsns above which layer files need to be retained on - /// garbage collection. This is separate from actually performing the GC, - /// and is updated more frequently, so that compaction can remove obsolete - /// page versions more aggressively. + /// garbage collection. /// - /// TODO: that's wishful thinking, compaction doesn't actually do that - /// currently. + /// We calculate two cutoffs, one based on time and one based on WAL size. `pitr` + /// controls the time cutoff (or ZERO to disable time-based retention), and `space_cutoff` controls + /// the space-based retention. /// - /// The 'cutoff_horizon' point is used to retain recent versions that might still be - /// needed by read-only nodes. (As of this writing, the caller just passes - /// the latest LSN subtracted by a constant, and doesn't do anything smart - /// to figure out what read-only nodes might actually need.) - /// - /// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine - /// whether a record is needed for PITR. + /// This function doesn't simply to calculate time & space based retention: it treats time-based + /// retention as authoritative if enabled, and falls back to space-based retention if calculating + /// the LSN for a time point isn't possible. Therefore the GcCutoffs::horizon in the response might + /// be different to the `space_cutoff` input. Callers should treat the min() of the two cutoffs + /// in the response as the GC cutoff point for the timeline. 
#[instrument(skip_all, fields(timeline_id=%self.timeline_id))] pub(super) async fn find_gc_cutoffs( &self, - cutoff_horizon: Lsn, + space_cutoff: Lsn, pitr: Duration, cancel: &CancellationToken, ctx: &RequestContext, @@ -4884,58 +4680,87 @@ impl Timeline { pausable_failpoint!("Timeline::find_gc_cutoffs-pausable"); - // First, calculate pitr_cutoff_timestamp and then convert it to LSN. - // - // Some unit tests depend on garbage-collection working even when - // CLOG data is missing, so that find_lsn_for_timestamp() doesn't - // work, so avoid calling it altogether if time-based retention is not - // configured. It would be pointless anyway. - let pitr_cutoff = if pitr != Duration::ZERO { - let now = SystemTime::now(); - if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) { - let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp); - - match self - .find_lsn_for_timestamp(pitr_timestamp, cancel, ctx) - .await? - { - LsnForTimestamp::Present(lsn) => lsn, - LsnForTimestamp::Future(lsn) => { - // The timestamp is in the future. That sounds impossible, - // but what it really means is that there hasn't been - // any commits since the cutoff timestamp. - // - // In this case we should use the LSN of the most recent commit, - // which is implicitly the last LSN in the log. - debug!("future({})", lsn); - self.get_last_record_lsn() - } - LsnForTimestamp::Past(lsn) => { - debug!("past({})", lsn); - // conservative, safe default is to remove nothing, when we - // have no commit timestamp data available - *self.get_latest_gc_cutoff_lsn() - } - LsnForTimestamp::NoData(lsn) => { - debug!("nodata({})", lsn); - // conservative, safe default is to remove nothing, when we - // have no commit timestamp data available - *self.get_latest_gc_cutoff_lsn() - } - } - } else { - // If we don't have enough data to convert to LSN, - // play safe and don't remove any layers. 
- *self.get_latest_gc_cutoff_lsn() + if cfg!(test) { + // Unit tests which specify zero PITR interval expect to avoid doing any I/O for timestamp lookup + if pitr == Duration::ZERO { + return Ok(GcCutoffs { + time: self.get_last_record_lsn(), + space: space_cutoff, + }); + } + } + + // Calculate a time-based limit on how much to retain: + // - if PITR interval is set, then this is our cutoff. + // - if PITR interval is not set, then we do a lookup + // based on DEFAULT_PITR_INTERVAL, so that size-based retention does not result in keeping history around permanently on idle databases. + let time_cutoff = { + let now = SystemTime::now(); + let time_range = if pitr == Duration::ZERO { + humantime::parse_duration(DEFAULT_PITR_INTERVAL).expect("constant is invalid") + } else { + pitr + }; + + // If PITR is so large or `now` is so small that this underflows, we will retain no history (highly unexpected case) + let time_cutoff = now.checked_sub(time_range).unwrap_or(now); + let timestamp = to_pg_timestamp(time_cutoff); + + match self.find_lsn_for_timestamp(timestamp, cancel, ctx).await? { + LsnForTimestamp::Present(lsn) => Some(lsn), + LsnForTimestamp::Future(lsn) => { + // The timestamp is in the future. That sounds impossible, + // but what it really means is that there hasn't been + // any commits since the cutoff timestamp. + // + // In this case we should use the LSN of the most recent commit, + // which is implicitly the last LSN in the log. + debug!("future({})", lsn); + Some(self.get_last_record_lsn()) + } + LsnForTimestamp::Past(lsn) => { + debug!("past({})", lsn); + None + } + LsnForTimestamp::NoData(lsn) => { + debug!("nodata({})", lsn); + None + } } - } else { - // No time-based retention was configured. Interpret this as "keep no history". - self.get_last_record_lsn() }; - Ok(GcCutoffs { - horizon: cutoff_horizon, - pitr: pitr_cutoff, + Ok(match (pitr, time_cutoff) { + (Duration::ZERO, Some(time_cutoff)) => { + // PITR is not set. 
Retain the size-based limit, or the default time retention, + // whichever requires less data. + GcCutoffs { + time: self.get_last_record_lsn(), + space: std::cmp::max(time_cutoff, space_cutoff), + } + } + (Duration::ZERO, None) => { + // PITR is not set, and time lookup failed + GcCutoffs { + time: self.get_last_record_lsn(), + space: space_cutoff, + } + } + (_, None) => { + // PITR interval is set & we didn't look up a timestamp successfully. Conservatively assume PITR + // cannot advance beyond what was already GC'd, and respect space-based retention + GcCutoffs { + time: *self.get_latest_gc_cutoff_lsn(), + space: space_cutoff, + } + } + (_, Some(time_cutoff)) => { + // PITR interval is set and we looked up timestamp successfully. Ignore + // size based retention and make time cutoff authoritative + GcCutoffs { + time: time_cutoff, + space: time_cutoff, + } + } }) } @@ -4960,12 +4785,16 @@ impl Timeline { return Err(GcError::TimelineCancelled); } - let (horizon_cutoff, pitr_cutoff, retain_lsns, max_lsn_with_valid_lease) = { + let (space_cutoff, time_cutoff, retain_lsns, max_lsn_with_valid_lease) = { let gc_info = self.gc_info.read().unwrap(); - let horizon_cutoff = min(gc_info.cutoffs.horizon, self.get_disk_consistent_lsn()); - let pitr_cutoff = gc_info.cutoffs.pitr; - let retain_lsns = gc_info.retain_lsns.clone(); + let space_cutoff = min(gc_info.cutoffs.space, self.get_disk_consistent_lsn()); + let time_cutoff = gc_info.cutoffs.time; + let retain_lsns = gc_info + .retain_lsns + .iter() + .map(|(lsn, _child_id)| *lsn) + .collect(); // Gets the maximum LSN that holds the valid lease. 
// @@ -4974,14 +4803,14 @@ impl Timeline { let max_lsn_with_valid_lease = gc_info.leases.last_key_value().map(|(lsn, _)| *lsn); ( - horizon_cutoff, - pitr_cutoff, + space_cutoff, + time_cutoff, retain_lsns, max_lsn_with_valid_lease, ) }; - let mut new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff); + let mut new_gc_cutoff = Lsn::min(space_cutoff, time_cutoff); let standby_horizon = self.standby_horizon.load(); // Hold GC for the standby, but as a safety guard do it only within some // reasonable lag. @@ -5010,8 +4839,8 @@ impl Timeline { let res = self .gc_timeline( - horizon_cutoff, - pitr_cutoff, + space_cutoff, + time_cutoff, retain_lsns, max_lsn_with_valid_lease, new_gc_cutoff, @@ -5029,8 +4858,8 @@ impl Timeline { async fn gc_timeline( &self, - horizon_cutoff: Lsn, - pitr_cutoff: Lsn, + space_cutoff: Lsn, + time_cutoff: Lsn, retain_lsns: Vec, max_lsn_with_valid_lease: Option, new_gc_cutoff: Lsn, @@ -5086,27 +4915,27 @@ impl Timeline { // // TODO holding a write lock is too agressive and avoidable let mut guard = self.layers.write().await; - let layers = guard.layer_map(); + let layers = guard.layer_map()?; 'outer: for l in layers.iter_historic_layers() { result.layers_total += 1; // 1. Is it newer than GC horizon cutoff point? - if l.get_lsn_range().end > horizon_cutoff { + if l.get_lsn_range().end > space_cutoff { debug!( - "keeping {} because it's newer than horizon_cutoff {}", + "keeping {} because it's newer than space_cutoff {}", l.layer_name(), - horizon_cutoff, + space_cutoff, ); result.layers_needed_by_cutoff += 1; continue 'outer; } // 2. It is newer than PiTR cutoff point? 
- if l.get_lsn_range().end > pitr_cutoff { + if l.get_lsn_range().end > time_cutoff { debug!( - "keeping {} because it's newer than pitr_cutoff {}", + "keeping {} because it's newer than time_cutoff {}", l.layer_name(), - pitr_cutoff, + time_cutoff, ); result.layers_needed_by_pitr += 1; continue 'outer; @@ -5204,17 +5033,9 @@ impl Timeline { result.layers_removed = gc_layers.len() as u64; - self.remote_client - .schedule_gc_update(&gc_layers) - .map_err(|e| { - if self.cancel.is_cancelled() { - GcError::TimelineCancelled - } else { - GcError::Remote(e) - } - })?; + self.remote_client.schedule_gc_update(&gc_layers)?; - guard.finish_gc_timeline(&gc_layers); + guard.open_mut()?.finish_gc_timeline(&gc_layers); #[cfg(feature = "testing")] { @@ -5279,39 +5100,22 @@ impl Timeline { } else { trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn); }; - - let last_rec_lsn = data.records.last().unwrap().0; - - let img = match self + let res = self .walredo_mgr .as_ref() .context("timeline has no walredo manager") .map_err(PageReconstructError::WalRedo)? 
.request_redo(key, request_lsn, data.img, data.records, self.pg_version) - .await - .context("reconstruct a page image") - { + .await; + let img = match res { Ok(img) => img, - Err(e) => return Err(PageReconstructError::WalRedo(e)), - }; - - if img.len() == page_cache::PAGE_SZ { - let cache = page_cache::get(); - if let Err(e) = cache - .memorize_materialized_page( - self.tenant_shard_id, - self.timeline_id, - key, - last_rec_lsn, - &img, - ) - .await - .context("Materialized page memoization failed") - { - return Err(PageReconstructError::from(e)); + Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled), + Err(walredo::Error::Other(e)) => { + return Err(PageReconstructError::WalRedo( + e.context("reconstruct a page image"), + )) } - } - + }; Ok(img) } } @@ -5343,10 +5147,9 @@ impl Timeline { let task_id = task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::DownloadAllRemoteLayers, - Some(self.tenant_shard_id), + self.tenant_shard_id, Some(self.timeline_id), "download all remote layers task", - false, async move { self_clone.download_all_remote_layers(request).await; let mut status_guard = self_clone.download_all_remote_layers_task_info.write().unwrap(); @@ -5388,9 +5191,13 @@ impl Timeline { let remaining = { let guard = self.layers.read().await; - guard - .layer_map() - .iter_historic_layers() + let Ok(lm) = guard.layer_map() else { + // technically here we could look into iterating accessible layers, but downloading + // all layers of a shutdown timeline makes no sense regardless. 
+ tracing::info!("attempted to download all layers of shutdown timeline"); + return; + }; + lm.iter_historic_layers() .map(|desc| guard.get_from_desc(&desc)) .collect::>() }; @@ -5497,12 +5304,13 @@ impl Timeline { let file_size = layer.layer_desc().file_size; max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size))); - let last_activity_ts = layer.access_stats().latest_activity_or_now(); + let last_activity_ts = layer.latest_activity(); EvictionCandidate { - layer: layer.into(), + layer: layer.to_owned().into(), last_activity_ts, relative_last_activity: finite_f32::FiniteF32::ZERO, + visibility: layer.visibility(), } }) .collect(); @@ -5520,6 +5328,22 @@ impl Timeline { } } + /// Persistently blocks gc for `Manual` reason. + /// + /// Returns true if no such block existed before, false otherwise. + pub(crate) async fn block_gc(&self, tenant: &super::Tenant) -> anyhow::Result { + use crate::tenant::remote_timeline_client::index::GcBlockingReason; + assert_eq!(self.tenant_shard_id, tenant.tenant_shard_id); + tenant.gc_block.insert(self, GcBlockingReason::Manual).await + } + + /// Persistently unblocks gc for `Manual` reason. 
+ pub(crate) async fn unblock_gc(&self, tenant: &super::Tenant) -> anyhow::Result<()> { + use crate::tenant::remote_timeline_client::index::GcBlockingReason; + assert_eq!(self.tenant_shard_id, tenant.tenant_shard_id); + tenant.gc_block.remove(self, GcBlockingReason::Manual).await + } + #[cfg(test)] pub(super) fn force_advance_lsn(self: &Arc, new_lsn: Lsn) { self.last_record_lsn.advance(new_lsn); @@ -5552,12 +5376,12 @@ impl Timeline { } images.sort_unstable_by(|(ka, _), (kb, _)| ka.cmp(kb)); let min_key = *images.first().map(|(k, _)| k).unwrap(); - let max_key = images.last().map(|(k, _)| k).unwrap().next(); + let end_key = images.last().map(|(k, _)| k).unwrap().next(); let mut image_layer_writer = ImageLayerWriter::new( self.conf, self.timeline_id, self.tenant_shard_id, - &(min_key..max_key), + &(min_key..end_key), lsn, ctx, ) @@ -5569,7 +5393,7 @@ impl Timeline { { let mut guard = self.layers.write().await; - guard.force_insert_layer(image_layer); + guard.open_mut().unwrap().force_insert_layer(image_layer); } Ok(()) @@ -5582,41 +5406,68 @@ impl Timeline { #[cfg(test)] pub(super) async fn force_create_delta_layer( self: &Arc, - mut deltas: Vec<(Key, Lsn, Value)>, + mut deltas: DeltaLayerTestDesc, check_start_lsn: Option, ctx: &RequestContext, ) -> anyhow::Result<()> { let last_record_lsn = self.get_last_record_lsn(); - deltas.sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb))); - let min_key = *deltas.first().map(|(k, _, _)| k).unwrap(); - let max_key = deltas.last().map(|(k, _, _)| k).unwrap().next(); - let min_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap(); - let max_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap(); + deltas + .data + .sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb))); + assert!(deltas.data.first().unwrap().0 >= deltas.key_range.start); + assert!(deltas.data.last().unwrap().0 < deltas.key_range.end); + for (_, lsn, _) in &deltas.data { + assert!(deltas.lsn_range.start <= *lsn && *lsn < 
deltas.lsn_range.end); + } assert!( - max_lsn <= last_record_lsn, - "advance last record lsn before inserting a layer, max_lsn={max_lsn}, last_record_lsn={last_record_lsn}" + deltas.lsn_range.end <= last_record_lsn, + "advance last record lsn before inserting a layer, end_lsn={}, last_record_lsn={}", + deltas.lsn_range.end, + last_record_lsn ); - let end_lsn = Lsn(max_lsn.0 + 1); if let Some(check_start_lsn) = check_start_lsn { - assert!(min_lsn >= check_start_lsn); + assert!(deltas.lsn_range.start >= check_start_lsn); + } + // check if the delta layer does not violate the LSN invariant, the legacy compaction should always produce a batch of + // layers of the same start/end LSN, and so should the force inserted layer + { + /// Checks if a overlaps with b, assume a/b = [start, end). + pub fn overlaps_with(a: &Range, b: &Range) -> bool { + !(a.end <= b.start || b.end <= a.start) + } + + let guard = self.layers.read().await; + for layer in guard.layer_map()?.iter_historic_layers() { + if layer.is_delta() + && overlaps_with(&layer.lsn_range, &deltas.lsn_range) + && layer.lsn_range != deltas.lsn_range + { + // If a delta layer overlaps with another delta layer AND their LSN range is not the same, panic + panic!( + "inserted layer violates delta layer LSN invariant: current_lsn_range={}..{}, conflict_lsn_range={}..{}", + deltas.lsn_range.start, deltas.lsn_range.end, layer.lsn_range.start, layer.lsn_range.end + ); + } + } } let mut delta_layer_writer = DeltaLayerWriter::new( self.conf, self.timeline_id, self.tenant_shard_id, - min_key, - min_lsn..end_lsn, + deltas.key_range.start, + deltas.lsn_range, ctx, ) .await?; - for (key, lsn, val) in deltas { + for (key, lsn, val) in deltas.data { delta_layer_writer.put_value(key, lsn, val, ctx).await?; } - let delta_layer = delta_layer_writer.finish(max_key, self, ctx).await?; + let (desc, path) = delta_layer_writer.finish(deltas.key_range.end, ctx).await?; + let delta_layer = Layer::finish_creating(self.conf, self, desc, 
&path)?; { let mut guard = self.layers.write().await; - guard.force_insert_layer(delta_layer); + guard.open_mut().unwrap().force_insert_layer(delta_layer); } Ok(()) @@ -5631,7 +5482,7 @@ impl Timeline { ) -> anyhow::Result> { let mut all_data = Vec::new(); let guard = self.layers.read().await; - for layer in guard.layer_map().iter_historic_layers() { + for layer in guard.layer_map()?.iter_historic_layers() { if !layer.is_delta() && layer.image_layer_lsn() == lsn { let layer = guard.get_from_desc(&layer); let mut reconstruct_data = ValuesReconstructState::default(); @@ -5659,7 +5510,7 @@ impl Timeline { ) -> anyhow::Result> { let mut layers = Vec::new(); let guard = self.layers.read().await; - for layer in guard.layer_map().iter_historic_layers() { + for layer in guard.layer_map()?.iter_historic_layers() { layers.push(layer.key()); } Ok(layers) @@ -5673,12 +5524,10 @@ impl Timeline { } } -type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId); - /// Tracking writes ingestion does to a particular in-memory layer. /// /// Cleared upon freezing a layer. -struct TimelineWriterState { +pub(crate) struct TimelineWriterState { open_layer: Arc, current_size: u64, // Previous Lsn which passed through @@ -5726,44 +5575,6 @@ enum OpenLayerAction { } impl<'a> TimelineWriter<'a> { - /// Put a new page version that can be constructed from a WAL record - /// - /// This will implicitly extend the relation, if the page is beyond the - /// current end-of-file. - pub(crate) async fn put( - &mut self, - key: Key, - lsn: Lsn, - value: &Value, - ctx: &RequestContext, - ) -> anyhow::Result<()> { - // Avoid doing allocations for "small" values. 
- // In the regression test suite, the limit of 256 avoided allocations in 95% of cases: - // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061 - let mut buf = smallvec::SmallVec::<[u8; 256]>::new(); - value.ser_into(&mut buf)?; - let buf_size: u64 = buf.len().try_into().expect("oversized value buf"); - - let action = self.get_open_layer_action(lsn, buf_size); - let layer = self.handle_open_layer_action(lsn, action, ctx).await?; - let res = layer.put_value(key, lsn, &buf, ctx).await; - - if res.is_ok() { - // Update the current size only when the entire write was ok. - // In case of failures, we may have had partial writes which - // render the size tracking out of sync. That's ok because - // the checkpoint distance should be significantly smaller - // than the S3 single shot upload limit of 5GiB. - let state = self.write_guard.as_mut().unwrap(); - - state.current_size += buf_size; - state.prev_lsn = Some(lsn); - state.max_lsn = std::cmp::max(state.max_lsn, Some(lsn)); - } - - res - } - async fn handle_open_layer_action( &mut self, at: Lsn, @@ -5786,7 +5597,10 @@ impl<'a> TimelineWriter<'a> { } async fn open_layer(&mut self, at: Lsn, ctx: &RequestContext) -> anyhow::Result<()> { - let layer = self.tl.get_layer_for_write(at, ctx).await?; + let layer = self + .tl + .get_layer_for_write(at, &self.write_guard, ctx) + .await?; let initial_size = layer.size().await?; let last_freeze_at = self.last_freeze_at.load(); @@ -5799,15 +5613,15 @@ impl<'a> TimelineWriter<'a> { Ok(()) } - async fn roll_layer(&mut self, freeze_at: Lsn) -> anyhow::Result<()> { + async fn roll_layer(&mut self, freeze_at: Lsn) -> Result<(), FlushLayerError> { let current_size = self.write_guard.as_ref().unwrap().current_size; // self.write_guard will be taken by the freezing self.tl .freeze_inmem_layer_at(freeze_at, &mut self.write_guard) - .await; + .await?; - self.tl.flush_frozen_layers(freeze_at)?; + assert!(self.write_guard.is_none()); if current_size >= 
self.get_checkpoint_distance() * 2 { warn!("Flushed oversized open layer with size {}", current_size) @@ -5866,18 +5680,58 @@ impl<'a> TimelineWriter<'a> { } /// Put a batch of keys at the specified Lsns. - /// - /// The batch is sorted by Lsn (enforced by usage of [`utils::vec_map::VecMap`]. pub(crate) async fn put_batch( &mut self, - batch: VecMap, + batch: Vec<(CompactKey, Lsn, usize, Value)>, ctx: &RequestContext, ) -> anyhow::Result<()> { - for (lsn, (key, val)) in batch { - self.put(key, lsn, &val, ctx).await? + if batch.is_empty() { + return Ok(()); } - Ok(()) + let serialized_batch = inmemory_layer::SerializedBatch::from_values(batch); + let batch_max_lsn = serialized_batch.max_lsn; + let buf_size: u64 = serialized_batch.raw.len() as u64; + + let action = self.get_open_layer_action(batch_max_lsn, buf_size); + let layer = self + .handle_open_layer_action(batch_max_lsn, action, ctx) + .await?; + + let res = layer.put_batch(serialized_batch, ctx).await; + + if res.is_ok() { + // Update the current size only when the entire write was ok. + // In case of failures, we may have had partial writes which + // render the size tracking out of sync. That's ok because + // the checkpoint distance should be significantly smaller + // than the S3 single shot upload limit of 5GiB. 
+ let state = self.write_guard.as_mut().unwrap(); + + state.current_size += buf_size; + state.prev_lsn = Some(batch_max_lsn); + state.max_lsn = std::cmp::max(state.max_lsn, Some(batch_max_lsn)); + } + + res + } + + #[cfg(test)] + /// Test helper, for tests that would like to poke individual values without composing a batch + pub(crate) async fn put( + &mut self, + key: Key, + lsn: Lsn, + value: &Value, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + use utils::bin_ser::BeSer; + let val_ser_size = value.serialized_size().unwrap() as usize; + self.put_batch( + vec![(key.to_compact(), lsn, val_ser_size, value.clone())], + ctx, + ) + .await } pub(crate) async fn delete_batch( @@ -5921,16 +5775,115 @@ fn is_send() { #[cfg(test)] mod tests { + use pageserver_api::key::Key; use utils::{id::TimelineId, lsn::Lsn}; - use crate::tenant::{ - harness::TenantHarness, storage_layer::Layer, timeline::EvictionError, Timeline, + use crate::{ + repository::Value, + tenant::{ + harness::{test_img, TenantHarness}, + layer_map::LayerMap, + storage_layer::{Layer, LayerName}, + timeline::{DeltaLayerTestDesc, EvictionError}, + Timeline, + }, }; + #[tokio::test] + async fn test_heatmap_generation() { + let harness = TenantHarness::create("heatmap_generation").await.unwrap(); + + let covered_delta = DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x20), + vec![( + Key::from_hex("620000000033333333444444445500000000").unwrap(), + Lsn(0x11), + Value::Image(test_img("foo")), + )], + ); + let visible_delta = DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x20), + vec![( + Key::from_hex("720000000033333333444444445500000000").unwrap(), + Lsn(0x11), + Value::Image(test_img("foo")), + )], + ); + let l0_delta = DeltaLayerTestDesc::new( + Lsn(0x20)..Lsn(0x30), + Key::from_hex("000000000000000000000000000000000000").unwrap() + ..Key::from_hex("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF").unwrap(), + vec![( + 
Key::from_hex("720000000033333333444444445500000000").unwrap(), + Lsn(0x25), + Value::Image(test_img("foo")), + )], + ); + let delta_layers = vec![ + covered_delta.clone(), + visible_delta.clone(), + l0_delta.clone(), + ]; + + let image_layer = ( + Lsn(0x40), + vec![( + Key::from_hex("620000000033333333444444445500000000").unwrap(), + test_img("bar"), + )], + ); + let image_layers = vec![image_layer]; + + let (tenant, ctx) = harness.load().await; + let timeline = tenant + .create_test_timeline_with_layers( + TimelineId::generate(), + Lsn(0x10), + 14, + &ctx, + delta_layers, + image_layers, + Lsn(0x100), + ) + .await + .unwrap(); + + // Layer visibility is an input to heatmap generation, so refresh it first + timeline.update_layer_visibility().await.unwrap(); + + let heatmap = timeline + .generate_heatmap() + .await + .expect("Infallible while timeline is not shut down"); + + assert_eq!(heatmap.timeline_id, timeline.timeline_id); + + // L0 should come last + assert_eq!(heatmap.layers.last().unwrap().name, l0_delta.layer_name()); + + let mut last_lsn = Lsn::MAX; + for layer in heatmap.layers { + // Covered layer should be omitted + assert!(layer.name != covered_delta.layer_name()); + + let layer_lsn = match &layer.name { + LayerName::Delta(d) => d.lsn_range.end, + LayerName::Image(i) => i.lsn, + }; + + // Apart from L0s, newest Layers should come first + if !LayerMap::is_l0(layer.name.key_range()) { + assert!(layer_lsn <= last_lsn); + last_lsn = layer_lsn; + } + } + } + #[tokio::test] async fn two_layer_eviction_attempts_at_the_same_time() { - let harness = - TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap(); + let harness = TenantHarness::create("two_layer_eviction_attempts_at_the_same_time") + .await + .unwrap(); let (tenant, ctx) = harness.load().await; let timeline = tenant @@ -5971,6 +5924,7 @@ mod tests { let layers = timeline.layers.read().await; let desc = layers .layer_map() + .unwrap() .iter_historic_layers() .next() 
.expect("must find one layer to evict"); diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 8a95029f33..7370ec1386 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -4,7 +4,7 @@ //! //! The old legacy algorithm is implemented directly in `timeline.rs`. -use std::collections::BinaryHeap; +use std::collections::{BinaryHeap, HashSet}; use std::ops::{Deref, Range}; use std::sync::Arc; @@ -15,26 +15,35 @@ use super::{ }; use anyhow::{anyhow, Context}; +use bytes::Bytes; use enumset::EnumSet; use fail::fail_point; use itertools::Itertools; +use pageserver_api::key::KEY_SIZE; use pageserver_api::keyspace::ShardedRange; use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; +use serde::Serialize; use tokio_util::sync::CancellationToken; use tracing::{debug, info, info_span, trace, warn, Instrument}; use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; -use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc}; -use crate::tenant::timeline::{drop_rlock, Hole, ImageLayerCreationOutcome}; -use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter}; +use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD}; +use crate::tenant::remote_timeline_client::WaitCompletionError; +use crate::tenant::storage_layer::merge_iterator::MergeIterator; +use crate::tenant::storage_layer::{ + AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState, +}; +use crate::tenant::timeline::ImageLayerCreationOutcome; +use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter}; use crate::tenant::timeline::{Layer, ResidentLayer}; use crate::tenant::DeltaLayer; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; use crate::keyspace::KeySpace; -use crate::repository::Key; +use 
crate::repository::{Key, Value}; +use crate::walrecord::NeonWalRecord; use utils::lsn::Lsn; @@ -43,14 +52,176 @@ use pageserver_compaction::interface::*; use super::CompactionError; +/// Maximum number of deltas before generating an image layer in bottom-most compaction. +const COMPACTION_DELTA_THRESHOLD: usize = 5; + +/// The result of bottom-most compaction for a single key at each LSN. +#[derive(Debug)] +#[cfg_attr(test, derive(PartialEq))] +pub struct KeyLogAtLsn(pub Vec<(Lsn, Value)>); + +/// The result of bottom-most compaction. +#[derive(Debug)] +#[cfg_attr(test, derive(PartialEq))] +pub(crate) struct KeyHistoryRetention { + /// Stores logs to reconstruct the value at the given LSN, that is to say, logs <= LSN or image == LSN. + pub(crate) below_horizon: Vec<(Lsn, KeyLogAtLsn)>, + /// Stores logs to reconstruct the value at any LSN above the horizon, that is to say, log > LSN. + pub(crate) above_horizon: KeyLogAtLsn, +} + +impl KeyHistoryRetention { + async fn pipe_to( + self, + key: Key, + delta_writer: &mut Vec<(Key, Lsn, Value)>, + mut image_writer: Option<&mut ImageLayerWriter>, + stat: &mut CompactionStatistics, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + let mut first_batch = true; + for (cutoff_lsn, KeyLogAtLsn(logs)) in self.below_horizon { + if first_batch { + if logs.len() == 1 && logs[0].1.is_image() { + let Value::Image(img) = &logs[0].1 else { + unreachable!() + }; + stat.produce_image_key(img); + if let Some(image_writer) = image_writer.as_mut() { + image_writer.put_image(key, img.clone(), ctx).await?; + } else { + delta_writer.push((key, cutoff_lsn, Value::Image(img.clone()))); + } + } else { + for (lsn, val) in logs { + stat.produce_key(&val); + delta_writer.push((key, lsn, val)); + } + } + first_batch = false; + } else { + for (lsn, val) in logs { + stat.produce_key(&val); + delta_writer.push((key, lsn, val)); + } + } + } + let KeyLogAtLsn(above_horizon_logs) = self.above_horizon; + for (lsn, val) in above_horizon_logs { + 
stat.produce_key(&val); + delta_writer.push((key, lsn, val)); + } + Ok(()) + } +} + +#[derive(Debug, Serialize, Default)] +struct CompactionStatisticsNumSize { + num: u64, + size: u64, +} + +#[derive(Debug, Serialize, Default)] +pub struct CompactionStatistics { + delta_layer_visited: CompactionStatisticsNumSize, + image_layer_visited: CompactionStatisticsNumSize, + delta_layer_produced: CompactionStatisticsNumSize, + image_layer_produced: CompactionStatisticsNumSize, + num_delta_layer_discarded: usize, + num_image_layer_discarded: usize, + num_unique_keys_visited: usize, + wal_keys_visited: CompactionStatisticsNumSize, + image_keys_visited: CompactionStatisticsNumSize, + wal_produced: CompactionStatisticsNumSize, + image_produced: CompactionStatisticsNumSize, +} + +impl CompactionStatistics { + fn estimated_size_of_value(val: &Value) -> usize { + match val { + Value::Image(img) => img.len(), + Value::WalRecord(NeonWalRecord::Postgres { rec, .. }) => rec.len(), + _ => std::mem::size_of::(), + } + } + fn estimated_size_of_key() -> usize { + KEY_SIZE // TODO: distinguish image layer and delta layer (count LSN in delta layer) + } + fn visit_delta_layer(&mut self, size: u64) { + self.delta_layer_visited.num += 1; + self.delta_layer_visited.size += size; + } + fn visit_image_layer(&mut self, size: u64) { + self.image_layer_visited.num += 1; + self.image_layer_visited.size += size; + } + fn on_unique_key_visited(&mut self) { + self.num_unique_keys_visited += 1; + } + fn visit_wal_key(&mut self, val: &Value) { + self.wal_keys_visited.num += 1; + self.wal_keys_visited.size += + Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64; + } + fn visit_image_key(&mut self, val: &Value) { + self.image_keys_visited.num += 1; + self.image_keys_visited.size += + Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64; + } + fn produce_key(&mut self, val: &Value) { + match val { + Value::Image(img) => self.produce_image_key(img), + 
Value::WalRecord(_) => self.produce_wal_key(val), + } + } + fn produce_wal_key(&mut self, val: &Value) { + self.wal_produced.num += 1; + self.wal_produced.size += + Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64; + } + fn produce_image_key(&mut self, val: &Bytes) { + self.image_produced.num += 1; + self.image_produced.size += val.len() as u64 + Self::estimated_size_of_key() as u64; + } + fn discard_delta_layer(&mut self) { + self.num_delta_layer_discarded += 1; + } + fn discard_image_layer(&mut self) { + self.num_image_layer_discarded += 1; + } + fn produce_delta_layer(&mut self, size: u64) { + self.delta_layer_produced.num += 1; + self.delta_layer_produced.size += size; + } + fn produce_image_layer(&mut self, size: u64) { + self.image_layer_produced.num += 1; + self.image_layer_produced.size += size; + } +} + impl Timeline { /// TODO: cancellation + /// + /// Returns whether the compaction has pending tasks. pub(crate) async fn compact_legacy( self: &Arc, - _cancel: &CancellationToken, + cancel: &CancellationToken, flags: EnumSet, ctx: &RequestContext, - ) -> Result<(), CompactionError> { + ) -> Result { + if flags.contains(CompactFlags::EnhancedGcBottomMostCompaction) { + self.compact_with_gc(cancel, flags, ctx) + .await + .map_err(CompactionError::Other)?; + return Ok(false); + } + + if flags.contains(CompactFlags::DryRun) { + return Err(CompactionError::Other(anyhow!( + "dry-run mode is not supported for legacy compaction for now" + ))); + } + // High level strategy for compaction / image creation: // // 1. First, calculate the desired "partitioning" of the @@ -96,7 +267,7 @@ impl Timeline { // Define partitioning schema if needed // FIXME: the match should only cover repartitioning, not the next steps - let partition_count = match self + let (partition_count, has_pending_tasks) = match self .repartition( self.get_last_record_lsn(), self.get_compaction_target_size(), @@ -113,30 +284,35 @@ impl Timeline { // 2. 
Compact let timer = self.metrics.compact_time_histo.start_timer(); - self.compact_level0(target_file_size, ctx).await?; + let fully_compacted = self.compact_level0(target_file_size, ctx).await?; timer.stop_and_record(); - // 3. Create new image layers for partitions that have been modified - // "enough". let mut partitioning = dense_partitioning; partitioning .parts .extend(sparse_partitioning.into_dense().parts); - let image_layers = self - .create_image_layers( - &partitioning, - lsn, - if flags.contains(CompactFlags::ForceImageLayerCreation) { - ImageLayerCreationMode::Force - } else { - ImageLayerCreationMode::Try - }, - &image_ctx, - ) - .await?; - self.upload_new_image_layers(image_layers)?; - partitioning.parts.len() + // 3. Create new image layers for partitions that have been modified + // "enough". Skip image layer creation if L0 compaction cannot keep up. + if fully_compacted { + let image_layers = self + .create_image_layers( + &partitioning, + lsn, + if flags.contains(CompactFlags::ForceImageLayerCreation) { + ImageLayerCreationMode::Force + } else { + ImageLayerCreationMode::Try + }, + &image_ctx, + ) + .await?; + + self.upload_new_image_layers(image_layers)?; + } else { + info!("skipping image layer generation due to L0 compaction did not include all layers."); + } + (partitioning.parts.len(), !fully_compacted) } Err(err) => { // no partitioning? 
This is normal, if the timeline was just created @@ -148,7 +324,7 @@ impl Timeline { if !self.cancel.is_cancelled() { tracing::error!("could not compact, repartitioning keyspace failed: {err:?}"); } - 1 + (1, false) } }; @@ -161,7 +337,7 @@ impl Timeline { self.compact_shard_ancestors(rewrite_max, ctx).await?; } - Ok(()) + Ok(has_pending_tasks) } /// Check for layers that are elegible to be rewritten: @@ -176,7 +352,7 @@ impl Timeline { self: &Arc, rewrite_max: usize, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), CompactionError> { let mut drop_layers = Vec::new(); let mut layers_to_rewrite: Vec = Vec::new(); @@ -191,11 +367,11 @@ impl Timeline { tracing::info!( "latest_gc_cutoff: {}, pitr cutoff {}", *latest_gc_cutoff, - self.gc_info.read().unwrap().cutoffs.pitr + self.gc_info.read().unwrap().cutoffs.time ); let layers = self.layers.read().await; - for layer_desc in layers.layer_map().iter_historic_layers() { + for layer_desc in layers.layer_map()?.iter_historic_layers() { let layer = layers.get_from_desc(&layer_desc); if layer.metadata().shard.shard_count == self.shard_identity.count { // This layer does not belong to a historic ancestor, no need to re-image it. 
@@ -297,7 +473,8 @@ impl Timeline { layer.layer_desc().image_layer_lsn(), ctx, ) - .await?; + .await + .map_err(CompactionError::Other)?; // Safety of layer rewrites: // - We are writing to a different local file path than we are reading from, so the old Layer @@ -319,7 +496,10 @@ impl Timeline { .await?; if keys_written > 0 { - let new_layer = image_layer_writer.finish(self, ctx).await?; + let new_layer = image_layer_writer + .finish(self, ctx) + .await + .map_err(CompactionError::Other)?; tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes", layer.metadata().file_size, new_layer.metadata().file_size); @@ -347,23 +527,72 @@ impl Timeline { // necessary for correctness, but it simplifies testing, and avoids proceeding with another // Timeline's compaction while this timeline's uploads may be generating lots of disk I/O // load. - self.remote_client.wait_completion().await?; + match self.remote_client.wait_completion().await { + Ok(()) => (), + Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)), + Err(WaitCompletionError::UploadQueueShutDownOrStopped) => { + return Err(CompactionError::ShuttingDown) + } + } fail::fail_point!("compact-shard-ancestors-persistent"); Ok(()) } + /// Update the LayerVisibilityHint of layers covered by image layers, based on whether there is + /// an image layer between them and the most recent readable LSN (branch point or tip of timeline). The + /// purpose of the visibility hint is to record which layers need to be available to service reads. + /// + /// The result may be used as an input to eviction and secondary downloads to de-prioritize layers + /// that we know won't be needed for reads. + pub(super) async fn update_layer_visibility( + &self, + ) -> Result<(), super::layer_manager::Shutdown> { + let head_lsn = self.get_last_record_lsn(); + + // We will sweep through layers in reverse-LSN order. We only do historic layers. 
L0 deltas + // are implicitly left visible, because LayerVisibilityHint's default is Visible, and we never modify it here. + // Note that L0 deltas _can_ be covered by image layers, but we consider them 'visible' because we anticipate that + // they will be subject to L0->L1 compaction in the near future. + let layer_manager = self.layers.read().await; + let layer_map = layer_manager.layer_map()?; + + let readable_points = { + let children = self.gc_info.read().unwrap().retain_lsns.clone(); + + let mut readable_points = Vec::with_capacity(children.len() + 1); + for (child_lsn, _child_timeline_id) in &children { + readable_points.push(*child_lsn); + } + readable_points.push(head_lsn); + readable_points + }; + + let (layer_visibility, covered) = layer_map.get_visibility(readable_points); + for (layer_desc, visibility) in layer_visibility { + // FIXME: a more efficient bulk zip() through the layers rather than NlogN getting each one + let layer = layer_manager.get_from_desc(&layer_desc); + layer.set_visibility(visibility); + } + + // TODO: publish our covered KeySpace to our parent, so that when they update their visibility, they can + // avoid assuming that everything at a branch point is visible. + drop(covered); + Ok(()) + } + /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as - /// as Level 1 files. + /// Level 1 files. Returns whether the L0 layers are fully compacted.
async fn compact_level0( self: &Arc, target_file_size: u64, ctx: &RequestContext, - ) -> Result<(), CompactionError> { + ) -> Result { let CompactLevel0Phase1Result { new_layers, deltas_to_compact, + fully_compacted, } = { let phase1_span = info_span!("compact_level0_phase1"); let ctx = ctx.attached_child(); @@ -375,7 +604,7 @@ impl Timeline { }; let begin = tokio::time::Instant::now(); - let phase1_layers_locked = Arc::clone(&self.layers).read_owned().await; + let phase1_layers_locked = self.layers.read().await; let now = tokio::time::Instant::now(); stats.read_lock_acquisition_micros = DurationRecorder::Recorded(RecordedDuration(now - begin), now); @@ -386,31 +615,28 @@ impl Timeline { if new_layers.is_empty() && deltas_to_compact.is_empty() { // nothing to do - return Ok(()); + return Ok(true); } self.finish_compact_batch(&new_layers, &Vec::new(), &deltas_to_compact) .await?; - Ok(()) + Ok(fully_compacted) } /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment. - async fn compact_level0_phase1( - self: &Arc, - guard: tokio::sync::OwnedRwLockReadGuard, + async fn compact_level0_phase1<'a>( + self: &'a Arc, + guard: tokio::sync::RwLockReadGuard<'a, LayerManager>, mut stats: CompactLevel0Phase1StatsBuilder, target_file_size: u64, ctx: &RequestContext, ) -> Result { stats.read_lock_held_spawn_blocking_startup_micros = stats.read_lock_acquisition_micros.till_now(); // set by caller - let layers = guard.layer_map(); - let level0_deltas = layers.get_level0_deltas()?; - let mut level0_deltas = level0_deltas - .into_iter() - .map(|x| guard.get_from_desc(&x)) - .collect_vec(); + let layers = guard.layer_map()?; + let level0_deltas = layers.level0_deltas(); stats.level0_deltas_count = Some(level0_deltas.len()); + // Only compact if enough layers have accumulated. 
let threshold = self.get_compaction_threshold(); if level0_deltas.is_empty() || level0_deltas.len() < threshold { @@ -421,6 +647,11 @@ impl Timeline { return Ok(CompactLevel0Phase1Result::default()); } + let mut level0_deltas = level0_deltas + .iter() + .map(|x| guard.get_from_desc(x)) + .collect::>(); + // Gather the files to compact in this iteration. // // Start with the oldest Level 0 delta file, and collect any other @@ -441,6 +672,24 @@ impl Timeline { let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end; let mut deltas_to_compact = Vec::with_capacity(level0_deltas.len()); + // Accumulate the size of layers in `deltas_to_compact` + let mut deltas_to_compact_bytes = 0; + + // Under normal circumstances, we will accumulate up to compaction_interval L0s of size + // checkpoint_distance each. To avoid edge cases using extra system resources, bound our + // work in this function to only operate on this much delta data at once. + // + // Take the max of the configured value & the default, so that tests that configure tiny values + // can still use a sensible amount of memory, but if a deployed system configures bigger values we + // still let them compact a full stack of L0s in one go. 
+ let delta_size_limit = std::cmp::max( + self.get_compaction_threshold(), + DEFAULT_COMPACTION_THRESHOLD, + ) as u64 + * std::cmp::max(self.get_checkpoint_distance(), DEFAULT_CHECKPOINT_DISTANCE); + + let mut fully_compacted = true; + deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?); for l in level0_deltas_iter { let lsn_range = &l.layer_desc().lsn_range; @@ -449,7 +698,21 @@ impl Timeline { break; } deltas_to_compact.push(l.download_and_keep_resident().await?); + deltas_to_compact_bytes += l.metadata().file_size; prev_lsn_end = lsn_range.end; + + if deltas_to_compact_bytes >= delta_size_limit { + info!( + l0_deltas_selected = deltas_to_compact.len(), + l0_deltas_total = level0_deltas.len(), + "L0 compaction picker hit max delta layer size limit: {}", + delta_size_limit + ); + fully_compacted = false; + + // Proceed with compaction, but only a subset of L0s + break; + } } let lsn_range = Range { start: deltas_to_compact @@ -481,66 +744,238 @@ impl Timeline { .read_lock_held_spawn_blocking_startup_micros .till_now(); - // Determine N largest holes where N is number of compacted layers. - let max_holes = deltas_to_compact.len(); - let last_record_lsn = self.get_last_record_lsn(); - let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128; - let min_hole_coverage_size = 3; // TODO: something more flexible? - - // min-heap (reserve space for one more element added before eviction) - let mut heap: BinaryHeap = BinaryHeap::with_capacity(max_holes + 1); - let mut prev: Option = None; - - let mut all_keys = Vec::new(); - - for l in deltas_to_compact.iter() { - all_keys.extend(l.load_keys(ctx).await?); - } - - // FIXME: should spawn_blocking the rest of this function - - // The current stdlib sorting implementation is designed in a way where it is - // particularly fast where the slice is made up of sorted sub-ranges. - all_keys.sort_by_key(|DeltaEntry { key, lsn, .. 
}| (*key, *lsn)); + // TODO: replace with streaming k-merge + let all_keys = { + let mut all_keys = Vec::new(); + for l in deltas_to_compact.iter() { + if self.cancel.is_cancelled() { + return Err(CompactionError::ShuttingDown); + } + all_keys.extend(l.load_keys(ctx).await.map_err(CompactionError::Other)?); + } + // The current stdlib sorting implementation is designed in a way where it is + // particularly fast where the slice is made up of sorted sub-ranges. + all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn)); + all_keys + }; stats.read_lock_held_key_sort_micros = stats.read_lock_held_prerequisites_micros.till_now(); - for &DeltaEntry { key: next_key, .. } in all_keys.iter() { - if let Some(prev_key) = prev { - // just first fast filter, do not create hole entries for metadata keys. The last hole in the - // compaction is the gap between data key and metadata keys. - if next_key.to_i128() - prev_key.to_i128() >= min_hole_range - && !Key::is_metadata_key(&prev_key) - { - let key_range = prev_key..next_key; - // Measuring hole by just subtraction of i128 representation of key range boundaries - // has not so much sense, because largest holes will corresponds field1/field2 changes. - // But we are mostly interested to eliminate holes which cause generation of excessive image layers. - // That is why it is better to measure size of hole as number of covering image layers. - let coverage_size = layers.image_coverage(&key_range, last_record_lsn).len(); - if coverage_size >= min_hole_coverage_size { - heap.push(Hole { - key_range, - coverage_size, - }); - if heap.len() > max_holes { - heap.pop(); // remove smallest hole + // Determine N largest holes where N is number of compacted layers. The vec is sorted by key range start. + // + // A hole is a key range for which this compaction doesn't have any WAL records. 
+ // Our goal in this compaction iteration is to avoid creating L1s that, in terms of their key range, + // cover the hole, but actually don't contain any WAL records for that key range. + // The reason is that the mere stack of L1s (`count_deltas`) triggers image layer creation (`create_image_layers`). + // That image layer creation would be useless for a hole range covered by L1s that don't contain any WAL records. + // + // The algorithm chooses holes as follows. + // - Slide a 2-window over the keys in key order to get the hole range (=distance between two keys). + // - Filter: min threshold on range length + // - Rank: by coverage size (=number of image layers required to reconstruct each key in the range for which we have any data) + // + // For more details, intuition, and some ASCII art see https://github.com/neondatabase/neon/pull/3597#discussion_r1112704451 + #[derive(PartialEq, Eq)] + struct Hole { + key_range: Range, + coverage_size: usize, + } + let holes: Vec = { + use std::cmp::Ordering; + impl Ord for Hole { + fn cmp(&self, other: &Self) -> Ordering { + self.coverage_size.cmp(&other.coverage_size).reverse() + } + } + impl PartialOrd for Hole { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } + } + let max_holes = deltas_to_compact.len(); + let last_record_lsn = self.get_last_record_lsn(); + let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128; + let min_hole_coverage_size = 3; // TODO: something more flexible? + // min-heap (reserve space for one more element added before eviction) + let mut heap: BinaryHeap = BinaryHeap::with_capacity(max_holes + 1); + let mut prev: Option = None; + + for &DeltaEntry { key: next_key, .. } in all_keys.iter() { + if let Some(prev_key) = prev { + // just first fast filter, do not create hole entries for metadata keys. The last hole in the + // compaction is the gap between data key and metadata keys.
+ if next_key.to_i128() - prev_key.to_i128() >= min_hole_range + && !Key::is_metadata_key(&prev_key) + { + let key_range = prev_key..next_key; + // Measuring hole by just subtraction of i128 representation of key range boundaries + // has not so much sense, because largest holes will corresponds field1/field2 changes. + // But we are mostly interested to eliminate holes which cause generation of excessive image layers. + // That is why it is better to measure size of hole as number of covering image layers. + let coverage_size = + layers.image_coverage(&key_range, last_record_lsn).len(); + if coverage_size >= min_hole_coverage_size { + heap.push(Hole { + key_range, + coverage_size, + }); + if heap.len() > max_holes { + heap.pop(); // remove smallest hole + } } } } + prev = Some(next_key.next()); } - prev = Some(next_key.next()); - } + let mut holes = heap.into_vec(); + holes.sort_unstable_by_key(|hole| hole.key_range.start); + holes + }; stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now(); drop_rlock(guard); + + if self.cancel.is_cancelled() { + return Err(CompactionError::ShuttingDown); + } + stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now(); - let mut holes = heap.into_vec(); - holes.sort_unstable_by_key(|hole| hole.key_range.start); - let mut next_hole = 0; // index of next hole in holes vector // This iterator walks through all key-value pairs from all the layers // we're compacting, in key, LSN order. - let all_values_iter = all_keys.iter(); + // If there's both a Value::Image and Value::WalRecord for the same (key,lsn), + // then the Value::Image is ordered before Value::WalRecord. + // + // TODO(https://github.com/neondatabase/neon/issues/8184): remove the page cached blob_io + // option and validation code once we've reached confidence. 
+ enum AllValuesIter<'a> { + PageCachedBlobIo { + all_keys_iter: VecIter<'a>, + }, + StreamingKmergeBypassingPageCache { + merge_iter: MergeIterator<'a>, + }, + ValidatingStreamingKmergeBypassingPageCache { + mode: CompactL0BypassPageCacheValidation, + merge_iter: MergeIterator<'a>, + all_keys_iter: VecIter<'a>, + }, + } + type VecIter<'a> = std::slice::Iter<'a, DeltaEntry<'a>>; // TODO: distinguished lifetimes + impl AllValuesIter<'_> { + async fn next_all_keys_iter( + iter: &mut VecIter<'_>, + ctx: &RequestContext, + ) -> anyhow::Result> { + let Some(DeltaEntry { + key, + lsn, + val: value_ref, + .. + }) = iter.next() + else { + return Ok(None); + }; + let value = value_ref.load(ctx).await?; + Ok(Some((*key, *lsn, value))) + } + async fn next( + &mut self, + ctx: &RequestContext, + ) -> anyhow::Result> { + match self { + AllValuesIter::PageCachedBlobIo { all_keys_iter: iter } => { + Self::next_all_keys_iter(iter, ctx).await + } + AllValuesIter::StreamingKmergeBypassingPageCache { merge_iter } => merge_iter.next().await, + AllValuesIter::ValidatingStreamingKmergeBypassingPageCache { mode, merge_iter, all_keys_iter } => async { + // advance both iterators + let all_keys_iter_item = Self::next_all_keys_iter(all_keys_iter, ctx).await; + let merge_iter_item = merge_iter.next().await; + // compare results & log warnings as needed + macro_rules! 
rate_limited_warn { + ($($arg:tt)*) => {{ + if cfg!(debug_assertions) || cfg!(feature = "testing") { + warn!($($arg)*); + panic!("CompactL0BypassPageCacheValidation failure, check logs"); + } + use once_cell::sync::Lazy; + use utils::rate_limit::RateLimit; + use std::sync::Mutex; + use std::time::Duration; + static LOGGED: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut rate_limit = LOGGED.lock().unwrap(); + rate_limit.call(|| { + warn!($($arg)*); + }); + }} + } + match (&all_keys_iter_item, &merge_iter_item) { + (Err(_), Err(_)) => { + // don't bother asserting equivalence of the errors + } + (Err(all_keys), Ok(merge)) => { + rate_limited_warn!(?merge, "all_keys_iter returned an error where merge did not: {all_keys:?}"); + }, + (Ok(all_keys), Err(merge)) => { + rate_limited_warn!(?all_keys, "merge returned an error where all_keys_iter did not: {merge:?}"); + }, + (Ok(None), Ok(None)) => { } + (Ok(Some(all_keys)), Ok(None)) => { + rate_limited_warn!(?all_keys, "merge returned None where all_keys_iter returned Some"); + } + (Ok(None), Ok(Some(merge))) => { + rate_limited_warn!(?merge, "all_keys_iter returned None where merge returned Some"); + } + (Ok(Some((all_keys_key, all_keys_lsn, all_keys_value))), Ok(Some((merge_key, merge_lsn, merge_value)))) => { + match mode { + // TODO: in this mode, we still load the value from disk for both iterators, even though we only need the all_keys_iter one + CompactL0BypassPageCacheValidation::KeyLsn => { + let all_keys = (all_keys_key, all_keys_lsn); + let merge = (merge_key, merge_lsn); + if all_keys != merge { + rate_limited_warn!(?all_keys, ?merge, "merge returned a different (Key,LSN) than all_keys_iter"); + } + } + CompactL0BypassPageCacheValidation::KeyLsnValue => { + let all_keys = (all_keys_key, all_keys_lsn, all_keys_value); + let merge = (merge_key, merge_lsn, merge_value); + if all_keys != merge { + rate_limited_warn!(?all_keys, ?merge, "merge returned a different (Key,LSN,Value)
than all_keys_iter"); + } + } + } + } + } + // in case of mismatch, trust the legacy all_keys_iter_item + all_keys_iter_item + }.instrument(info_span!("next")).await + } + } + } + let mut all_values_iter = match &self.conf.compact_level0_phase1_value_access { + CompactL0Phase1ValueAccess::PageCachedBlobIo => AllValuesIter::PageCachedBlobIo { + all_keys_iter: all_keys.iter(), + }, + CompactL0Phase1ValueAccess::StreamingKmerge { validate } => { + let merge_iter = { + let mut deltas = Vec::with_capacity(deltas_to_compact.len()); + for l in deltas_to_compact.iter() { + let l = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?; + deltas.push(l); + } + MergeIterator::create(&deltas, &[], ctx) + }; + match validate { + None => AllValuesIter::StreamingKmergeBypassingPageCache { merge_iter }, + Some(validate) => AllValuesIter::ValidatingStreamingKmergeBypassingPageCache { + mode: validate.clone(), + merge_iter, + all_keys_iter: all_keys.iter(), + }, + } + } + }; // This iterator walks through all keys and is needed to calculate size used by each key let mut all_keys_iter = all_keys @@ -611,12 +1046,24 @@ impl Timeline { let mut key_values_total_size = 0u64; let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key + let mut next_hole = 0; // index of next hole in holes vector - for &DeltaEntry { - key, lsn, ref val, .. - } in all_values_iter + let mut keys = 0; + + while let Some((key, lsn, value)) = all_values_iter + .next(ctx) + .await + .map_err(CompactionError::Other)? { - let value = val.load(ctx).await?; + keys += 1; + + if keys % 32_768 == 0 && self.cancel.is_cancelled() { + // avoid hitting the cancellation token on every key. in benches, we end up + // shuffling an order of million keys per layer, this means we'll check it + // around tens of times per layer. 
+ return Err(CompactionError::ShuttingDown); + } + let same_key = prev_key.map_or(false, |prev_key| prev_key == key); // We need to check key boundaries once we reach next key or end of layer with the same key if !same_key || lsn == dup_end_lsn { @@ -668,13 +1115,16 @@ impl Timeline { || contains_hole { // ... if so, flush previous layer and prepare to write new one - new_layers.push( - writer - .take() - .unwrap() - .finish(prev_key.unwrap().next(), self, ctx) - .await?, - ); + let (desc, path) = writer + .take() + .unwrap() + .finish(prev_key.unwrap().next(), ctx) + .await + .map_err(CompactionError::Other)?; + let new_delta = Layer::finish_creating(self.conf, self, desc, &path) + .map_err(CompactionError::Other)?; + + new_layers.push(new_delta); writer = None; if contains_hole { @@ -694,6 +1144,10 @@ impl Timeline { if !self.shard_identity.is_key_disposable(&key) { if writer.is_none() { + if self.cancel.is_cancelled() { + // to be somewhat responsive to cancellation, check for each new layer + return Err(CompactionError::ShuttingDown); + } // Create writer if not initiaized yet writer = Some( DeltaLayerWriter::new( @@ -711,15 +1165,19 @@ impl Timeline { }, ctx, ) - .await?, + .await + .map_err(CompactionError::Other)?, ); + + keys = 0; } writer .as_mut() .unwrap() .put_value(key, lsn, value, ctx) - .await?; + .await + .map_err(CompactionError::Other)?; } else { debug!( "Dropping key {} during compaction (it belongs on shard {:?})", @@ -735,7 +1193,13 @@ impl Timeline { prev_key = Some(key); } if let Some(writer) = writer { - new_layers.push(writer.finish(prev_key.unwrap().next(), self, ctx).await?); + let (desc, path) = writer + .finish(prev_key.unwrap().next(), ctx) + .await + .map_err(CompactionError::Other)?; + let new_delta = Layer::finish_creating(self.conf, self, desc, &path) + .map_err(CompactionError::Other)?; + new_layers.push(new_delta); } // Sync layers @@ -793,12 +1257,17 @@ impl Timeline { } } + // Without this, rustc complains about 
deltas_to_compact still + // being borrowed when we `.into_iter()` below. + drop(all_values_iter); + Ok(CompactLevel0Phase1Result { new_layers, deltas_to_compact: deltas_to_compact .into_iter() .map(|x| x.drop_eviction_guard()) .collect::>(), + fully_compacted, }) } } @@ -807,6 +1276,9 @@ impl Timeline { struct CompactLevel0Phase1Result { new_layers: Vec, deltas_to_compact: Vec, + // Whether we have included all L0 layers, or selected only part of them due to the + // L0 compaction size limit. + fully_compacted: bool, } #[derive(Default)] @@ -896,6 +1368,43 @@ impl TryFrom for CompactLevel0Phase1Stats { } } +#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)] +#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] +pub enum CompactL0Phase1ValueAccess { + /// The old way. + PageCachedBlobIo, + /// The new way. + StreamingKmerge { + /// If set, we run both the old way and the new way, validate that + /// they are identical (=> [`CompactL0BypassPageCacheValidation`]), + /// and if the validation fails, + /// - in tests: fail them with a panic or + /// - in prod, log a rate-limited warning and use the old way's results. + /// + /// If not set, we only run the new way and trust its results. + validate: Option, + }, +} + +/// See [`CompactL0Phase1ValueAccess::StreamingKmerge`]. +#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)] +#[serde(rename_all = "kebab-case")] +pub enum CompactL0BypassPageCacheValidation { + /// Validate that the series of (key, lsn) pairs are the same. + KeyLsn, + /// Validate that the entire output of old and new way is identical. 
+ KeyLsnValue, +} + +impl Default for CompactL0Phase1ValueAccess { + fn default() -> Self { + CompactL0Phase1ValueAccess::StreamingKmerge { + // TODO(https://github.com/neondatabase/neon/issues/8184): change to None once confident + validate: Some(CompactL0BypassPageCacheValidation::KeyLsnValue), + } + } +} + impl Timeline { /// Entry point for new tiered compaction algorithm. /// @@ -915,10 +1424,9 @@ impl Timeline { // Find the top of the historical layers let end_lsn = { let guard = self.layers.read().await; - let layers = guard.layer_map(); + let layers = guard.layer_map()?; - let l0_deltas = layers.get_level0_deltas()?; - drop(guard); + let l0_deltas = layers.level0_deltas(); // As an optimization, if we find that there are too few L0 layers, // bail out early. We know that the compaction algorithm would do @@ -947,181 +1455,792 @@ impl Timeline { fanout, ctx, ) - .await?; + .await + // TODO: compact_tiered needs to return CompactionError + .map_err(CompactionError::Other)?; adaptor.flush_updates().await?; Ok(()) } + /// Take a list of images and deltas, produce images and deltas according to GC horizon and retain_lsns. + /// + /// It takes a key, the values of the key within the compaction process, a GC horizon, and all retain_lsns below the horizon. + /// For now, it requires the `accumulated_values` contains the full history of the key (i.e., the key with the lowest LSN is + /// an image or a WAL not requiring a base image). This restriction will be removed once we implement gc-compaction on branch. + /// + /// The function returns the deltas and the base image that need to be placed at each of the retain LSN. 
For example, we have: + /// + /// A@0x10, +B@0x20, +C@0x30, +D@0x40, +E@0x50, +F@0x60 + /// horizon = 0x50, retain_lsn = 0x20, 0x40, delta_threshold=3 + /// + /// The function will produce: + /// + /// ```plain + /// 0x20(retain_lsn) -> img=AB@0x20 always produce a single image below the lowest retain LSN + /// 0x40(retain_lsn) -> deltas=[+C@0x30, +D@0x40] two deltas since the last base image, keeping the deltas + /// 0x50(horizon) -> deltas=[ABCDE@0x50] three deltas since the last base image, generate an image but put it in the delta + /// above_horizon -> deltas=[+F@0x60] full history above the horizon + /// ``` + /// + /// Note that `accumulated_values` must be sorted by LSN and should belong to a single key. + pub(crate) async fn generate_key_retention( + self: &Arc, + key: Key, + full_history: &[(Key, Lsn, Value)], + horizon: Lsn, + retain_lsn_below_horizon: &[Lsn], + delta_threshold_cnt: usize, + base_img_from_ancestor: Option<(Key, Lsn, Bytes)>, + ) -> anyhow::Result { + // Pre-checks for the invariants + if cfg!(debug_assertions) { + for (log_key, _, _) in full_history { + assert_eq!(log_key, &key, "mismatched key"); + } + for i in 1..full_history.len() { + assert!(full_history[i - 1].1 <= full_history[i].1, "unordered LSN"); + if full_history[i - 1].1 == full_history[i].1 { + assert!( + matches!(full_history[i - 1].2, Value::Image(_)), + "unordered delta/image, or duplicated delta" + ); + } + } + // There was an assertion for no base image that checks if the first + // record in the history is `will_init` before, but it was removed. + // This is explained in the test cases for generate_key_retention. + // Search "incomplete history" for more information. 
+ for lsn in retain_lsn_below_horizon { + assert!(lsn < &horizon, "retain lsn must be below horizon") + } + for i in 1..retain_lsn_below_horizon.len() { + assert!( + retain_lsn_below_horizon[i - 1] <= retain_lsn_below_horizon[i], + "unordered LSN" + ); + } + } + let has_ancestor = base_img_from_ancestor.is_some(); + // Step 1: split history into len(retain_lsn_below_horizon) + 2 buckets, where the last bucket is for all deltas above the horizon, + // and the second-to-last bucket is for the horizon. Each bucket contains lsn_last_bucket < deltas <= lsn_this_bucket. + let (mut split_history, lsn_split_points) = { + let mut split_history = Vec::new(); + split_history.resize_with(retain_lsn_below_horizon.len() + 2, Vec::new); + let mut lsn_split_points = Vec::with_capacity(retain_lsn_below_horizon.len() + 1); + for lsn in retain_lsn_below_horizon { + lsn_split_points.push(*lsn); + } + lsn_split_points.push(horizon); + let mut current_idx = 0; + for item @ (_, lsn, _) in full_history { + while current_idx < lsn_split_points.len() && *lsn > lsn_split_points[current_idx] { + current_idx += 1; + } + split_history[current_idx].push(item); + } + (split_history, lsn_split_points) + }; + // Step 2: filter out duplicated records due to the k-merge of image/delta layers + for split_for_lsn in &mut split_history { + let mut prev_lsn = None; + let mut new_split_for_lsn = Vec::with_capacity(split_for_lsn.len()); + for record @ (_, lsn, _) in std::mem::take(split_for_lsn) { + if let Some(prev_lsn) = &prev_lsn { + if *prev_lsn == lsn { + // The case that we have an LSN with both data from the delta layer and the image layer. As + // `ValueWrapper` ensures that an image is ordered before a delta at the same LSN, we simply + // drop this delta and keep the image. + // + // For example, we have delta layer key1@0x10, key1@0x20, and image layer key1@0x10, we will + // keep the image for key1@0x10 and the delta for key1@0x20. key1@0x10 delta will be simply + // dropped. 
+ // + // TODO: in case we have both delta + images for a given LSN and it does not exceed the delta + // threshold, we could have kept delta instead to save space. This is an optimization for the future. + continue; + } + } + prev_lsn = Some(lsn); + new_split_for_lsn.push(record); + } + *split_for_lsn = new_split_for_lsn; + } + // Step 3: generate images when necessary + let mut retention = Vec::with_capacity(split_history.len()); + let mut records_since_last_image = 0; + let batch_cnt = split_history.len(); + assert!( + batch_cnt >= 2, + "should have at least below + above horizon batches" + ); + let mut replay_history: Vec<(Key, Lsn, Value)> = Vec::new(); + if let Some((key, lsn, img)) = base_img_from_ancestor { + replay_history.push((key, lsn, Value::Image(img))); + } + + /// Generate debug information for the replay history + fn generate_history_trace(replay_history: &[(Key, Lsn, Value)]) -> String { + use std::fmt::Write; + let mut output = String::new(); + if let Some((key, _, _)) = replay_history.first() { + write!(output, "key={} ", key).unwrap(); + let mut cnt = 0; + for (_, lsn, val) in replay_history { + if val.is_image() { + write!(output, "i@{} ", lsn).unwrap(); + } else if val.will_init() { + write!(output, "di@{} ", lsn).unwrap(); + } else { + write!(output, "d@{} ", lsn).unwrap(); + } + cnt += 1; + if cnt >= 128 { + write!(output, "... 
and more").unwrap(); + break; + } + } + } else { + write!(output, "").unwrap(); + } + output + } + + fn generate_debug_trace( + replay_history: Option<&[(Key, Lsn, Value)]>, + full_history: &[(Key, Lsn, Value)], + lsns: &[Lsn], + horizon: Lsn, + ) -> String { + use std::fmt::Write; + let mut output = String::new(); + if let Some(replay_history) = replay_history { + writeln!( + output, + "replay_history: {}", + generate_history_trace(replay_history) + ) + .unwrap(); + } else { + writeln!(output, "replay_history: ",).unwrap(); + } + writeln!( + output, + "full_history: {}", + generate_history_trace(full_history) + ) + .unwrap(); + writeln!( + output, + "when processing: [{}] horizon={}", + lsns.iter().map(|l| format!("{l}")).join(","), + horizon + ) + .unwrap(); + output + } + + for (i, split_for_lsn) in split_history.into_iter().enumerate() { + // TODO: there could be image keys inside the splits, and we can compute records_since_last_image accordingly. + records_since_last_image += split_for_lsn.len(); + let generate_image = if i == 0 && !has_ancestor { + // We always generate images for the first batch (below horizon / lowest retain_lsn) + true + } else if i == batch_cnt - 1 { + // Do not generate images for the last batch (above horizon) + false + } else if records_since_last_image >= delta_threshold_cnt { + // Generate images when there are too many records + true + } else { + false + }; + replay_history.extend(split_for_lsn.iter().map(|x| (*x).clone())); + // Only retain the items after the last image record + for idx in (0..replay_history.len()).rev() { + if replay_history[idx].2.will_init() { + replay_history = replay_history[idx..].to_vec(); + break; + } + } + if let Some((_, _, val)) = replay_history.first() { + if !val.will_init() { + return Err(anyhow::anyhow!("invalid history, no base image")).with_context( + || { + generate_debug_trace( + Some(&replay_history), + full_history, + retain_lsn_below_horizon, + horizon, + ) + }, + ); + } + } + if 
generate_image && records_since_last_image > 0 { + records_since_last_image = 0; + let replay_history_for_debug = if cfg!(debug_assertions) { + Some(replay_history.clone()) + } else { + None + }; + let replay_history_for_debug_ref = replay_history_for_debug.as_deref(); + let history = std::mem::take(&mut replay_history); + let mut img = None; + let mut records = Vec::with_capacity(history.len()); + if let (_, lsn, Value::Image(val)) = history.first().as_ref().unwrap() { + img = Some((*lsn, val.clone())); + for (_, lsn, val) in history.into_iter().skip(1) { + let Value::WalRecord(rec) = val else { + return Err(anyhow::anyhow!( + "invalid record, first record is image, expect walrecords" + )) + .with_context(|| { + generate_debug_trace( + replay_history_for_debug_ref, + full_history, + retain_lsn_below_horizon, + horizon, + ) + }); + }; + records.push((lsn, rec)); + } + } else { + for (_, lsn, val) in history.into_iter() { + let Value::WalRecord(rec) = val else { + return Err(anyhow::anyhow!("invalid record, first record is walrecord, expect rest are walrecord")) + .with_context(|| generate_debug_trace( + replay_history_for_debug_ref, + full_history, + retain_lsn_below_horizon, + horizon, + )); + }; + records.push((lsn, rec)); + } + } + records.reverse(); + let state = ValueReconstructState { img, records }; + let request_lsn = lsn_split_points[i]; // last batch does not generate image so i is always in range + let img = self.reconstruct_value(key, request_lsn, state).await?; + replay_history.push((key, request_lsn, Value::Image(img.clone()))); + retention.push(vec![(request_lsn, Value::Image(img))]); + } else { + let deltas = split_for_lsn + .iter() + .map(|(_, lsn, value)| (*lsn, value.clone())) + .collect_vec(); + retention.push(deltas); + } + } + let mut result = Vec::with_capacity(retention.len()); + assert_eq!(retention.len(), lsn_split_points.len() + 1); + for (idx, logs) in retention.into_iter().enumerate() { + if idx == lsn_split_points.len() { + return 
Ok(KeyHistoryRetention { + below_horizon: result, + above_horizon: KeyLogAtLsn(logs), + }); + } else { + result.push((lsn_split_points[idx], KeyLogAtLsn(logs))); + } + } + unreachable!("key retention is empty") + } + /// An experimental compaction building block that combines compaction with garbage collection. /// /// The current implementation picks all delta + image layers that are below or intersecting with /// the GC horizon without considering retain_lsns. Then, it does a full compaction over all these delta /// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon, /// and create delta layers with all deltas >= gc horizon. - #[cfg(test)] pub(crate) async fn compact_with_gc( self: &Arc, - _cancel: &CancellationToken, + cancel: &CancellationToken, + flags: EnumSet, ctx: &RequestContext, - ) -> Result<(), CompactionError> { - use crate::tenant::storage_layer::ValueReconstructState; + ) -> anyhow::Result<()> { + use std::collections::BTreeSet; + + // Block other compaction/GC tasks from running for now. GC-compaction could run along + // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc. + // Note that we already acquired the compaction lock when the outer `compact` function gets called. + + let gc_lock = async { + tokio::select! { + guard = self.gc_lock.lock() => Ok(guard), + // TODO: refactor to CompactionError to correctly pass cancelled error + _ = cancel.cancelled() => Err(anyhow!("cancelled")), + } + }; + + let gc_lock = crate::timed( + gc_lock, + "acquires gc lock", + std::time::Duration::from_secs(5), + ) + .await?; + + let dry_run = flags.contains(CompactFlags::DryRun); + + info!("running enhanced gc bottom-most compaction, dry_run={dry_run}"); + + scopeguard::defer! { + info!("done enhanced gc bottom-most compaction"); + }; + + let mut stat = CompactionStatistics::default(); + // Step 0: pick all delta layers + image layers below/intersect with the GC horizon. 
// The layer selection has the following properties: // 1. If a layer is in the selection, all layers below it are in the selection. // 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection. - let (layer_selection, gc_cutoff) = { + let (layer_selection, gc_cutoff, retain_lsns_below_horizon) = { let guard = self.layers.read().await; - let layers = guard.layer_map(); + let layers = guard.layer_map()?; let gc_info = self.gc_info.read().unwrap(); - let gc_cutoff = Lsn::min(gc_info.cutoffs.horizon, gc_info.cutoffs.pitr); + let mut retain_lsns_below_horizon = Vec::new(); + let gc_cutoff = gc_info.cutoffs.select_min(); + for (lsn, _timeline_id) in &gc_info.retain_lsns { + if lsn < &gc_cutoff { + retain_lsns_below_horizon.push(*lsn); + } + } + for lsn in gc_info.leases.keys() { + if lsn < &gc_cutoff { + retain_lsns_below_horizon.push(*lsn); + } + } let mut selected_layers = Vec::new(); - // TODO: consider retain_lsns drop(gc_info); for desc in layers.iter_historic_layers() { if desc.get_lsn_range().start <= gc_cutoff { selected_layers.push(guard.get_from_desc(&desc)); } } - (selected_layers, gc_cutoff) + retain_lsns_below_horizon.sort(); + (selected_layers, gc_cutoff, retain_lsns_below_horizon) }; + let lowest_retain_lsn = if self.ancestor_timeline.is_some() { + Lsn(self.ancestor_lsn.0 + 1) + } else { + let res = retain_lsns_below_horizon + .first() + .copied() + .unwrap_or(gc_cutoff); + if cfg!(debug_assertions) { + assert_eq!( + res, + retain_lsns_below_horizon + .iter() + .min() + .copied() + .unwrap_or(gc_cutoff) + ); + } + res + }; + info!( + "picked {} layers for compaction with gc_cutoff={} lowest_retain_lsn={}", + layer_selection.len(), + gc_cutoff, + lowest_retain_lsn + ); // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs. 
- let mut all_key_values = Vec::new(); + // Also, collect the layer information to decide when to split the new delta layers. + let mut downloaded_layers = Vec::new(); + let mut delta_split_points = BTreeSet::new(); for layer in &layer_selection { - all_key_values.extend(layer.load_key_values(ctx).await?); - } - // Key small to large, LSN low to high, if the same LSN has both image and delta due to the merge of delta layers and - // image layers, make image appear later than delta. - struct ValueWrapper<'a>(&'a crate::repository::Value); - impl Ord for ValueWrapper<'_> { - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - use crate::repository::Value; - use std::cmp::Ordering; - match (self.0, other.0) { - (Value::Image(_), Value::WalRecord(_)) => Ordering::Greater, - (Value::WalRecord(_), Value::Image(_)) => Ordering::Less, - _ => Ordering::Equal, - } + let resident_layer = layer.download_and_keep_resident().await?; + downloaded_layers.push(resident_layer); + + let desc = layer.layer_desc(); + if desc.is_delta() { + // TODO: is it correct to only record split points for deltas intersecting with the GC horizon? (exclude those below/above the horizon) + // so that we can avoid having too many small delta layers. 
+ let key_range = desc.get_key_range(); + delta_split_points.insert(key_range.start); + delta_split_points.insert(key_range.end); + stat.visit_delta_layer(desc.file_size()); + } else { + stat.visit_image_layer(desc.file_size()); } } - impl PartialOrd for ValueWrapper<'_> { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) + let mut delta_layers = Vec::new(); + let mut image_layers = Vec::new(); + for resident_layer in &downloaded_layers { + if resident_layer.layer_desc().is_delta() { + let layer = resident_layer.get_as_delta(ctx).await?; + delta_layers.push(layer); + } else { + let layer = resident_layer.get_as_image(ctx).await?; + image_layers.push(layer); } } - impl PartialEq for ValueWrapper<'_> { - fn eq(&self, other: &Self) -> bool { - self.cmp(other) == std::cmp::Ordering::Equal - } - } - impl Eq for ValueWrapper<'_> {} - all_key_values.sort_by(|(k1, l1, v1), (k2, l2, v2)| { - (k1, l1, ValueWrapper(v1)).cmp(&(k2, l2, ValueWrapper(v2))) - }); - let max_lsn = all_key_values - .iter() - .map(|(_, lsn, _)| lsn) - .max() - .copied() - .unwrap() - + 1; + let mut merge_iter = MergeIterator::create(&delta_layers, &image_layers, ctx); // Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas. // Data of the same key. let mut accumulated_values = Vec::new(); - let mut last_key = all_key_values.first().unwrap().0; // TODO: assert all_key_values not empty + let mut last_key: Option = None; - /// Take a list of images and deltas, produce an image at the GC horizon, and a list of deltas above the GC horizon. 
- async fn flush_accumulated_states( + enum FlushDeltaResult { + /// Create a new resident layer + CreateResidentLayer(ResidentLayer), + /// Keep an original delta layer + KeepLayer(PersistentLayerKey), + } + + #[allow(clippy::too_many_arguments)] + async fn flush_deltas( + deltas: &mut Vec<(Key, Lsn, crate::repository::Value)>, + last_key: Key, + delta_split_points: &[Key], + current_delta_split_point: &mut usize, tline: &Arc, - key: Key, - accumulated_values: &[&(Key, Lsn, crate::repository::Value)], - horizon: Lsn, - ) -> anyhow::Result<(Vec<(Key, Lsn, crate::repository::Value)>, bytes::Bytes)> { - let mut base_image = None; - let mut keys_above_horizon = Vec::new(); - let mut delta_above_base_image = Vec::new(); - // We have a list of deltas/images. We want to create image layers while collect garbages. - for (key, lsn, val) in accumulated_values.iter().rev() { - if *lsn > horizon { - keys_above_horizon.push((*key, *lsn, val.clone())); // TODO: ensure one LSN corresponds to either delta or image instead of both - } else if *lsn <= horizon { - match val { - crate::repository::Value::Image(image) => { - if lsn <= &horizon { - base_image = Some((*lsn, image.clone())); - break; - } - } - crate::repository::Value::WalRecord(wal) => { - delta_above_base_image.push((*lsn, wal.clone())); - } + lowest_retain_lsn: Lsn, + ctx: &RequestContext, + stats: &mut CompactionStatistics, + dry_run: bool, + last_batch: bool, + ) -> anyhow::Result> { + // Check if we need to split the delta layer. We split at the original delta layer boundary to avoid + // overlapping layers. + // + // If we have a structure like this: + // + // | Delta 1 | | Delta 4 | + // |---------| Delta 2 |---------| + // | Delta 3 | | Delta 5 | + // + // And we choose to compact delta 2+3+5. We will get an overlapping delta layer with delta 1+4. + // A simple solution here is to split the delta layers using the original boundary, while this + // might produce a lot of small layers. 
This should be improved and fixed in the future. + let mut need_split = false; + while *current_delta_split_point < delta_split_points.len() + && last_key >= delta_split_points[*current_delta_split_point] + { + *current_delta_split_point += 1; + need_split = true; + } + if !need_split && !last_batch { + return Ok(None); + } + let deltas: Vec<(Key, Lsn, Value)> = std::mem::take(deltas); + if deltas.is_empty() { + return Ok(None); + } + let end_lsn = deltas.iter().map(|(_, lsn, _)| lsn).max().copied().unwrap() + 1; + let delta_key = PersistentLayerKey { + key_range: { + let key_start = deltas.first().unwrap().0; + let key_end = deltas.last().unwrap().0.next(); + key_start..key_end + }, + lsn_range: lowest_retain_lsn..end_lsn, + is_delta: true, + }; + { + // Hack: skip delta layer if we need to produce a layer of a same key-lsn. + // + // This can happen if we have removed some deltas in "the middle" of some existing layer's key-lsn-range. + // For example, consider the case where a single delta with range [0x10,0x50) exists. + // And we have branches at LSN 0x10, 0x20, 0x30. + // Then we delete branch @ 0x20. + // Bottom-most compaction may now delete the delta [0x20,0x30). + // And that wouldnt' change the shape of the layer. + // + // Note that bottom-most-gc-compaction never _adds_ new data in that case, only removes. + // That's why it's safe to skip. + let guard = tline.layers.read().await; + + if guard.contains_key(&delta_key) { + let layer_generation = guard.get_from_key(&delta_key).metadata().generation; + drop(guard); + if layer_generation == tline.generation { + stats.discard_delta_layer(); + // TODO: depending on whether we design this compaction process to run along with + // other compactions, there could be layer map modifications after we drop the + // layer guard, and in case it creates duplicated layer key, we will still error + // in the end. 
+ info!( + key=%delta_key, + ?layer_generation, + "discard delta layer due to duplicated layer in the same generation" + ); + return Ok(Some(FlushDeltaResult::KeepLayer(delta_key))); } } } - delta_above_base_image.reverse(); - keys_above_horizon.reverse(); - let state = ValueReconstructState { - img: base_image, - records: delta_above_base_image, - }; - let img = tline.reconstruct_value(key, horizon, state).await?; - Ok((keys_above_horizon, img)) + + let mut delta_layer_writer = DeltaLayerWriter::new( + tline.conf, + tline.timeline_id, + tline.tenant_shard_id, + delta_key.key_range.start, + lowest_retain_lsn..end_lsn, + ctx, + ) + .await?; + for (key, lsn, val) in deltas { + delta_layer_writer.put_value(key, lsn, val, ctx).await?; + } + + stats.produce_delta_layer(delta_layer_writer.size()); + if dry_run { + return Ok(None); + } + + let (desc, path) = delta_layer_writer + .finish(delta_key.key_range.end, ctx) + .await?; + let delta_layer = Layer::finish_creating(tline.conf, tline, desc, &path)?; + Ok(Some(FlushDeltaResult::CreateResidentLayer(delta_layer))) } - let mut delta_layer_writer = DeltaLayerWriter::new( - self.conf, - self.timeline_id, - self.tenant_shard_id, - all_key_values.first().unwrap().0, - gc_cutoff..max_lsn, // TODO: off by one? - ctx, - ) - .await?; - let mut image_layer_writer = ImageLayerWriter::new( - self.conf, - self.timeline_id, - self.tenant_shard_id, - &(all_key_values.first().unwrap().0..all_key_values.last().unwrap().0.next()), - gc_cutoff, - ctx, - ) - .await?; + // Hack the key range to be min..(max-1). Otherwise, the image layer will be + // interpreted as an L0 delta layer. 
+ let hack_image_layer_range = { + let mut end_key = Key::MAX; + end_key.field6 -= 1; + Key::MIN..end_key + }; - for item @ (key, _, _) in &all_key_values { - if &last_key == key { - accumulated_values.push(item); - } else { - let (deltas, image) = - flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff) - .await?; - image_layer_writer.put_image(last_key, image, ctx).await?; - for (key, lsn, val) in deltas { - delta_layer_writer.put_value(key, lsn, val, ctx).await?; + // Only create image layers when there is no ancestor branches. TODO: create covering image layer + // when some condition meet. + let mut image_layer_writer = if self.ancestor_timeline.is_none() { + Some( + ImageLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + &hack_image_layer_range, // covers the full key range + lowest_retain_lsn, + ctx, + ) + .await?, + ) + } else { + None + }; + + /// Returns None if there is no ancestor branch. Throw an error when the key is not found. + /// + /// Currently, we always get the ancestor image for each key in the child branch no matter whether the image + /// is needed for reconstruction. This should be fixed in the future. + /// + /// Furthermore, we should do vectored get instead of a single get, or better, use k-merge for ancestor + /// images. + async fn get_ancestor_image( + tline: &Arc, + key: Key, + ctx: &RequestContext, + ) -> anyhow::Result> { + if tline.ancestor_timeline.is_none() { + return Ok(None); + }; + // This function is implemented as a get of the current timeline at ancestor LSN, therefore reusing + // as much existing code as possible. 
+ let img = tline.get(key, tline.ancestor_lsn, ctx).await?; + Ok(Some((key, tline.ancestor_lsn, img))) + } + let image_layer_key = PersistentLayerKey { + key_range: hack_image_layer_range, + lsn_range: PersistentLayerDesc::image_layer_lsn_range(lowest_retain_lsn), + is_delta: false, + }; + + // Like with delta layers, it can happen that we re-produce an already existing image layer. + // This could happen when a user triggers force compaction and image generation. In this case, + // it's always safe to rewrite the layer. + let discard_image_layer = { + let guard = self.layers.read().await; + if guard.contains_key(&image_layer_key) { + let layer_generation = guard.get_from_key(&image_layer_key).metadata().generation; + drop(guard); + if layer_generation == self.generation { + // TODO: depending on whether we design this compaction process to run along with + // other compactions, there could be layer map modifications after we drop the + // layer guard, and in case it creates duplicated layer key, we will still error + // in the end. + info!( + key=%image_layer_key, + ?layer_generation, + "discard image layer due to duplicated layer key in the same generation", + ); + true + } else { + false } + } else { + false + } + }; + + // Actually, we can decide not to write to the image layer at all at this point because + // the key and LSN range are determined. However, to keep things simple here, we still + // create this writer, and discard the writer in the end. + + let mut delta_values = Vec::new(); + let delta_split_points = delta_split_points.into_iter().collect_vec(); + let mut current_delta_split_point = 0; + let mut delta_layers = Vec::new(); + while let Some((key, lsn, val)) = merge_iter.next().await? 
{ + if cancel.is_cancelled() { + return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error + } + match val { + Value::Image(_) => stat.visit_image_key(&val), + Value::WalRecord(_) => stat.visit_wal_key(&val), + } + if last_key.is_none() || last_key.as_ref() == Some(&key) { + if last_key.is_none() { + last_key = Some(key); + } + accumulated_values.push((key, lsn, val)); + } else { + let last_key = last_key.as_mut().unwrap(); + stat.on_unique_key_visited(); + let retention = self + .generate_key_retention( + *last_key, + &accumulated_values, + gc_cutoff, + &retain_lsns_below_horizon, + COMPACTION_DELTA_THRESHOLD, + get_ancestor_image(self, *last_key, ctx).await?, + ) + .await?; + // Put the image into the image layer. Currently we have a single big layer for the compaction. + retention + .pipe_to( + *last_key, + &mut delta_values, + image_layer_writer.as_mut(), + &mut stat, + ctx, + ) + .await?; + delta_layers.extend( + flush_deltas( + &mut delta_values, + *last_key, + &delta_split_points, + &mut current_delta_split_point, + self, + lowest_retain_lsn, + ctx, + &mut stat, + dry_run, + false, + ) + .await?, + ); accumulated_values.clear(); - accumulated_values.push(item); - last_key = *key; + *last_key = key; + accumulated_values.push((key, lsn, val)); } } - let (deltas, image) = - flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff).await?; - image_layer_writer.put_image(last_key, image, ctx).await?; - for (key, lsn, val) in deltas { - delta_layer_writer.put_value(key, lsn, val, ctx).await?; + + let last_key = last_key.expect("no keys produced during compaction"); + // TODO: move this part to the loop body + stat.on_unique_key_visited(); + let retention = self + .generate_key_retention( + last_key, + &accumulated_values, + gc_cutoff, + &retain_lsns_below_horizon, + COMPACTION_DELTA_THRESHOLD, + get_ancestor_image(self, last_key, ctx).await?, + ) + .await?; + // Put the image into the image layer. 
Currently we have a single big layer for the compaction. + retention + .pipe_to( + last_key, + &mut delta_values, + image_layer_writer.as_mut(), + &mut stat, + ctx, + ) + .await?; + delta_layers.extend( + flush_deltas( + &mut delta_values, + last_key, + &delta_split_points, + &mut current_delta_split_point, + self, + lowest_retain_lsn, + ctx, + &mut stat, + dry_run, + true, + ) + .await?, + ); + assert!(delta_values.is_empty(), "unprocessed keys"); + + let image_layer = if discard_image_layer { + stat.discard_image_layer(); + None + } else if let Some(writer) = image_layer_writer { + stat.produce_image_layer(writer.size()); + if !dry_run { + Some(writer.finish(self, ctx).await?) + } else { + None + } + } else { + None + }; + + info!( + "gc-compaction statistics: {}", + serde_json::to_string(&stat)? + ); + + if dry_run { + return Ok(()); } - accumulated_values.clear(); - // TODO: split layers - let delta_layer = delta_layer_writer.finish(last_key, self, ctx).await?; - let image_layer = image_layer_writer.finish(self, ctx).await?; + + info!( + "produced {} delta layers and {} image layers", + delta_layers.len(), + if image_layer.is_some() { 1 } else { 0 } + ); + let mut compact_to = Vec::new(); + let mut keep_layers = HashSet::new(); + for action in delta_layers { + match action { + FlushDeltaResult::CreateResidentLayer(layer) => { + compact_to.push(layer); + } + FlushDeltaResult::KeepLayer(l) => { + keep_layers.insert(l); + } + } + } + if discard_image_layer { + keep_layers.insert(image_layer_key); + } + let mut layer_selection = layer_selection; + layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key())); + compact_to.extend(image_layer); + // Step 3: Place back to the layer map. { let mut guard = self.layers.write().await; - guard.finish_gc_compaction( - &layer_selection, - &[delta_layer.clone(), image_layer.clone()], - &self.metrics, - ) + guard + .open_mut()? 
+ .finish_gc_compaction(&layer_selection, &compact_to, &self.metrics) }; + self.remote_client + .schedule_compaction_update(&layer_selection, &compact_to)?; + + drop(gc_lock); + Ok(()) } } @@ -1147,7 +2266,7 @@ impl TimelineAdaptor { } } - pub async fn flush_updates(&mut self) -> anyhow::Result<()> { + pub async fn flush_updates(&mut self) -> Result<(), CompactionError> { let layers_to_delete = { let guard = self.timeline.layers.read().await; self.layers_to_delete @@ -1195,7 +2314,7 @@ impl CompactionJobExecutor for TimelineAdaptor { self.flush_updates().await?; let guard = self.timeline.layers.read().await; - let layer_map = guard.layer_map(); + let layer_map = guard.layer_map()?; let result = layer_map .iter_historic_layers() @@ -1219,7 +2338,7 @@ impl CompactionJobExecutor for TimelineAdaptor { key_range, )) } else { - // The current compaction implementatin only ever requests the key space + // The current compaction implementation only ever requests the key space // at the compaction end LSN. 
anyhow::bail!("keyspace not available for requested lsn"); } @@ -1318,9 +2437,9 @@ impl CompactionJobExecutor for TimelineAdaptor { )) }); - let new_delta_layer = writer - .finish(prev.unwrap().0.next(), &self.timeline, ctx) - .await?; + let (desc, path) = writer.finish(prev.unwrap().0.next(), ctx).await?; + let new_delta_layer = + Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?; self.new_deltas.push(new_delta_layer); Ok(()) diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 441298f3e9..dc4118bb4a 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -63,10 +63,19 @@ pub(super) async fn delete_local_timeline_directory( tenant_shard_id: TenantShardId, timeline: &Timeline, ) -> anyhow::Result<()> { - let guards = async { tokio::join!(timeline.gc_lock.lock(), timeline.compaction_lock.lock()) }; - let guards = crate::timed( - guards, - "acquire gc and compaction locks", + // Always ensure the lock order is compaction -> gc. + let compaction_lock = timeline.compaction_lock.lock(); + let compaction_lock = crate::timed( + compaction_lock, + "acquires compaction lock", + std::time::Duration::from_secs(5), + ) + .await; + + let gc_lock = timeline.gc_lock.lock(); + let gc_lock = crate::timed( + gc_lock, + "acquires gc lock", std::time::Duration::from_secs(5), ) .await; @@ -107,7 +116,8 @@ pub(super) async fn delete_local_timeline_directory( .context("fsync_pre_mark_remove")?; info!("finished deleting layer files, releasing locks"); - drop(guards); + drop(gc_lock); + drop(compaction_lock); fail::fail_point!("timeline-delete-after-rm", |_| { Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))? 
@@ -148,14 +158,14 @@ async fn cleanup_remaining_timeline_fs_traces( /// For more context see comments in [`DeleteTimelineFlow::prepare`] async fn remove_timeline_from_tenant( tenant: &Tenant, - timeline_id: TimelineId, + timeline: &Timeline, _: &DeletionGuard, // using it as a witness ) -> anyhow::Result<()> { // Remove the timeline from the map. let mut timelines = tenant.timelines.lock().unwrap(); let children_exist = timelines .iter() - .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id)); + .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline.timeline_id)); // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`. // We already deleted the layer files, so it's probably best to panic. // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart) @@ -164,7 +174,7 @@ async fn remove_timeline_from_tenant( } timelines - .remove(&timeline_id) + .remove(&timeline.timeline_id) .expect("timeline that we were deleting was concurrently removed from 'timelines' map"); drop(timelines); @@ -182,13 +192,15 @@ async fn remove_timeline_from_tenant( /// 5. Delete index part /// 6. Delete meta, timeline directory /// 7. Delete mark file +/// /// It is resumable from any step in case a crash/restart occurs. /// There are three entrypoints to the process: /// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler. /// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present -/// and we possibly neeed to continue deletion of remote files. +/// and we possibly neeed to continue deletion of remote files. /// 3. [`DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`] is used when we deleted remote -/// index but still have local metadata, timeline directory and delete mark. +/// index but still have local metadata, timeline directory and delete mark. 
+/// /// Note the only other place that messes around timeline delete mark is the logic that scans directory with timelines during tenant load. #[derive(Default)] pub enum DeleteTimelineFlow { @@ -204,11 +216,10 @@ impl DeleteTimelineFlow { // NB: If this fails half-way through, and is retried, the retry will go through // all the same steps again. Make sure the code here is idempotent, and don't // error out if some of the shutdown tasks have already been completed! - #[instrument(skip_all, fields(%inplace))] + #[instrument(skip_all)] pub async fn run( tenant: &Arc, timeline_id: TimelineId, - inplace: bool, ) -> Result<(), DeleteTimelineError> { super::debug_assert_current_span_has_tenant_and_timeline_id(); @@ -219,6 +230,8 @@ impl DeleteTimelineFlow { // Now that the Timeline is in Stopping state, request all the related tasks to shut down. timeline.shutdown(super::ShutdownMode::Hard).await; + tenant.gc_block.before_delete(&timeline); + fail::fail_point!("timeline-delete-before-index-deleted-at", |_| { Err(anyhow::anyhow!( "failpoint: timeline-delete-before-index-deleted-at" @@ -233,11 +246,7 @@ impl DeleteTimelineFlow { ))? }); - if inplace { - Self::background(guard, tenant.conf, tenant, &timeline).await? - } else { - Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline); - } + Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline); Ok(()) } @@ -255,7 +264,6 @@ impl DeleteTimelineFlow { } /// Shortcut to create Timeline in stopping state and spawn deletion task. - /// See corresponding parts of [`crate::tenant::delete::DeleteTenantFlow`] #[instrument(skip_all, fields(%timeline_id))] pub async fn resume_deletion( tenant: Arc, @@ -273,6 +281,7 @@ impl DeleteTimelineFlow { TimelineResources { remote_client, timeline_get_throttle: tenant.timeline_get_throttle.clone(), + l0_flush_global_state: tenant.l0_flush_global_state.clone(), }, // Important. We dont pass ancestor above because it can be missing. 
// Thus we need to skip the validation here. @@ -386,10 +395,9 @@ impl DeleteTimelineFlow { task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), TaskKind::TimelineDeletionWorker, - Some(tenant_shard_id), + tenant_shard_id, Some(timeline_id), "timeline_delete", - false, async move { if let Err(err) = Self::background(guard, conf, &tenant, &timeline).await { error!("Error: {err:#}"); @@ -413,17 +421,13 @@ impl DeleteTimelineFlow { pausable_failpoint!("in_progress_delete"); - remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?; + remove_timeline_from_tenant(tenant, timeline, &guard).await?; *guard = Self::Finished; Ok(()) } - pub(crate) fn is_finished(&self) -> bool { - matches!(self, Self::Finished) - } - pub(crate) fn is_not_started(&self) -> bool { matches!(self, Self::NotStarted) } diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 4fc89330ba..641faada25 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -1,15 +1,19 @@ -use std::sync::Arc; +use std::{collections::HashSet, sync::Arc}; use super::{layer_manager::LayerManager, FlushLayerError, Timeline}; use crate::{ context::{DownloadBehavior, RequestContext}, task_mgr::TaskKind, tenant::{ + remote_timeline_client::index::GcBlockingReason::DetachAncestor, storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer}, Tenant, }, virtual_file::{MaybeFatalIo, VirtualFile}, }; +use anyhow::Context; +use pageserver_api::models::detach_ancestor::AncestorDetached; +use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; use tracing::Instrument; use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn}; @@ -18,50 +22,96 @@ use utils::{completion, generation::Generation, http::error::ApiError, id::Timel pub(crate) enum Error { #[error("no ancestors")] NoAncestor, + #[error("too many ancestors")] 
TooManyAncestors, + #[error("shutting down, please retry later")] ShuttingDown, - #[error("flushing failed")] - FlushAncestor(#[source] FlushLayerError), - #[error("layer download failed")] - RewrittenDeltaDownloadFailed(#[source] anyhow::Error), - #[error("copying LSN prefix locally failed")] - CopyDeltaPrefix(#[source] anyhow::Error), - #[error("upload rewritten layer")] - UploadRewritten(#[source] anyhow::Error), + + #[error(transparent)] + NotFound(crate::tenant::GetTimelineError), + + #[error("failed to reparent all candidate timelines, please retry")] + FailedToReparentAll, #[error("ancestor is already being detached by: {}", .0)] OtherTimelineDetachOngoing(TimelineId), - #[error("remote copying layer failed")] - CopyFailed(#[source] anyhow::Error), + #[error("preparing to timeline ancestor detach failed")] + Prepare(#[source] anyhow::Error), - #[error("unexpected error")] - Unexpected(#[source] anyhow::Error), + #[error("detaching and reparenting failed")] + DetachReparent(#[source] anyhow::Error), + + #[error("completing ancestor detach failed")] + Complete(#[source] anyhow::Error), + + #[error("failpoint: {}", .0)] + Failpoint(&'static str), +} + +impl Error { + /// Try to catch cancellation from within the `anyhow::Error`, or wrap the anyhow as the given + /// variant or fancier `or_else`. + fn launder(e: anyhow::Error, or_else: F) -> Error + where + F: Fn(anyhow::Error) -> Error, + { + use crate::tenant::remote_timeline_client::WaitCompletionError; + use crate::tenant::upload_queue::NotInitialized; + use remote_storage::TimeoutOrCancel; + + if e.is::() + || TimeoutOrCancel::caused_by_cancel(&e) + || e.downcast_ref::() + .is_some_and(|e| e.is_cancelled()) + || e.is::() + { + Error::ShuttingDown + } else { + or_else(e) + } + } } impl From for ApiError { fn from(value: Error) -> Self { match value { - e @ Error::NoAncestor => ApiError::Conflict(e.to_string()), - // TODO: ApiError converts the anyhow using debug formatting ... just stop using ApiError? 
- e @ Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{}", e)), + Error::NoAncestor => ApiError::Conflict(value.to_string()), + Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{}", value)), Error::ShuttingDown => ApiError::ShuttingDown, - Error::OtherTimelineDetachOngoing(_) => { - ApiError::ResourceUnavailable("other timeline detach is already ongoing".into()) + Error::OtherTimelineDetachOngoing(_) | Error::FailedToReparentAll => { + ApiError::ResourceUnavailable(value.to_string().into()) } - // All of these contain shutdown errors, in fact, it's the most common - e @ Error::FlushAncestor(_) - | e @ Error::RewrittenDeltaDownloadFailed(_) - | e @ Error::CopyDeltaPrefix(_) - | e @ Error::UploadRewritten(_) - | e @ Error::CopyFailed(_) - | e @ Error::Unexpected(_) => ApiError::InternalServerError(e.into()), + Error::NotFound(e) => ApiError::from(e), + // these variants should have no cancellation errors because of Error::launder + Error::Prepare(_) + | Error::DetachReparent(_) + | Error::Complete(_) + | Error::Failpoint(_) => ApiError::InternalServerError(value.into()), } } } +impl From for Error { + fn from(_: crate::tenant::upload_queue::NotInitialized) -> Self { + // treat all as shutting down signals, even though that is not entirely correct + // (uninitialized state) + Error::ShuttingDown + } +} +impl From for Error { + fn from(_: super::layer_manager::Shutdown) -> Self { + Error::ShuttingDown + } +} + +pub(crate) enum Progress { + Prepared(Attempt, PreparedTimelineDetach), + Done(AncestorDetached), +} + pub(crate) struct PreparedTimelineDetach { layers: Vec, } @@ -82,13 +132,33 @@ impl Default for Options { } } +/// Represents an across tenant reset exclusive single attempt to detach ancestor. 
+#[derive(Debug)] +pub(crate) struct Attempt { + pub(crate) timeline_id: TimelineId, + + _guard: completion::Completion, + gate_entered: Option, +} + +impl Attempt { + pub(crate) fn before_reset_tenant(&mut self) { + let taken = self.gate_entered.take(); + assert!(taken.is_some()); + } + + pub(crate) fn new_barrier(&self) -> completion::Barrier { + self._guard.barrier() + } +} + /// See [`Timeline::prepare_to_detach_from_ancestor`] pub(super) async fn prepare( detached: &Arc, tenant: &Tenant, options: Options, ctx: &RequestContext, -) -> Result<(completion::Completion, PreparedTimelineDetach), Error> { +) -> Result { use Error::*; let Some((ancestor, ancestor_lsn)) = detached @@ -96,15 +166,44 @@ pub(super) async fn prepare( .as_ref() .map(|tl| (tl.clone(), detached.ancestor_lsn)) else { - // TODO: check if we have already been detached; for this we need to read the stored data - // on remote client, for that we need a follow-up which makes uploads cheaper and maintains - // a projection of the commited data. - // - // the error is wrong per openapi - return Err(NoAncestor); + let still_in_progress = { + let accessor = detached.remote_client.initialized_upload_queue()?; + + // we are safe to inspect the latest uploaded, because we can only witness this after + // restart is complete and ancestor is no more. + let latest = accessor.latest_uploaded_index_part(); + if latest.lineage.detached_previous_ancestor().is_none() { + return Err(NoAncestor); + }; + + latest + .gc_blocking + .as_ref() + .is_some_and(|b| b.blocked_by(DetachAncestor)) + }; + + if still_in_progress { + // gc is still blocked, we can still reparent and complete. + // we are safe to reparent remaining, because they were locked in in the beginning. + let attempt = continue_with_blocked_gc(detached, tenant).await?; + + // because the ancestor of detached is already set to none, we have published all + // of the layers, so we are still "prepared." 
+ return Ok(Progress::Prepared( + attempt, + PreparedTimelineDetach { layers: Vec::new() }, + )); + } + + let reparented_timelines = reparented_direct_children(detached, tenant)?; + return Ok(Progress::Done(AncestorDetached { + reparented_timelines, + })); }; if !ancestor_lsn.is_valid() { + // rare case, probably wouldn't even load + tracing::error!("ancestor is set, but ancestor_lsn is invalid, this timeline needs fixing"); return Err(NoAncestor); } @@ -114,22 +213,16 @@ pub(super) async fn prepare( return Err(TooManyAncestors); } - // before we acquire the gate, we must mark the ancestor as having a detach operation - // ongoing which will block other concurrent detach operations so we don't get to ackward - // situations where there would be two branches trying to reparent earlier branches. - let (guard, barrier) = completion::channel(); + let attempt = start_new_attempt(detached, tenant).await?; - { - let mut guard = tenant.ongoing_timeline_detach.lock().unwrap(); - if let Some((tl, other)) = guard.as_ref() { - if !other.is_ready() { - return Err(OtherTimelineDetachOngoing(*tl)); - } - } - *guard = Some((detached.timeline_id, barrier)); - } + utils::pausable_failpoint!("timeline-detach-ancestor::before_starting_after_locking-pausable"); - let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?; + fail::fail_point!( + "timeline-detach-ancestor::before_starting_after_locking", + |_| Err(Error::Failpoint( + "timeline-detach-ancestor::before_starting_after_locking" + )) + ); if ancestor_lsn >= ancestor.get_disk_consistent_lsn() { let span = @@ -151,7 +244,17 @@ pub(super) async fn prepare( } }; - res.map_err(FlushAncestor)?; + res.map_err(|e| { + use FlushLayerError::*; + match e { + Cancelled | NotRunning(_) => { + // FIXME(#6424): technically statically unreachable right now, given how we never + // drop the sender + Error::ShuttingDown + } + CreateImageLayersError(_) | Other(_) => Error::Prepare(e.into()), + } + })?; // we do not need to wait for 
uploads to complete but we do need `struct Layer`, // copying delta prefix is unsupported currently for `InMemoryLayer`. @@ -159,7 +262,7 @@ pub(super) async fn prepare( elapsed_ms = started_at.elapsed().as_millis(), "froze and flushed the ancestor" ); - Ok(()) + Ok::<_, Error>(()) } .instrument(span) .await?; @@ -182,11 +285,12 @@ pub(super) async fn prepare( // between retries, these can change if compaction or gc ran in between. this will mean // we have to redo work. - partition_work(ancestor_lsn, &layers) + partition_work(ancestor_lsn, &layers)? }; // TODO: layers are already sorted by something: use that to determine how much of remote - // copies are already done. + // copies are already done -- gc is blocked, but a compaction could had happened on ancestor, + // which is something to keep in mind if copy skipping is implemented. tracing::info!(filtered=%filtered_layers, to_rewrite = straddling_branchpoint.len(), historic=%rest_of_historic.len(), "collected layers"); // TODO: copying and lsn prefix copying could be done at the same time with a single fsync after @@ -200,34 +304,38 @@ pub(super) async fn prepare( let mut wrote_any = false; - let limiter = Arc::new(tokio::sync::Semaphore::new( - options.rewrite_concurrency.get(), - )); + let limiter = Arc::new(Semaphore::new(options.rewrite_concurrency.get())); for layer in straddling_branchpoint { let limiter = limiter.clone(); let timeline = detached.clone(); let ctx = ctx.detached_child(TaskKind::DetachAncestor, DownloadBehavior::Download); - tasks.spawn(async move { - let _permit = limiter.acquire().await; - let copied = - upload_rewritten_layer(end_lsn, &layer, &timeline, &timeline.cancel, &ctx) - .await?; - Ok(copied) - }); + let span = tracing::info_span!("upload_rewritten_layer", %layer); + tasks.spawn( + async move { + let _permit = limiter.acquire().await; + let copied = + upload_rewritten_layer(end_lsn, &layer, &timeline, &timeline.cancel, &ctx) + .await?; + if let Some(copied) = copied.as_ref() { + 
tracing::info!(%copied, "rewrote and uploaded"); + } + Ok(copied) + } + .instrument(span), + ); } while let Some(res) = tasks.join_next().await { match res { Ok(Ok(Some(copied))) => { wrote_any = true; - tracing::info!(layer=%copied, "rewrote and uploaded"); new_layers.push(copied); } Ok(Ok(None)) => {} Ok(Err(e)) => return Err(e), - Err(je) => return Err(Unexpected(je.into())), + Err(je) => return Err(Error::Prepare(je.into())), } } @@ -249,7 +357,7 @@ pub(super) async fn prepare( } let mut tasks = tokio::task::JoinSet::new(); - let limiter = Arc::new(tokio::sync::Semaphore::new(options.copy_concurrency.get())); + let limiter = Arc::new(Semaphore::new(options.copy_concurrency.get())); for adopted in rest_of_historic { let limiter = limiter.clone(); @@ -275,7 +383,7 @@ pub(super) async fn prepare( Ok(Err(failed)) => { return Err(failed); } - Err(je) => return Err(Unexpected(je.into())), + Err(je) => return Err(Error::Prepare(je.into())), } } @@ -283,19 +391,118 @@ pub(super) async fn prepare( let prepared = PreparedTimelineDetach { layers: new_layers }; - Ok((guard, prepared)) + Ok(Progress::Prepared(attempt, prepared)) +} + +async fn start_new_attempt(detached: &Timeline, tenant: &Tenant) -> Result { + let attempt = obtain_exclusive_attempt(detached, tenant)?; + + // insert the block in the index_part.json, if not already there. + let _dont_care = tenant + .gc_block + .insert( + detached, + crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor, + ) + .await + .map_err(|e| Error::launder(e, Error::Prepare))?; + + Ok(attempt) +} + +async fn continue_with_blocked_gc(detached: &Timeline, tenant: &Tenant) -> Result { + // FIXME: it would be nice to confirm that there is an in-memory version, since we've just + // verified there is a persistent one? 
+ obtain_exclusive_attempt(detached, tenant) +} + +fn obtain_exclusive_attempt(detached: &Timeline, tenant: &Tenant) -> Result { + use Error::{OtherTimelineDetachOngoing, ShuttingDown}; + + // ensure we are the only active attempt for this tenant + let (guard, barrier) = completion::channel(); + { + let mut guard = tenant.ongoing_timeline_detach.lock().unwrap(); + if let Some((tl, other)) = guard.as_ref() { + if !other.is_ready() { + return Err(OtherTimelineDetachOngoing(*tl)); + } + // FIXME: no test enters here + } + *guard = Some((detached.timeline_id, barrier)); + } + + // ensure the gate is still open + let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?; + + Ok(Attempt { + timeline_id: detached.timeline_id, + _guard: guard, + gate_entered: Some(_gate_entered), + }) +} + +fn reparented_direct_children( + detached: &Arc, + tenant: &Tenant, +) -> Result, Error> { + let mut all_direct_children = tenant + .timelines + .lock() + .unwrap() + .values() + .filter_map(|tl| { + let is_direct_child = matches!(tl.ancestor_timeline.as_ref(), Some(ancestor) if Arc::ptr_eq(ancestor, detached)); + + if is_direct_child { + Some(tl.clone()) + } else { + if let Some(timeline) = tl.ancestor_timeline.as_ref() { + assert_ne!(timeline.timeline_id, detached.timeline_id, "we cannot have two timelines with the same timeline_id live"); + } + None + } + }) + // Collect to avoid lock taking order problem with Tenant::timelines and + // Timeline::remote_client + .collect::>(); + + let mut any_shutdown = false; + + all_direct_children.retain(|tl| match tl.remote_client.initialized_upload_queue() { + Ok(accessor) => accessor + .latest_uploaded_index_part() + .lineage + .is_reparented(), + Err(_shutdownalike) => { + // not 100% a shutdown, but let's bail early not to give inconsistent results in + // sharded enviroment. 
+ any_shutdown = true; + true + } + }); + + if any_shutdown { + // it could be one or many being deleted; have client retry + return Err(Error::ShuttingDown); + } + + Ok(all_direct_children + .into_iter() + .map(|tl| tl.timeline_id) + .collect()) } fn partition_work( ancestor_lsn: Lsn, - source_layermap: &LayerManager, -) -> (usize, Vec, Vec) { + source: &LayerManager, +) -> Result<(usize, Vec, Vec), Error> { let mut straddling_branchpoint = vec![]; let mut rest_of_historic = vec![]; let mut later_by_lsn = 0; - for desc in source_layermap.layer_map().iter_historic_layers() { + for desc in source.layer_map()?.iter_historic_layers() { // off by one chances here: // - start is inclusive // - end is exclusive @@ -314,10 +521,10 @@ fn partition_work( &mut rest_of_historic }; - target.push(source_layermap.get_from_desc(&desc)); + target.push(source.get_from_desc(&desc)); } - (later_by_lsn, straddling_branchpoint, rest_of_historic) + Ok((later_by_lsn, straddling_branchpoint, rest_of_historic)) } async fn upload_rewritten_layer( @@ -327,19 +534,17 @@ async fn upload_rewritten_layer( cancel: &CancellationToken, ctx: &RequestContext, ) -> Result, Error> { - use Error::UploadRewritten; let copied = copy_lsn_prefix(end_lsn, layer, target, ctx).await?; let Some(copied) = copied else { return Ok(None); }; - // FIXME: better shuttingdown error target .remote_client .upload_layer_file(&copied, cancel) .await - .map_err(UploadRewritten)?; + .map_err(|e| Error::launder(e, Error::Prepare))?; Ok(Some(copied.into())) } @@ -350,7 +555,9 @@ async fn copy_lsn_prefix( target_timeline: &Arc, ctx: &RequestContext, ) -> Result, Error> { - use Error::{CopyDeltaPrefix, RewrittenDeltaDownloadFailed}; + if target_timeline.cancel.is_cancelled() { + return Err(Error::ShuttingDown); + } tracing::debug!(%layer, %end_lsn, "copying lsn prefix"); @@ -363,18 +570,22 @@ async fn copy_lsn_prefix( ctx, ) .await - .map_err(CopyDeltaPrefix)?; + .with_context(|| format!("prepare to copy lsn prefix of ancestors 
{layer}")) + .map_err(Error::Prepare)?; - let resident = layer - .download_and_keep_resident() - .await - // likely shutdown - .map_err(RewrittenDeltaDownloadFailed)?; + let resident = layer.download_and_keep_resident().await.map_err(|e| { + if e.is_cancelled() { + Error::ShuttingDown + } else { + Error::Prepare(e.into()) + } + })?; let records = resident .copy_delta_prefix(&mut writer, end_lsn, ctx) .await - .map_err(CopyDeltaPrefix)?; + .with_context(|| format!("copy lsn prefix of ancestors {layer}")) + .map_err(Error::Prepare)?; drop(resident); @@ -389,10 +600,12 @@ async fn copy_lsn_prefix( // reuse the key instead of adding more holes between layers by using the real // highest key in the layer. let reused_highest_key = layer.layer_desc().key_range.end; - let copied = writer - .finish(reused_highest_key, target_timeline, ctx) + let (desc, path) = writer + .finish(reused_highest_key, ctx) .await - .map_err(CopyDeltaPrefix)?; + .map_err(Error::Prepare)?; + let copied = Layer::finish_creating(target_timeline.conf, target_timeline, desc, &path) + .map_err(Error::Prepare)?; tracing::debug!(%layer, %copied, "new layer produced"); @@ -408,8 +621,6 @@ async fn remote_copy( generation: Generation, cancel: &CancellationToken, ) -> Result { - use Error::CopyFailed; - // depending if Layer::keep_resident we could hardlink let mut metadata = adopted.metadata(); @@ -423,133 +634,319 @@ async fn remote_copy( metadata, ); - // FIXME: better shuttingdown error adoptee .remote_client .copy_timeline_layer(adopted, &owned, cancel) .await .map(move |()| owned) - .map_err(CopyFailed) + .map_err(|e| Error::launder(e, Error::Prepare)) } -/// See [`Timeline::complete_detaching_timeline_ancestor`]. -pub(super) async fn complete( +pub(crate) enum DetachingAndReparenting { + /// All of the following timeline ids were reparented and the timeline ancestor detach must be + /// marked as completed. + Reparented(HashSet), + + /// Some of the reparentings failed. 
The timeline ancestor detach must **not** be marked as + /// completed. + /// + /// Nested `must_reset_tenant` is set to true when any restart requiring changes were made. + SomeReparentingFailed { must_reset_tenant: bool }, + + /// Detaching and reparentings were completed in a previous attempt. Timeline ancestor detach + /// must be marked as completed. + AlreadyDone(HashSet), +} + +impl DetachingAndReparenting { + pub(crate) fn reset_tenant_required(&self) -> bool { + use DetachingAndReparenting::*; + match self { + Reparented(_) => true, + SomeReparentingFailed { must_reset_tenant } => *must_reset_tenant, + AlreadyDone(_) => false, + } + } + + pub(crate) fn completed(self) -> Option> { + use DetachingAndReparenting::*; + match self { + Reparented(x) | AlreadyDone(x) => Some(x), + SomeReparentingFailed { .. } => None, + } + } +} + +/// See [`Timeline::detach_from_ancestor_and_reparent`]. +pub(super) async fn detach_and_reparent( detached: &Arc, tenant: &Tenant, prepared: PreparedTimelineDetach, _ctx: &RequestContext, -) -> Result, anyhow::Error> { +) -> Result { let PreparedTimelineDetach { layers } = prepared; - let ancestor = detached - .get_ancestor_timeline() - .expect("must still have a ancestor"); - let ancestor_lsn = detached.get_ancestor_lsn(); + #[derive(Debug)] + enum Ancestor { + NotDetached(Arc, Lsn), + Detached(Arc, Lsn), + } + + let (recorded_branchpoint, still_ongoing) = { + let access = detached.remote_client.initialized_upload_queue()?; + let latest = access.latest_uploaded_index_part(); + + ( + latest.lineage.detached_previous_ancestor(), + latest + .gc_blocking + .as_ref() + .is_some_and(|b| b.blocked_by(DetachAncestor)), + ) + }; + assert!( + still_ongoing, + "cannot (detach? reparent)? 
complete if the operation is not still ongoing" + ); + + let ancestor = match (detached.ancestor_timeline.as_ref(), recorded_branchpoint) { + (Some(ancestor), None) => { + assert!( + !layers.is_empty(), + "there should always be at least one layer to inherit" + ); + Ancestor::NotDetached(ancestor.clone(), detached.ancestor_lsn) + } + (Some(_), Some(_)) => { + panic!( + "it should be impossible to get to here without having gone through the tenant reset; if the tenant was reset, then the ancestor_timeline would be None" + ); + } + (None, Some((ancestor_id, ancestor_lsn))) => { + // it has been either: + // - detached but still exists => we can try reparenting + // - detached and deleted + // + // either way, we must complete + assert!( + layers.is_empty(), + "no layers should had been copied as detach is done" + ); + + let existing = tenant.timelines.lock().unwrap().get(&ancestor_id).cloned(); + + if let Some(ancestor) = existing { + Ancestor::Detached(ancestor, ancestor_lsn) + } else { + let direct_children = reparented_direct_children(detached, tenant)?; + return Ok(DetachingAndReparenting::AlreadyDone(direct_children)); + } + } + (None, None) => { + // TODO: make sure there are no `?` before tenant_reset from after a questionmark from + // here. + panic!( + "bug: detach_and_reparent called on a timeline which has not been detached or which has no live ancestor" + ); + } + }; // publish the prepared layers before we reparent any of the timelines, so that on restart // reparented timelines find layers. also do the actual detaching. // - // if we crash after this operation, we will at least come up having detached a timeline, but - // we cannot go back and reparent the timelines which would had been reparented in normal - // execution. - // - // this is not perfect, but it avoids us a retry happening after a compaction or gc on restart - // which could give us a completely wrong layer combination. 
- detached - .remote_client - .schedule_adding_existing_layers_to_index_detach_and_wait( - &layers, - (ancestor.timeline_id, ancestor_lsn), - ) - .await?; + // if we crash after this operation, a retry will allow reparenting the remaining timelines as + // gc is blocked. + + let (ancestor, ancestor_lsn, was_detached) = match ancestor { + Ancestor::NotDetached(ancestor, ancestor_lsn) => { + // this has to complete before any reparentings because otherwise they would not have + // layers on the new parent. + detached + .remote_client + .schedule_adding_existing_layers_to_index_detach_and_wait( + &layers, + (ancestor.timeline_id, ancestor_lsn), + ) + .await + .context("publish layers and detach ancestor") + .map_err(|e| Error::launder(e, Error::DetachReparent))?; + + tracing::info!( + ancestor=%ancestor.timeline_id, + %ancestor_lsn, + inherited_layers=%layers.len(), + "detached from ancestor" + ); + (ancestor, ancestor_lsn, true) + } + Ancestor::Detached(ancestor, ancestor_lsn) => (ancestor, ancestor_lsn, false), + }; let mut tasks = tokio::task::JoinSet::new(); + // Returns a single permit semaphore which will be used to make one reparenting succeed, + // others will fail as if those timelines had been stopped for whatever reason. + #[cfg(feature = "testing")] + let failpoint_sem = || -> Option> { + fail::fail_point!("timeline-detach-ancestor::allow_one_reparented", |_| Some( + Arc::new(Semaphore::new(1)) + )); + None + }(); + // because we are now keeping the slot in progress, it is unlikely that there will be any // timeline deletions during this time. if we raced one, then we'll just ignore it. 
- tenant - .timelines - .lock() - .unwrap() - .values() - .filter_map(|tl| { - if Arc::ptr_eq(tl, detached) { - return None; - } + { + let g = tenant.timelines.lock().unwrap(); + reparentable_timelines(g.values(), detached, &ancestor, ancestor_lsn) + .cloned() + .for_each(|timeline| { + // important in this scope: we are holding the Tenant::timelines lock + let span = tracing::info_span!("reparent", reparented=%timeline.timeline_id); + let new_parent = detached.timeline_id; + #[cfg(feature = "testing")] + let failpoint_sem = failpoint_sem.clone(); - if !tl.is_active() { - return None; - } + tasks.spawn( + async move { + let res = async { + #[cfg(feature = "testing")] + if let Some(failpoint_sem) = failpoint_sem { + let _permit = failpoint_sem.acquire().await.map_err(|_| { + anyhow::anyhow!( + "failpoint: timeline-detach-ancestor::allow_one_reparented", + ) + })?; + failpoint_sem.close(); + } - let tl_ancestor = tl.ancestor_timeline.as_ref()?; - let is_same = Arc::ptr_eq(&ancestor, tl_ancestor); - let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn; - - let is_deleting = tl - .delete_progress - .try_lock() - .map(|flow| !flow.is_not_started()) - .unwrap_or(true); - - if is_same && is_earlier && !is_deleting { - Some(tl.clone()) - } else { - None - } - }) - .for_each(|timeline| { - // important in this scope: we are holding the Tenant::timelines lock - let span = tracing::info_span!("reparent", reparented=%timeline.timeline_id); - let new_parent = detached.timeline_id; - - tasks.spawn( - async move { - let res = timeline - .remote_client - .schedule_reparenting_and_wait(&new_parent) + timeline + .remote_client + .schedule_reparenting_and_wait(&new_parent) + .await + } .await; - match res { - Ok(()) => Some(timeline), - Err(e) => { - // with the use of tenant slot, we no longer expect these. 
- tracing::warn!("reparenting failed: {e:#}"); - None + match res { + Ok(()) => { + tracing::info!("reparented"); + Some(timeline) + } + Err(e) => { + // with the use of tenant slot, raced timeline deletion is the most + // likely reason. + tracing::warn!("reparenting failed: {e:#}"); + None + } } } - } - .instrument(span), - ); - }); + .instrument(span), + ); + }); + } let reparenting_candidates = tasks.len(); - let mut reparented = Vec::with_capacity(tasks.len()); + let mut reparented = HashSet::with_capacity(tasks.len()); while let Some(res) = tasks.join_next().await { match res { Ok(Some(timeline)) => { - tracing::info!(reparented=%timeline.timeline_id, "reparenting done"); - reparented.push(timeline.timeline_id); - } - Ok(None) => { - // lets just ignore this for now. one or all reparented timelines could had - // started deletion, and that is fine. + assert!( + reparented.insert(timeline.timeline_id), + "duplicate reparenting? timeline_id={}", + timeline.timeline_id + ); } Err(je) if je.is_cancelled() => unreachable!("not used"), - Err(je) if je.is_panic() => { - // ignore; it's better to continue with a single reparenting failing (or even - // all of them) in order to get to the goal state. - // - // these timelines will never be reparentable, but they can be always detached as - // separate tree roots. 
- } + // just ignore failures now, we can retry + Ok(None) => {} + Err(je) if je.is_panic() => {} Err(je) => tracing::error!("unexpected join error: {je:?}"), } } - if reparenting_candidates != reparented.len() { - tracing::info!("failed to reparent some candidates"); + let reparented_all = reparenting_candidates == reparented.len(); + + if reparented_all { + Ok(DetachingAndReparenting::Reparented(reparented)) + } else { + tracing::info!( + reparented = reparented.len(), + candidates = reparenting_candidates, + "failed to reparent all candidates; they can be retried after the tenant_reset", + ); + + let must_reset_tenant = !reparented.is_empty() || was_detached; + Ok(DetachingAndReparenting::SomeReparentingFailed { must_reset_tenant }) + } +} + +pub(super) async fn complete( + detached: &Arc, + tenant: &Tenant, + mut attempt: Attempt, + _ctx: &RequestContext, +) -> Result<(), Error> { + assert_eq!(detached.timeline_id, attempt.timeline_id); + + if attempt.gate_entered.is_none() { + let entered = detached.gate.enter().map_err(|_| Error::ShuttingDown)?; + attempt.gate_entered = Some(entered); + } else { + // Some(gate_entered) means the tenant was not restarted, as is not required } - Ok(reparented) + assert!(detached.ancestor_timeline.is_none()); + + // this should be an 503 at least...? + fail::fail_point!( + "timeline-detach-ancestor::complete_before_uploading", + |_| Err(Error::Failpoint( + "timeline-detach-ancestor::complete_before_uploading" + )) + ); + + tenant + .gc_block + .remove( + detached, + crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor, + ) + .await + .map_err(|e| Error::launder(e, Error::Complete))?; + + Ok(()) +} + +/// Query against a locked `Tenant::timelines`. 
+fn reparentable_timelines<'a, I>( + timelines: I, + detached: &'a Arc, + ancestor: &'a Arc, + ancestor_lsn: Lsn, +) -> impl Iterator> + 'a +where + I: Iterator> + 'a, +{ + timelines.filter_map(move |tl| { + if Arc::ptr_eq(tl, detached) { + return None; + } + + let tl_ancestor = tl.ancestor_timeline.as_ref()?; + let is_same = Arc::ptr_eq(ancestor, tl_ancestor); + let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn; + + let is_deleting = tl + .delete_progress + .try_lock() + .map(|flow| !flow.is_not_started()) + .unwrap_or(true); + + if is_same && is_earlier && !is_deleting { + Some(tl) + } else { + None + } + }) } diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 8a8c38d0ce..2f6cb4d73a 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -30,7 +30,8 @@ use crate::{ pgdatadir_mapping::CollectKeySpaceError, task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, tenant::{ - tasks::BackgroundLoopKind, timeline::EvictionError, LogicalSizeCalculationCause, Tenant, + storage_layer::LayerVisibilityHint, tasks::BackgroundLoopKind, timeline::EvictionError, + LogicalSizeCalculationCause, Tenant, }, }; @@ -59,13 +60,12 @@ impl Timeline { task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::Eviction, - Some(self.tenant_shard_id), + self.tenant_shard_id, Some(self.timeline_id), &format!( "layer eviction for {}/{}", self.tenant_shard_id, self.timeline_id ), - false, async move { tokio::select! 
{ _ = self_clone.cancel.cancelled() => { return Ok(()); } @@ -214,51 +214,60 @@ impl Timeline { let mut js = tokio::task::JoinSet::new(); { let guard = self.layers.read().await; - let layers = guard.layer_map(); - for layer in layers.iter_historic_layers() { - let layer = guard.get_from_desc(&layer); - // guard against eviction while we inspect it; it might be that eviction_task and - // disk_usage_eviction_task both select the same layers to be evicted, and - // seemingly free up double the space. both succeeding is of no consequence. + guard + .likely_resident_layers() + .filter(|layer| { + let last_activity_ts = layer.latest_activity(); - if !layer.is_likely_resident() { - continue; - } + let no_activity_for = match now.duration_since(last_activity_ts) { + Ok(d) => d, + Err(_e) => { + // We reach here if `now` < `last_activity_ts`, which can legitimately + // happen if there is an access between us getting `now`, and us getting + // the access stats from the layer. + // + // The other reason why it can happen is system clock skew because + // SystemTime::now() is not monotonic, so, even if there is no access + // to the layer after we get `now` at the beginning of this function, + // it could be that `now` < `last_activity_ts`. + // + // To distinguish the cases, we would need to record `Instant`s in the + // access stats (i.e., monotonic timestamps), but then, the timestamps + // values in the access stats would need to be `Instant`'s, and hence + // they would be meaningless outside of the pageserver process. + // At the time of writing, the trade-off is that access stats are more + // valuable than detecting clock skew. 
+ return false; + } + }; - let last_activity_ts = layer.access_stats().latest_activity_or_now(); - - let no_activity_for = match now.duration_since(last_activity_ts) { - Ok(d) => d, - Err(_e) => { - // We reach here if `now` < `last_activity_ts`, which can legitimately - // happen if there is an access between us getting `now`, and us getting - // the access stats from the layer. - // - // The other reason why it can happen is system clock skew because - // SystemTime::now() is not monotonic, so, even if there is no access - // to the layer after we get `now` at the beginning of this function, - // it could be that `now` < `last_activity_ts`. - // - // To distinguish the cases, we would need to record `Instant`s in the - // access stats (i.e., monotonic timestamps), but then, the timestamps - // values in the access stats would need to be `Instant`'s, and hence - // they would be meaningless outside of the pageserver process. - // At the time of writing, the trade-off is that access stats are more - // valuable than detecting clock skew. - continue; + match layer.visibility() { + LayerVisibilityHint::Visible => { + // Usual case: a visible layer might be read any time, and we will keep it + // resident until it hits our configured TTL threshold. + no_activity_for > p.threshold + } + LayerVisibilityHint::Covered => { + // Covered layers: this is probably a layer that was recently covered by + // an image layer during compaction. 
We don't evict it immediately, but + // it doesn't stay resident for the full `threshold`: we just keep it + // for a shorter time in case + // - it is used for Timestamp->LSN lookups + // - a new branch is created in recent history which will read this layer + no_activity_for > p.period + } } - }; - - if no_activity_for > p.threshold { + }) + .cloned() + .for_each(|layer| { js.spawn(async move { layer .evict_and_wait(std::time::Duration::from_secs(5)) .await }); stats.candidates += 1; - } - } + }); }; let join_all = async move { diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs new file mode 100644 index 0000000000..e82559b8b3 --- /dev/null +++ b/pageserver/src/tenant/timeline/handle.rs @@ -0,0 +1,967 @@ +//! An efficient way to keep the timeline gate open without preventing +//! timeline shutdown for longer than a single call to a timeline method. +//! +//! # Motivation +//! +//! On a single page service connection, we're typically serving a single TenantTimelineId. +//! +//! Without sharding, there is a single Timeline object to which we dispatch +//! all requests. For example, a getpage request gets dispatched to the +//! Timeline::get method of the Timeline object that represents the +//! (tenant,timeline) of that connection. +//! +//! With sharding, for each request that comes in on the connection, +//! we first have to perform shard routing based on the requested key (=~ page number). +//! The result of shard routing is a Timeline object. +//! We then dispatch the request to that Timeline object. +//! +//! Regardless of whether the tenant is sharded or not, we want to ensure that +//! we hold the Timeline gate open while we're invoking the method on the +//! Timeline object. +//! +//! However, we want to avoid the overhead of entering the gate for every +//! method invocation. +//! +//! Further, for shard routing, we want to avoid calling the tenant manager to +//! resolve the shard for every request. 
Instead, we want to cache the +//! routing result so we can bypass the tenant manager for all subsequent requests +//! that get routed to that shard. +//! +//! Regardless of how we accomplish the above, it should not +//! prevent the Timeline from shutting down promptly. +//! +//! # Design +//! +//! There are three user-facing data structures: +//! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime. +//! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime. +//! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`. +//! Lifetime: for a single request dispatch on the Timeline (i.e., one getpage request) +//! +//! The `Handle` is just a wrapper around an `Arc<HandleInner>`. +//! +//! There is one long-lived `Arc<HandleInner>`, which is stored in the `PerTimelineState`. +//! The `Cache` stores a `Weak<HandleInner>` for each cached Timeline. +//! +//! To dispatch a request, the page service connection calls `Cache::get`. +//! +//! A cache miss means we consult the tenant manager for shard routing, +//! resulting in an `Arc<Timeline>`. We enter its gate _once_ and construct an +//! `Arc<HandleInner>`. We store a `Weak<HandleInner>` in the cache +//! and the `Arc<HandleInner>` in the `PerTimelineState`. +//! +//! For subsequent requests, `Cache::get` will perform a "fast path" shard routing +//! and find the `Weak<HandleInner>` in the cache. +//! We upgrade the `Weak<HandleInner>` to an `Arc<HandleInner>` and wrap it in the user-facing `Handle` type. +//! +//! The request handler dispatches the request to the right `<Handle as Deref<Target = Timeline>>::$request_method`. +//! It then drops the `Handle`, which drops the `Arc<HandleInner>`. +//! +//! # Memory Management / How The Reference Cycle Is Broken +//! +//! The attentive reader may have noticed the strong reference cycle +//! from `Arc<Timeline>` to `PerTimelineState` to `Arc<HandleInner>`. +//! +//! This cycle is intentional: while it exists, the `Cache` can upgrade its +//! `Weak<HandleInner>` to an `Arc<HandleInner>` in a single atomic operation. +//! +//! The cycle is broken by either +//!
- `PerTimelineState::shutdown` or +//! - dropping the `Cache`. +//! +//! Concurrently existing `Handle`s will extend the existence of the cycle. +//! However, since `Handle`s are short-lived and new `Handle`s are not +//! handed out after either `PerTimelineState::shutdown` or `Cache` drop, +//! that extension of the cycle is bounded. +//! +//! # Fast Path for Shard Routing +//! +//! The `Cache` has a fast path for shard routing to avoid calling into +//! the tenant manager for every request. +//! +//! The `Cache` maintains a hash map of `ShardTimelineId` to `Weak`. +//! +//! The current implementation uses the first entry in the hash map +//! to determine the `ShardParameters` and derive the correct +//! `ShardIndex` for the requested key. +//! +//! It then looks up the hash map for that `ShardTimelineId := {ShardIndex,TimelineId}`. +//! +//! If the lookup is successful and the `Weak` can be upgraded, +//! it's a hit. +//! +//! ## Cache invalidation +//! +//! The insight is that cache invalidation is sufficient and most efficiently done lazily. +//! The only reasons why an entry in the cache can become stale are: +//! 1. The `PerTimelineState` / Timeline is shutting down e.g. because the shard is +//! being detached, timeline or shard deleted, or pageserver is shutting down. +//! 2. We're doing a shard split and new traffic should be routed to the child shards. +//! +//! Regarding (1), we will eventually fail to upgrade the `Weak` once the +//! timeline has shut down, and when that happens, we remove the entry from the cache. +//! +//! Regarding (2), the insight is that it is totally fine to keep dispatching requests +//! to the parent shard during a shard split. Eventually, the shard split task will +//! shut down the parent => case (1).
+ +use std::collections::hash_map; +use std::collections::HashMap; +use std::sync::atomic::AtomicBool; +use std::sync::atomic::Ordering; +use std::sync::Arc; +use std::sync::Mutex; +use std::sync::Weak; + +use pageserver_api::shard::ShardIdentity; +use tracing::instrument; +use tracing::trace; +use utils::id::TimelineId; +use utils::shard::ShardIndex; +use utils::shard::ShardNumber; + +use crate::tenant::mgr::ShardSelector; + +/// The requirement for Debug is so that #[derive(Debug)] works in some places. +pub(crate) trait Types: Sized + std::fmt::Debug { + type TenantManagerError: Sized + std::fmt::Debug; + type TenantManager: TenantManager + Sized; + type Timeline: ArcTimeline + Sized; +} + +/// Uniquely identifies a [`Cache`] instance over the lifetime of the process. +/// Required so [`Cache::drop`] can take out the handles from the [`PerTimelineState`]. +/// Alternative to this would be to allocate [`Cache`] in a `Box` and identify it by the pointer. +#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] +struct CacheId(u64); + +impl CacheId { + fn next() -> Self { + static NEXT_ID: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1); + let id = NEXT_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + if id == 0 { + panic!("CacheId::new() returned 0, overflow"); + } + Self(id) + } +} + +/// See module-level comment. +pub(crate) struct Cache { + id: CacheId, + map: Map, +} + +type Map = HashMap>>; + +impl Default for Cache { + fn default() -> Self { + Self { + id: CacheId::next(), + map: Default::default(), + } + } +} + +#[derive(PartialEq, Eq, Debug, Hash, Clone, Copy)] +pub(crate) struct ShardTimelineId { + pub(crate) shard_index: ShardIndex, + pub(crate) timeline_id: TimelineId, +} + +/// See module-level comment. +pub(crate) struct Handle(Arc>); +struct HandleInner { + shut_down: AtomicBool, + timeline: T::Timeline, + // The timeline's gate held open. 
+ _gate_guard: utils::sync::gate::GateGuard, +} + +/// Embedded in each [`Types::Timeline`] as the anchor for the only long-lived strong ref to `HandleInner`. +/// +/// See module-level comment for details. +pub struct PerTimelineState { + // None = shutting down + handles: Mutex>>>>, +} + +impl Default for PerTimelineState { + fn default() -> Self { + Self { + handles: Mutex::new(Some(Default::default())), + } + } +} + +/// Abstract view of [`crate::tenant::mgr`], for testability. +pub(crate) trait TenantManager { + /// Invoked by [`Cache::get`] to resolve a [`ShardTimelineId`] to a [`Types::Timeline`]. + /// Errors are returned as [`GetError::TenantManager`]. + async fn resolve( + &self, + timeline_id: TimelineId, + shard_selector: ShardSelector, + ) -> Result; +} + +/// Abstract view of an [`Arc`], for testability. +pub(crate) trait ArcTimeline: Clone { + fn gate(&self) -> &utils::sync::gate::Gate; + fn shard_timeline_id(&self) -> ShardTimelineId; + fn get_shard_identity(&self) -> &ShardIdentity; + fn per_timeline_state(&self) -> &PerTimelineState; +} + +/// Errors returned by [`Cache::get`]. +#[derive(Debug)] +pub(crate) enum GetError { + TenantManager(T::TenantManagerError), + TimelineGateClosed, + PerTimelineStateShutDown, +} + +/// Internal type used in [`Cache::get`]. +enum RoutingResult { + FastPath(Handle), + SlowPath(ShardTimelineId), + NeedConsultTenantManager, +} + +impl Cache { + /// See module-level comment for details. + /// + /// Does NOT check for the shutdown state of [`Types::Timeline`]. + /// Instead, the methods of [`Types::Timeline`] that are invoked through + /// the [`Handle`] are responsible for checking these conditions + /// and if so, return an error that causes the page service to + /// close the connection. 
+ #[instrument(level = "trace", skip_all)] + pub(crate) async fn get( + &mut self, + timeline_id: TimelineId, + shard_selector: ShardSelector, + tenant_manager: &T::TenantManager, + ) -> Result, GetError> { + // terminates because each iteration removes an element from the map + loop { + let handle = self + .get_impl(timeline_id, shard_selector, tenant_manager) + .await?; + if handle.0.shut_down.load(Ordering::Relaxed) { + let removed = self + .map + .remove(&handle.0.timeline.shard_timeline_id()) + .expect("invariant of get_impl is that the returned handle is in the map"); + assert!( + Weak::ptr_eq(&removed, &Arc::downgrade(&handle.0)), + "shard_timeline_id() incorrect?" + ); + } else { + return Ok(handle); + } + } + } + + #[instrument(level = "trace", skip_all)] + async fn get_impl( + &mut self, + timeline_id: TimelineId, + shard_selector: ShardSelector, + tenant_manager: &T::TenantManager, + ) -> Result, GetError> { + let miss: ShardSelector = { + let routing_state = self.shard_routing(timeline_id, shard_selector); + match routing_state { + RoutingResult::FastPath(handle) => return Ok(handle), + RoutingResult::SlowPath(key) => match self.map.get(&key) { + Some(cached) => match cached.upgrade() { + Some(upgraded) => return Ok(Handle(upgraded)), + None => { + trace!("handle cache stale"); + self.map.remove(&key).unwrap(); + ShardSelector::Known(key.shard_index) + } + }, + None => ShardSelector::Known(key.shard_index), + }, + RoutingResult::NeedConsultTenantManager => shard_selector, + } + }; + self.get_miss(timeline_id, miss, tenant_manager).await + } + + #[inline(always)] + fn shard_routing( + &mut self, + timeline_id: TimelineId, + shard_selector: ShardSelector, + ) -> RoutingResult { + loop { + // terminates because when every iteration we remove an element from the map + let Some((first_key, first_handle)) = self.map.iter().next() else { + return RoutingResult::NeedConsultTenantManager; + }; + let Some(first_handle) = first_handle.upgrade() else { + // TODO: 
dedup with get() + trace!("handle cache stale"); + let first_key_owned = *first_key; + self.map.remove(&first_key_owned).unwrap(); + continue; + }; + + let first_handle_shard_identity = first_handle.timeline.get_shard_identity(); + let make_shard_index = |shard_num: ShardNumber| ShardIndex { + shard_number: shard_num, + shard_count: first_handle_shard_identity.count, + }; + + let need_idx = match shard_selector { + ShardSelector::Page(key) => { + make_shard_index(first_handle_shard_identity.get_shard_number(&key)) + } + ShardSelector::Zero => make_shard_index(ShardNumber(0)), + ShardSelector::Known(shard_idx) => shard_idx, + }; + let need_shard_timeline_id = ShardTimelineId { + shard_index: need_idx, + timeline_id, + }; + let first_handle_shard_timeline_id = ShardTimelineId { + shard_index: first_handle_shard_identity.shard_index(), + timeline_id: first_handle.timeline.shard_timeline_id().timeline_id, + }; + + if need_shard_timeline_id == first_handle_shard_timeline_id { + return RoutingResult::FastPath(Handle(first_handle)); + } else { + return RoutingResult::SlowPath(need_shard_timeline_id); + } + } + } + + #[instrument(level = "trace", skip_all)] + #[inline(always)] + async fn get_miss( + &mut self, + timeline_id: TimelineId, + shard_selector: ShardSelector, + tenant_manager: &T::TenantManager, + ) -> Result, GetError> { + match tenant_manager.resolve(timeline_id, shard_selector).await { + Ok(timeline) => { + let key = timeline.shard_timeline_id(); + match &shard_selector { + ShardSelector::Zero => assert_eq!(key.shard_index.shard_number, ShardNumber(0)), + ShardSelector::Page(_) => (), // gotta trust tenant_manager + ShardSelector::Known(idx) => assert_eq!(idx, &key.shard_index), + } + + let gate_guard = match timeline.gate().enter() { + Ok(guard) => guard, + Err(_) => { + return Err(GetError::TimelineGateClosed); + } + }; + trace!("creating new HandleInner"); + let handle = Arc::new( + // TODO: global metric that keeps track of the number of live 
HandlerTimeline instances + // so we can identify reference cycle bugs. + HandleInner { + shut_down: AtomicBool::new(false), + _gate_guard: gate_guard, + timeline: timeline.clone(), + }, + ); + let handle = { + let mut lock_guard = timeline + .per_timeline_state() + .handles + .lock() + .expect("mutex poisoned"); + match &mut *lock_guard { + Some(per_timeline_state) => { + let replaced = per_timeline_state.insert(self.id, Arc::clone(&handle)); + assert!(replaced.is_none(), "some earlier code left a stale handle"); + match self.map.entry(key) { + hash_map::Entry::Occupied(_o) => { + // This cannot not happen because + // 1. we're the _miss_ handle, i.e., `self.map` didn't contain an entry and + // 2. we were holding &mut self during .resolve().await above, so, no other thread can have inserted a handle + // while we were waiting for the tenant manager. + unreachable!() + } + hash_map::Entry::Vacant(v) => { + v.insert(Arc::downgrade(&handle)); + handle + } + } + } + None => { + return Err(GetError::PerTimelineStateShutDown); + } + } + }; + Ok(Handle(handle)) + } + Err(e) => Err(GetError::TenantManager(e)), + } + } +} + +impl PerTimelineState { + /// After this method returns, [`Cache::get`] will never again return a [`Handle`] + /// to the [`Types::Timeline`] that embeds this per-timeline state. + /// Even if [`TenantManager::resolve`] would still resolve to it. + /// + /// Already-alive [`Handle`]s for will remain open, usable, and keeping the [`ArcTimeline`] alive. + /// That's ok because they're short-lived. See module-level comment for details. + #[instrument(level = "trace", skip_all)] + pub(super) fn shutdown(&self) { + let handles = self + .handles + .lock() + .expect("mutex poisoned") + // NB: this .take() sets locked to None. + // That's what makes future `Cache::get` misses fail. + // Cache hits are taken care of below. 
+ .take(); + let Some(handles) = handles else { + trace!("already shut down"); + return; + }; + for handle in handles.values() { + // Make hits fail. + handle.shut_down.store(true, Ordering::Relaxed); + } + drop(handles); + } +} + +impl std::ops::Deref for Handle { + type Target = T::Timeline; + fn deref(&self) -> &Self::Target { + &self.0.timeline + } +} + +#[cfg(test)] +impl Drop for HandleInner { + fn drop(&mut self) { + trace!("HandleInner dropped"); + } +} + +// When dropping a [`Cache`], prune its handles in the [`PerTimelineState`] to break the reference cycle. +impl Drop for Cache { + fn drop(&mut self) { + for (_, weak) in self.map.drain() { + if let Some(strong) = weak.upgrade() { + // handle is still being kept alive in PerTimelineState + let timeline = strong.timeline.per_timeline_state(); + let mut handles = timeline.handles.lock().expect("mutex poisoned"); + if let Some(handles) = &mut *handles { + let Some(removed) = handles.remove(&self.id) else { + // There could have been a shutdown inbetween us upgrading the weak and locking the mutex. 
+ continue; + }; + assert!(Arc::ptr_eq(&removed, &strong)); + } + } + } + } +} + +#[cfg(test)] +mod tests { + use pageserver_api::{ + key::{rel_block_to_key, Key, DBDIR_KEY}, + models::ShardParameters, + reltag::RelTag, + shard::ShardStripeSize, + }; + use utils::shard::ShardCount; + + use super::*; + + const FOREVER: std::time::Duration = std::time::Duration::from_secs(u64::MAX); + + #[derive(Debug)] + struct TestTypes; + impl Types for TestTypes { + type TenantManagerError = anyhow::Error; + type TenantManager = StubManager; + type Timeline = Arc; + } + + struct StubManager { + shards: Vec>, + } + + struct StubTimeline { + gate: utils::sync::gate::Gate, + id: TimelineId, + shard: ShardIdentity, + per_timeline_state: PerTimelineState, + myself: Weak, + } + + impl StubTimeline { + fn getpage(&self) { + // do nothing + } + } + + impl ArcTimeline for Arc { + fn gate(&self) -> &utils::sync::gate::Gate { + &self.gate + } + + fn shard_timeline_id(&self) -> ShardTimelineId { + ShardTimelineId { + shard_index: self.shard.shard_index(), + timeline_id: self.id, + } + } + + fn get_shard_identity(&self) -> &ShardIdentity { + &self.shard + } + + fn per_timeline_state(&self) -> &PerTimelineState { + &self.per_timeline_state + } + } + + impl TenantManager for StubManager { + async fn resolve( + &self, + timeline_id: TimelineId, + shard_selector: ShardSelector, + ) -> anyhow::Result> { + for timeline in &self.shards { + if timeline.id == timeline_id { + match &shard_selector { + ShardSelector::Zero if timeline.shard.is_shard_zero() => { + return Ok(Arc::clone(timeline)); + } + ShardSelector::Zero => continue, + ShardSelector::Page(key) if timeline.shard.is_key_local(key) => { + return Ok(Arc::clone(timeline)); + } + ShardSelector::Page(_) => continue, + ShardSelector::Known(idx) if idx == &timeline.shard.shard_index() => { + return Ok(Arc::clone(timeline)); + } + ShardSelector::Known(_) => continue, + } + } + } + anyhow::bail!("not found") + } + } + + #[tokio::test(start_paused = 
true)] + async fn test_timeline_shutdown() { + crate::tenant::harness::setup_logging(); + + let timeline_id = TimelineId::generate(); + let shard0 = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_id, + shard: ShardIdentity::unsharded(), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let mgr = StubManager { + shards: vec![shard0.clone()], + }; + let key = DBDIR_KEY; + + let mut cache = Cache::::default(); + + // + // fill the cache + // + assert_eq!( + (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), + (2, 1), + "strong: shard0, mgr; weak: myself" + ); + + let handle: Handle<_> = cache + .get(timeline_id, ShardSelector::Page(key), &mgr) + .await + .expect("we have the timeline"); + let handle_inner_weak = Arc::downgrade(&handle.0); + assert!(Weak::ptr_eq(&handle.myself, &shard0.myself)); + assert_eq!( + ( + Weak::strong_count(&handle_inner_weak), + Weak::weak_count(&handle_inner_weak) + ), + (2, 2), + "strong: handle, per_timeline_state, weak: handle_inner_weak, cache" + ); + assert_eq!(cache.map.len(), 1); + + assert_eq!( + (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), + (3, 1), + "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself" + ); + drop(handle); + assert_eq!( + (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), + (3, 1), + "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself" + ); + + // + // demonstrate that Handle holds up gate closure + // but shutdown prevents new handles from being handed out + // + + tokio::select! 
{ + _ = shard0.gate.close() => { + panic!("cache and per-timeline handler state keep cache open"); + } + _ = tokio::time::sleep(FOREVER) => { + // NB: first poll of close() makes it enter closing state + } + } + + let handle = cache + .get(timeline_id, ShardSelector::Page(key), &mgr) + .await + .expect("we have the timeline"); + assert!(Weak::ptr_eq(&handle.myself, &shard0.myself)); + + // SHUTDOWN + shard0.per_timeline_state.shutdown(); // keeping handle alive across shutdown + + assert_eq!( + 1, + Weak::strong_count(&handle_inner_weak), + "through local var handle" + ); + assert_eq!( + cache.map.len(), + 1, + "this is an implementation detail but worth pointing out: we can't clear the cache from shutdown(), it's cleared on first access after" + ); + assert_eq!( + (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), + (3, 1), + "strong: handleinner(via handle), shard0, mgr; weak: myself" + ); + + // this handle is perfectly usable + handle.getpage(); + + cache + .get(timeline_id, ShardSelector::Page(key), &mgr) + .await + .err() + .expect("documented behavior: can't get new handle after shutdown, even if there is an alive Handle"); + assert_eq!( + cache.map.len(), + 0, + "first access after shutdown cleans up the Weak's from the cache" + ); + + tokio::select! { + _ = shard0.gate.close() => { + panic!("handle is keeping gate open"); + } + _ = tokio::time::sleep(FOREVER) => { } + } + + drop(handle); + assert_eq!( + 0, + Weak::strong_count(&handle_inner_weak), + "the HandleInner destructor already ran" + ); + assert_eq!( + (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), + (2, 1), + "strong: shard0, mgr; weak: myself" + ); + + // closing gate succeeds after dropping handle + tokio::select! 
{ + _ = shard0.gate.close() => { } + _ = tokio::time::sleep(FOREVER) => { + panic!("handle is dropped, no other gate holders exist") + } + } + + // map gets cleaned on next lookup + cache + .get(timeline_id, ShardSelector::Page(key), &mgr) + .await + .err() + .expect("documented behavior: can't get new handle after shutdown"); + assert_eq!(cache.map.len(), 0); + + // ensure all refs to shard0 are gone and we're not leaking anything + let myself = Weak::clone(&shard0.myself); + drop(shard0); + drop(mgr); + assert_eq!(Weak::strong_count(&myself), 0); + } + + #[tokio::test] + async fn test_multiple_timelines_and_deletion() { + crate::tenant::harness::setup_logging(); + + let timeline_a = TimelineId::generate(); + let timeline_b = TimelineId::generate(); + assert_ne!(timeline_a, timeline_b); + let timeline_a = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_a, + shard: ShardIdentity::unsharded(), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let timeline_b = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_b, + shard: ShardIdentity::unsharded(), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let mut mgr = StubManager { + shards: vec![timeline_a.clone(), timeline_b.clone()], + }; + let key = DBDIR_KEY; + + let mut cache = Cache::::default(); + + cache + .get(timeline_a.id, ShardSelector::Page(key), &mgr) + .await + .expect("we have it"); + cache + .get(timeline_b.id, ShardSelector::Page(key), &mgr) + .await + .expect("we have it"); + assert_eq!(cache.map.len(), 2); + + // delete timeline A + timeline_a.per_timeline_state.shutdown(); + mgr.shards.retain(|t| t.id != timeline_a.id); + assert!( + mgr.resolve(timeline_a.id, ShardSelector::Page(key)) + .await + .is_err(), + "broken StubManager implementation" + ); + + assert_eq!( + cache.map.len(), + 2, + "cache still has a Weak handle to Timeline A" + ); + cache + 
.get(timeline_a.id, ShardSelector::Page(key), &mgr) + .await + .err() + .expect("documented behavior: can't get new handle after shutdown"); + assert_eq!(cache.map.len(), 1, "next access cleans up the cache"); + + cache + .get(timeline_b.id, ShardSelector::Page(key), &mgr) + .await + .expect("we still have it"); + } + + fn make_relation_key_for_shard(shard: ShardNumber, params: &ShardParameters) -> Key { + rel_block_to_key( + RelTag { + spcnode: 1663, + dbnode: 208101, + relnode: 2620, + forknum: 0, + }, + shard.0 as u32 * params.stripe_size.0, + ) + } + + #[tokio::test(start_paused = true)] + async fn test_shard_split() { + crate::tenant::harness::setup_logging(); + let timeline_id = TimelineId::generate(); + let parent = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_id, + shard: ShardIdentity::unsharded(), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let child_params = ShardParameters { + count: ShardCount(2), + stripe_size: ShardStripeSize::default(), + }; + let child0 = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_id, + shard: ShardIdentity::from_params(ShardNumber(0), &child_params), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let child1 = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_id, + shard: ShardIdentity::from_params(ShardNumber(1), &child_params), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let child_shards_by_shard_number = [child0.clone(), child1.clone()]; + + let mut cache = Cache::::default(); + + // fill the cache with the parent + for i in 0..2 { + let handle = cache + .get( + timeline_id, + ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)), + &StubManager { + shards: vec![parent.clone()], + }, + ) + .await + .expect("we have it"); + assert!( + Weak::ptr_eq(&handle.myself, 
&parent.myself), + "mgr returns parent first" + ); + drop(handle); + } + + // + // SHARD SPLIT: tenant manager changes, but the cache isn't informed + // + + // while we haven't shut down the parent, the cache will return the cached parent, even + // if the tenant manager returns the child + for i in 0..2 { + let handle = cache + .get( + timeline_id, + ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)), + &StubManager { + shards: vec![], // doesn't matter what's in here, the cache is fully loaded + }, + ) + .await + .expect("we have it"); + assert!( + Weak::ptr_eq(&handle.myself, &parent.myself), + "mgr returns parent" + ); + drop(handle); + } + + let parent_handle = cache + .get( + timeline_id, + ShardSelector::Page(make_relation_key_for_shard(ShardNumber(0), &child_params)), + &StubManager { + shards: vec![parent.clone()], + }, + ) + .await + .expect("we have it"); + assert!(Weak::ptr_eq(&parent_handle.myself, &parent.myself)); + + // invalidate the cache + parent.per_timeline_state.shutdown(); + + // the cache will now return the child, even though the parent handle still exists + for i in 0..2 { + let handle = cache + .get( + timeline_id, + ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)), + &StubManager { + shards: vec![child0.clone(), child1.clone()], // <====== this changed compared to previous loop + }, + ) + .await + .expect("we have it"); + assert!( + Weak::ptr_eq( + &handle.myself, + &child_shards_by_shard_number[i as usize].myself + ), + "mgr returns child" + ); + drop(handle); + } + + // all the while the parent handle kept the parent gate open + tokio::select! { + _ = parent_handle.gate.close() => { + panic!("parent handle is keeping gate open"); + } + _ = tokio::time::sleep(FOREVER) => { } + } + drop(parent_handle); + tokio::select! 
{ + _ = parent.gate.close() => { } + _ = tokio::time::sleep(FOREVER) => { + panic!("parent handle is dropped, no other gate holders exist") + } + } + } + + #[tokio::test(start_paused = true)] + async fn test_connection_handler_exit() { + crate::tenant::harness::setup_logging(); + let timeline_id = TimelineId::generate(); + let shard0 = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_id, + shard: ShardIdentity::unsharded(), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let mgr = StubManager { + shards: vec![shard0.clone()], + }; + let key = DBDIR_KEY; + + // Simulate 10 connections that's opened, used, and closed + let mut used_handles = vec![]; + for _ in 0..10 { + let mut cache = Cache::::default(); + let handle = { + let handle = cache + .get(timeline_id, ShardSelector::Page(key), &mgr) + .await + .expect("we have the timeline"); + assert!(Weak::ptr_eq(&handle.myself, &shard0.myself)); + handle + }; + handle.getpage(); + used_handles.push(Arc::downgrade(&handle.0)); + } + + // No handles exist, thus gates are closed and don't require shutdown + assert!(used_handles + .iter() + .all(|weak| Weak::strong_count(weak) == 0)); + + // ... thus the gate should close immediately, even without shutdown + tokio::select! 
{ + _ = shard0.gate.close() => { } + _ = tokio::time::sleep(FOREVER) => { + panic!("handle is dropped, no other gate holders exist") + } + } + } +} diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 550a9a567a..8f20d84401 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -1,4 +1,4 @@ -use anyhow::{bail, ensure, Context, Result}; +use anyhow::{bail, ensure, Context}; use itertools::Itertools; use pageserver_api::shard::TenantShardId; use std::{collections::HashMap, sync::Arc}; @@ -24,35 +24,142 @@ use crate::{ use super::TimelineWriterState; /// Provides semantic APIs to manipulate the layer map. -#[derive(Default)] -pub(crate) struct LayerManager { - layer_map: LayerMap, - layer_fmgr: LayerFileManager, +pub(crate) enum LayerManager { + /// Open as in not shutdown layer manager; we still have in-memory layers and we can manipulate + /// the layers. + Open(OpenLayerManager), + /// Shutdown layer manager where there are no more in-memory layers and persistent layers are + /// read-only. + Closed { + layers: HashMap, + }, +} + +impl Default for LayerManager { + fn default() -> Self { + LayerManager::Open(OpenLayerManager::default()) + } } impl LayerManager { + pub(crate) fn get_from_key(&self, key: &PersistentLayerKey) -> Layer { + // The assumption for the `expect()` is that all code maintains the following invariant: + // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor. + self.layers() + .get(key) + .with_context(|| format!("get layer from key: {key}")) + .expect("not found") + .clone() + } + pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer { - self.layer_fmgr.get_from_desc(desc) + self.get_from_key(&desc.key()) } /// Get an immutable reference to the layer map. /// /// We expect users only to be able to get an immutable layer map. 
If users want to make modifications, /// they should use the below semantic APIs. This design makes us step closer to immutable storage state. - pub(crate) fn layer_map(&self) -> &LayerMap { - &self.layer_map + pub(crate) fn layer_map(&self) -> Result<&LayerMap, Shutdown> { + use LayerManager::*; + match self { + Open(OpenLayerManager { layer_map, .. }) => Ok(layer_map), + Closed { .. } => Err(Shutdown), + } } + pub(crate) fn open_mut(&mut self) -> Result<&mut OpenLayerManager, Shutdown> { + use LayerManager::*; + + match self { + Open(open) => Ok(open), + Closed { .. } => Err(Shutdown), + } + } + + /// LayerManager shutdown. The in-memory layers do cleanup on drop, so we must drop them in + /// order to allow shutdown to complete. + /// + /// If there was a want to flush in-memory layers, it must have happened earlier. + pub(crate) fn shutdown(&mut self, writer_state: &mut Option) { + use LayerManager::*; + match self { + Open(OpenLayerManager { + layer_map, + layer_fmgr: LayerFileManager(hashmap), + }) => { + let open = layer_map.open_layer.take(); + let frozen = layer_map.frozen_layers.len(); + let taken_writer_state = writer_state.take(); + tracing::info!(open = open.is_some(), frozen, "dropped inmemory layers"); + let layers = std::mem::take(hashmap); + *self = Closed { layers }; + assert_eq!(open.is_some(), taken_writer_state.is_some()); + } + Closed { .. 
} => { + tracing::debug!("ignoring multiple shutdowns on layer manager") + } + } + } + + /// Sum up the historic layer sizes + pub(crate) fn layer_size_sum(&self) -> u64 { + self.layers() + .values() + .map(|l| l.layer_desc().file_size) + .sum() + } + + pub(crate) fn likely_resident_layers(&self) -> impl Iterator + '_ { + self.layers().values().filter(|l| l.is_likely_resident()) + } + + pub(crate) fn contains(&self, layer: &Layer) -> bool { + self.contains_key(&layer.layer_desc().key()) + } + + pub(crate) fn contains_key(&self, key: &PersistentLayerKey) -> bool { + self.layers().contains_key(key) + } + + pub(crate) fn all_persistent_layers(&self) -> Vec { + self.layers().keys().cloned().collect_vec() + } + + fn layers(&self) -> &HashMap { + use LayerManager::*; + match self { + Open(OpenLayerManager { layer_fmgr, .. }) => &layer_fmgr.0, + Closed { layers } => layers, + } + } +} + +#[derive(Default)] +pub(crate) struct OpenLayerManager { + layer_map: LayerMap, + layer_fmgr: LayerFileManager, +} + +impl std::fmt::Debug for OpenLayerManager { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("OpenLayerManager") + .field("layer_count", &self.layer_fmgr.0.len()) + .finish() + } +} + +#[derive(Debug, thiserror::Error)] +#[error("layer manager has been shutdown")] +pub(crate) struct Shutdown; + +impl OpenLayerManager { /// Called from `load_layer_map`. Initialize the layer manager with: /// 1. all on-disk layers /// 2. 
next open layer (with disk disk_consistent_lsn LSN) - pub(crate) fn initialize_local_layers( - &mut self, - on_disk_layers: Vec, - next_open_layer_at: Lsn, - ) { + pub(crate) fn initialize_local_layers(&mut self, layers: Vec, next_open_layer_at: Lsn) { let mut updates = self.layer_map.batch_update(); - for layer in on_disk_layers { + for layer in layers { Self::insert_historic_layer(layer, &mut updates, &mut self.layer_fmgr); } updates.flush(); @@ -64,26 +171,19 @@ impl LayerManager { self.layer_map.next_open_layer_at = Some(next_open_layer_at); } - /// Open a new writable layer to append data if there is no open layer, otherwise return the current open layer, - /// called within `get_layer_for_write`. + /// Open a new writable layer to append data if there is no open layer, otherwise return the + /// current open layer, called within `get_layer_for_write`. pub(crate) async fn get_layer_for_write( &mut self, lsn: Lsn, - last_record_lsn: Lsn, conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, + gate_guard: utils::sync::gate::GateGuard, ctx: &RequestContext, - ) -> Result> { + ) -> anyhow::Result> { ensure!(lsn.is_aligned()); - ensure!( - lsn > last_record_lsn, - "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})", - lsn, - last_record_lsn, - ); - // Do we have a layer open for writing already? let layer = if let Some(open_layer) = &self.layer_map.open_layer { if open_layer.get_lsn_range().start > lsn { @@ -109,8 +209,15 @@ impl LayerManager { lsn ); - let new_layer = - InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn, ctx).await?; + let new_layer = InMemoryLayer::create( + conf, + timeline_id, + tenant_shard_id, + start_lsn, + gate_guard, + ctx, + ) + .await?; let layer = Arc::new(new_layer); self.layer_map.open_layer = Some(layer.clone()); @@ -164,7 +271,7 @@ impl LayerManager { froze } - /// Add image layers to the layer map, called from `create_image_layers`. 
+ /// Add image layers to the layer map, called from [`super::Timeline::create_image_layers`]. pub(crate) fn track_new_image_layers( &mut self, image_layers: &[ResidentLayer], @@ -227,7 +334,6 @@ impl LayerManager { } /// Called when a GC-compaction is completed. - #[cfg(test)] pub(crate) fn finish_gc_compaction( &mut self, compact_from: &[Layer], @@ -238,7 +344,7 @@ impl LayerManager { self.finish_compact_l0(compact_from, compact_to, metrics) } - /// Called when compaction is completed. + /// Called post-compaction when some previous generation image layers were trimmed. pub(crate) fn rewrite_layers( &mut self, rewrite_layers: &[(Layer, ResidentLayer)], @@ -256,6 +362,11 @@ impl LayerManager { new_layer.layer_desc().lsn_range ); + // Transfer visibility hint from old to new layer, since the new layer covers the same key space. This is not guaranteed to + // be accurate (as the new layer may cover a different subset of the key range), but is a sensible default, and prevents + // always marking rewritten layers as visible. + new_layer.as_ref().set_visibility(old_layer.visibility()); + // Safety: we may never rewrite the same file in-place. Callers are responsible // for ensuring that they only rewrite layers after something changes the path, // such as an increment in the generation number. @@ -322,27 +433,6 @@ impl LayerManager { mapping.remove(layer); layer.delete_on_drop(); } - - pub(crate) fn likely_resident_layers(&self) -> impl Iterator + '_ { - // for small layer maps, we most likely have all resident, but for larger more are likely - // to be evicted assuming lots of layers correlated with longer lifespan. 
- - self.layer_map().iter_historic_layers().filter_map(|desc| { - self.layer_fmgr - .0 - .get(&desc.key()) - .filter(|l| l.is_likely_resident()) - .cloned() - }) - } - - pub(crate) fn contains(&self, layer: &Layer) -> bool { - self.layer_fmgr.contains(layer) - } - - pub(crate) fn all_persistent_layers(&self) -> Vec { - self.layer_fmgr.0.keys().cloned().collect_vec() - } } pub(crate) struct LayerFileManager(HashMap); @@ -354,16 +444,6 @@ impl Default for LayerFileManager { } impl LayerFileManager { - fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T { - // The assumption for the `expect()` is that all code maintains the following invariant: - // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor. - self.0 - .get(&desc.key()) - .with_context(|| format!("get layer from desc: {}", desc.layer_name())) - .expect("not found") - .clone() - } - pub(crate) fn insert(&mut self, layer: T) { let present = self.0.insert(layer.layer_desc().key(), layer.clone()); if present.is_some() && cfg!(debug_assertions) { @@ -371,10 +451,6 @@ impl LayerFileManager { } } - pub(crate) fn contains(&self, layer: &T) -> bool { - self.0.contains_key(&layer.layer_desc().key()) - } - pub(crate) fn remove(&mut self, layer: &T) { let present = self.0.remove(&layer.layer_desc().key()); if present.is_none() && cfg!(debug_assertions) { diff --git a/pageserver/src/tenant/timeline/logical_size.rs b/pageserver/src/tenant/timeline/logical_size.rs index 8f9ca0e29f..f4a4eea54a 100644 --- a/pageserver/src/tenant/timeline/logical_size.rs +++ b/pageserver/src/tenant/timeline/logical_size.rs @@ -11,11 +11,11 @@ use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering}; /// Calculation consists of two stages: /// /// 1. Initial size calculation. That might take a long time, because it requires -/// reading all layers containing relation sizes at `initial_part_end`. 
+/// reading all layers containing relation sizes at `initial_part_end`. /// /// 2. Collecting an incremental part and adding that to the initial size. -/// Increments are appended on walreceiver writing new timeline data, -/// which result in increase or decrease of the logical size. +/// Increments are appended on walreceiver writing new timeline data, +/// which result in increase or decrease of the logical size. pub(super) struct LogicalSize { /// Size, potentially slow to compute. Calculating this might require reading multiple /// layers, and even ancestor's layers. @@ -45,17 +45,17 @@ pub(super) struct LogicalSize { /// Size shouldn't ever be negative, but this is signed for two reasons: /// /// 1. If we initialized the "baseline" size lazily, while we already - /// process incoming WAL, the incoming WAL records could decrement the - /// variable and temporarily make it negative. (This is just future-proofing; - /// the initialization is currently not done lazily.) + /// process incoming WAL, the incoming WAL records could decrement the + /// variable and temporarily make it negative. (This is just future-proofing; + /// the initialization is currently not done lazily.) /// /// 2. If there is a bug and we e.g. forget to increment it in some cases - /// when size grows, but remember to decrement it when it shrinks again, the - /// variable could go negative. In that case, it seems better to at least - /// try to keep tracking it, rather than clamp or overflow it. Note that - /// get_current_logical_size() will clamp the returned value to zero if it's - /// negative, and log an error. Could set it permanently to zero or some - /// special value to indicate "broken" instead, but this will do for now. + /// when size grows, but remember to decrement it when it shrinks again, the + /// variable could go negative. In that case, it seems better to at least + /// try to keep tracking it, rather than clamp or overflow it. 
Note that + /// get_current_logical_size() will clamp the returned value to zero if it's + /// negative, and log an error. Could set it permanently to zero or some + /// special value to indicate "broken" instead, but this will do for now. /// /// Note that we also expose a copy of this value as a prometheus metric, /// see `current_logical_size_gauge`. Use the `update_current_logical_size` @@ -122,6 +122,10 @@ impl CurrentLogicalSize { Self::Exact(_) => Accuracy::Exact, } } + + pub(crate) fn is_exact(&self) -> bool { + matches!(self, Self::Exact(_)) + } } impl LogicalSize { diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index a085154a5a..4a3a5c621b 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -2,13 +2,13 @@ //! To do so, a current implementation needs to do the following: //! //! * acknowledge the timelines that it needs to stream WAL into. -//! Pageserver is able to dynamically (un)load tenants on attach and detach, -//! hence WAL receiver needs to react on such events. +//! Pageserver is able to dynamically (un)load tenants on attach and detach, +//! hence WAL receiver needs to react on such events. //! //! * get a broker subscription, stream data from it to determine that a timeline needs WAL streaming. -//! For that, it watches specific keys in storage_broker and pulls the relevant data periodically. -//! The data is produced by safekeepers, that push it periodically and pull it to synchronize between each other. -//! Without this data, no WAL streaming is possible currently. +//! For that, it watches specific keys in storage_broker and pulls the relevant data periodically. +//! The data is produced by safekeepers, that push it periodically and pull it to synchronize between each other. +//! Without this data, no WAL streaming is possible currently. //! //! Only one active WAL streaming connection is allowed at a time. //! 
The connection is supposed to be updated periodically, based on safekeeper timeline data. diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 1d2ffec08f..de50f217d8 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -1118,7 +1118,7 @@ mod tests { #[tokio::test] async fn no_connection_no_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("no_connection_no_candidate")?; + let harness = TenantHarness::create("no_connection_no_candidate").await?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -1151,7 +1151,7 @@ mod tests { #[tokio::test] async fn connection_no_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("connection_no_candidate")?; + let harness = TenantHarness::create("connection_no_candidate").await?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -1216,7 +1216,7 @@ mod tests { #[tokio::test] async fn no_connection_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("no_connection_candidate")?; + let harness = TenantHarness::create("no_connection_candidate").await?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -1279,7 +1279,7 @@ mod tests { #[tokio::test] async fn candidate_with_many_connection_failures() -> anyhow::Result<()> { - let harness = TenantHarness::create("candidate_with_many_connection_failures")?; + let harness = TenantHarness::create("candidate_with_many_connection_failures").await?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -1319,7 +1319,7 @@ mod tests { #[tokio::test] async fn lsn_wal_over_threshold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate")?; + let harness = 
TenantHarness::create("lsn_wal_over_threshcurrent_candidate").await?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1385,7 +1385,8 @@ mod tests { #[tokio::test] async fn timeout_connection_threshold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("timeout_connection_threshold_current_candidate")?; + let harness = + TenantHarness::create("timeout_connection_threshold_current_candidate").await?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1448,7 +1449,7 @@ mod tests { #[tokio::test] async fn timeout_wal_over_threshold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("timeout_wal_over_threshold_current_candidate")?; + let harness = TenantHarness::create("timeout_wal_over_threshold_current_candidate").await?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let new_lsn = Lsn(100_100).align(); @@ -1550,7 +1551,7 @@ mod tests { // and pageserver should prefer to connect to it. 
let test_az = Some("test_az".to_owned()); - let harness = TenantHarness::create("switch_to_same_availability_zone")?; + let harness = TenantHarness::create("switch_to_same_availability_zone").await?; let mut state = dummy_state(&harness).await; state.conf.availability_zone.clone_from(&test_az); let current_lsn = Lsn(100_000).align(); diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index c6ee6b90c4..0114473eda 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -26,9 +26,9 @@ use tracing::{debug, error, info, trace, warn, Instrument}; use super::TaskStateUpdate; use crate::{ context::RequestContext, - metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST}, - task_mgr::TaskKind, - task_mgr::WALRECEIVER_RUNTIME, + metrics::{LIVE_CONNECTIONS, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST}, + pgdatadir_mapping::DatadirModification, + task_mgr::{TaskKind, WALRECEIVER_RUNTIME}, tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo}, walingest::WalIngest, walrecord::DecodedWALRecord, @@ -208,14 +208,9 @@ pub(super) async fn handle_walreceiver_connection( .instrument(tracing::info_span!("poller")), ); - // Immediately increment the gauge, then create a job to decrement it on task exit. - // One of the pros of `defer!` is that this will *most probably* - // get called, even in presence of panics. - let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["wal_receiver"]); - gauge.inc(); - scopeguard::defer! 
{ - gauge.dec(); - } + let _guard = LIVE_CONNECTIONS + .with_label_values(&["wal_receiver"]) + .guard(); let identify = identify_system(&replication_client).await?; info!("{identify:?}"); @@ -340,6 +335,9 @@ pub(super) async fn handle_walreceiver_connection( filtered_records += 1; } + // FIXME: this cannot be made pausable_failpoint without fixing the + // failpoint library; in tests, the added amount of debugging will cause us + // to timeout the tests. fail_point!("walreceiver-after-ingest"); last_rec_lsn = lsn; @@ -347,7 +345,10 @@ pub(super) async fn handle_walreceiver_connection( // Commit every ingest_batch_size records. Even if we filtered out // all records, we still need to call commit to advance the LSN. uncommitted_records += 1; - if uncommitted_records >= ingest_batch_size { + if uncommitted_records >= ingest_batch_size + || modification.approx_pending_bytes() + > DatadirModification::MAX_PENDING_BYTES + { WAL_INGEST .records_committed .inc_by(uncommitted_records - filtered_records); diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index 50c977a950..592f41cb21 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -130,7 +130,7 @@ pub(super) enum UploadQueueStopped { } #[derive(thiserror::Error, Debug)] -pub(crate) enum NotInitialized { +pub enum NotInitialized { #[error("queue is in state Uninitialized")] Uninitialized, #[error("queue is in state Stopped")] @@ -228,18 +228,20 @@ impl UploadQueue { Ok(self.initialized_mut().expect("we just set it")) } - pub(crate) fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> { + pub(crate) fn initialized_mut( + &mut self, + ) -> Result<&mut UploadQueueInitialized, NotInitialized> { use UploadQueue::*; match self { - Uninitialized => Err(NotInitialized::Uninitialized.into()), + Uninitialized => Err(NotInitialized::Uninitialized), Initialized(x) => { if x.shutting_down { - 
Err(NotInitialized::ShuttingDown.into()) + Err(NotInitialized::ShuttingDown) } else { Ok(x) } } - Stopped(_) => Err(NotInitialized::Stopped.into()), + Stopped(_) => Err(NotInitialized::Stopped), } } diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 6e825760e3..54a3ad789b 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -20,10 +20,13 @@ use std::num::NonZeroUsize; use bytes::BytesMut; use pageserver_api::key::Key; +use tokio::io::AsyncWriteExt; +use tokio_epoll_uring::BoundedBuf; use utils::lsn::Lsn; use utils::vec_map::VecMap; use crate::context::RequestContext; +use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK}; use crate::virtual_file::VirtualFile; #[derive(Copy, Clone, Debug, PartialEq, Eq)] @@ -67,7 +70,7 @@ impl VectoredRead { } } -#[derive(Eq, PartialEq)] +#[derive(Eq, PartialEq, Debug)] pub(crate) enum VectoredReadExtended { Yes, No, @@ -77,7 +80,7 @@ pub(crate) struct VectoredReadBuilder { start: u64, end: u64, blobs_at: VecMap, - max_read_size: usize, + max_read_size: Option, } impl VectoredReadBuilder { @@ -101,17 +104,22 @@ impl VectoredReadBuilder { start: start_offset, end: end_offset, blobs_at, - max_read_size, + max_read_size: Some(max_read_size), } } - /// Attempt to extend the current read with a new blob if the start /// offset matches with the current end of the vectored read /// and the resuting size is below the max read size pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { tracing::trace!(start, end, "trying to extend"); let size = (end - start) as usize; - if self.end == start && self.size() + size <= self.max_read_size { + if self.end == start && { + if let Some(max_read_size) = self.max_read_size { + self.size() + size <= max_read_size + } else { + true + } + } { self.end = end; self.blobs_at .append(start, meta) @@ -183,9 +191,9 @@ impl 
VectoredReadPlanner { /// /// The `flag` argument has two interesting values: /// * [`BlobFlag::ReplaceAll`]: The blob for this key should replace all existing blobs. - /// This is used for WAL records that `will_init`. + /// This is used for WAL records that `will_init`. /// * [`BlobFlag::Ignore`]: This blob should not be included in the read. This happens - /// if the blob is cached. + /// if the blob is cached. pub fn handle(&mut self, key: Key, lsn: Lsn, offset: u64, flag: BlobFlag) { // Implementation note: internally lag behind by one blob such that // we have a start and end offset when initialising [`VectoredRead`] @@ -295,10 +303,11 @@ impl<'a> VectoredBlobReader<'a> { read.size(), buf.capacity() ); - let buf = self + let mut buf = self .file - .read_exact_at_n(buf, read.start, read.size(), ctx) - .await?; + .read_exact_at(buf.slice(0..read.size()), read.start, ctx) + .await? + .into_inner(); let blobs_at = read.blobs_at.as_slice(); let start_offset = blobs_at.first().expect("VectoredRead is never empty").0; @@ -316,46 +325,189 @@ impl<'a> VectoredBlobReader<'a> { .chain(std::iter::once(None)), ); + // Some scratch space, put here for reusing the allocation + let mut decompressed_vec = Vec::new(); + for ((offset, meta), next) in pairs { let offset_in_buf = offset - start_offset; let first_len_byte = buf[offset_in_buf as usize]; - // Each blob is prefixed by a header containing it's size. + // Each blob is prefixed by a header containing its size and compression information. // Extract the size and skip that header to find the start of the data. // The size can be 1 or 4 bytes. The most significant bit is 0 in the // 1 byte case and 1 in the 4 byte case. 
- let (size_length, blob_size) = if first_len_byte < 0x80 { - (1, first_len_byte as u64) + let (size_length, blob_size, compression_bits) = if first_len_byte < 0x80 { + (1, first_len_byte as u64, BYTE_UNCOMPRESSED) } else { let mut blob_size_buf = [0u8; 4]; let offset_in_buf = offset_in_buf as usize; blob_size_buf.copy_from_slice(&buf[offset_in_buf..offset_in_buf + 4]); - blob_size_buf[0] &= 0x7f; - (4, u32::from_be_bytes(blob_size_buf) as u64) + blob_size_buf[0] &= !LEN_COMPRESSION_BIT_MASK; + + let compression_bits = first_len_byte & LEN_COMPRESSION_BIT_MASK; + ( + 4, + u32::from_be_bytes(blob_size_buf) as u64, + compression_bits, + ) }; - let start = offset_in_buf + size_length; - let end = match next { + let start_raw = offset_in_buf + size_length; + let end_raw = match next { Some((next_blob_start_offset, _)) => next_blob_start_offset - start_offset, - None => start + blob_size, + None => start_raw + blob_size, }; - - assert_eq!(end - start, blob_size); + assert_eq!(end_raw - start_raw, blob_size); + let (start, end); + if compression_bits == BYTE_UNCOMPRESSED { + start = start_raw as usize; + end = end_raw as usize; + } else if compression_bits == BYTE_ZSTD { + let mut decoder = + async_compression::tokio::write::ZstdDecoder::new(&mut decompressed_vec); + decoder + .write_all(&buf[start_raw as usize..end_raw as usize]) + .await?; + decoder.flush().await?; + start = buf.len(); + buf.extend_from_slice(&decompressed_vec); + end = buf.len(); + decompressed_vec.clear(); + } else { + let error = std::io::Error::new( + std::io::ErrorKind::InvalidData, + format!("invalid compression byte {compression_bits:x}"), + ); + return Err(error); + } metas.push(VectoredBlob { - start: start as usize, - end: end as usize, + start, + end, meta: *meta, - }) + }); } Ok(VectoredBlobsBuf { buf, blobs: metas }) } } +/// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. It provides a streaming API for +/// getting read blobs. 
It returns a batch when `handle` gets called and when the current key would just exceed the read_size and +/// max_cnt constraints. +pub struct StreamingVectoredReadPlanner { + read_builder: Option, + // Arguments for previous blob passed into [`StreamingVectoredReadPlanner::handle`] + prev: Option<(Key, Lsn, u64)>, + /// Max read size per batch. This is not a strict limit. If there are [0, 100) and [100, 200), while the `max_read_size` is 150, + /// we will produce a single batch instead of split them. + max_read_size: u64, + /// Max item count per batch + max_cnt: usize, + /// Size of the current batch + cnt: usize, +} + +impl StreamingVectoredReadPlanner { + pub fn new(max_read_size: u64, max_cnt: usize) -> Self { + assert!(max_cnt > 0); + assert!(max_read_size > 0); + Self { + read_builder: None, + prev: None, + max_cnt, + max_read_size, + cnt: 0, + } + } + + pub fn handle(&mut self, key: Key, lsn: Lsn, offset: u64) -> Option { + // Implementation note: internally lag behind by one blob such that + // we have a start and end offset when initialising [`VectoredRead`] + let (prev_key, prev_lsn, prev_offset) = match self.prev { + None => { + self.prev = Some((key, lsn, offset)); + return None; + } + Some(prev) => prev, + }; + + let res = self.add_blob(prev_key, prev_lsn, prev_offset, offset, false); + + self.prev = Some((key, lsn, offset)); + + res + } + + pub fn handle_range_end(&mut self, offset: u64) -> Option { + let res = if let Some((prev_key, prev_lsn, prev_offset)) = self.prev { + self.add_blob(prev_key, prev_lsn, prev_offset, offset, true) + } else { + None + }; + + self.prev = None; + + res + } + + fn add_blob( + &mut self, + key: Key, + lsn: Lsn, + start_offset: u64, + end_offset: u64, + is_last_blob_in_read: bool, + ) -> Option { + match &mut self.read_builder { + Some(read_builder) => { + let extended = read_builder.extend(start_offset, end_offset, BlobMeta { key, lsn }); + assert_eq!(extended, VectoredReadExtended::Yes); + } + None => { + 
self.read_builder = { + let mut blobs_at = VecMap::default(); + blobs_at + .append(start_offset, BlobMeta { key, lsn }) + .expect("First insertion always succeeds"); + + Some(VectoredReadBuilder { + start: start_offset, + end: end_offset, + blobs_at, + max_read_size: None, + }) + }; + } + } + let read_builder = self.read_builder.as_mut().unwrap(); + self.cnt += 1; + if is_last_blob_in_read + || read_builder.size() >= self.max_read_size as usize + || self.cnt >= self.max_cnt + { + let prev_read_builder = self.read_builder.take(); + self.cnt = 0; + + // `current_read_builder` is None in the first iteration + if let Some(read_builder) = prev_read_builder { + return Some(read_builder.build()); + } + } + None + } +} + #[cfg(test)] mod tests { + use anyhow::Error; + + use crate::context::DownloadBehavior; + use crate::page_cache::PAGE_SZ; + use crate::task_mgr::TaskKind; + + use super::super::blob_io::tests::{random_array, write_maybe_compressed}; use super::*; fn validate_read(read: &VectoredRead, offset_range: &[(Key, Lsn, u64, BlobFlag)]) { @@ -407,8 +559,11 @@ mod tests { planner.handle_range_end(652 * 1024); let reads = planner.finish(); + assert_eq!(reads.len(), 6); + // TODO: could remove zero reads to produce 5 reads here + for (idx, read) in reads.iter().enumerate() { validate_read(read, ranges[idx]); } @@ -446,4 +601,187 @@ mod tests { validate_read(read, ranges[idx]); } } + + #[test] + fn streaming_planner_max_read_size_test() { + let max_read_size = 128 * 1024; + let key = Key::MIN; + let lsn = Lsn(0); + + let blob_descriptions = vec![ + (key, lsn, 0, BlobFlag::None), + (key, lsn, 32 * 1024, BlobFlag::None), + (key, lsn, 96 * 1024, BlobFlag::None), + (key, lsn, 128 * 1024, BlobFlag::None), + (key, lsn, 198 * 1024, BlobFlag::None), + (key, lsn, 268 * 1024, BlobFlag::None), + (key, lsn, 396 * 1024, BlobFlag::None), + (key, lsn, 652 * 1024, BlobFlag::None), + ]; + + let ranges = [ + &blob_descriptions[0..3], + &blob_descriptions[3..5], + 
&blob_descriptions[5..6], + &blob_descriptions[6..7], + &blob_descriptions[7..], + ]; + + let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1000); + let mut reads = Vec::new(); + for (key, lsn, offset, _) in blob_descriptions.clone() { + reads.extend(planner.handle(key, lsn, offset)); + } + reads.extend(planner.handle_range_end(652 * 1024)); + + assert_eq!(reads.len(), ranges.len()); + + for (idx, read) in reads.iter().enumerate() { + validate_read(read, ranges[idx]); + } + } + + #[test] + fn streaming_planner_max_cnt_test() { + let max_read_size = 1024 * 1024; + let key = Key::MIN; + let lsn = Lsn(0); + + let blob_descriptions = vec![ + (key, lsn, 0, BlobFlag::None), + (key, lsn, 32 * 1024, BlobFlag::None), + (key, lsn, 96 * 1024, BlobFlag::None), + (key, lsn, 128 * 1024, BlobFlag::None), + (key, lsn, 198 * 1024, BlobFlag::None), + (key, lsn, 268 * 1024, BlobFlag::None), + (key, lsn, 396 * 1024, BlobFlag::None), + (key, lsn, 652 * 1024, BlobFlag::None), + ]; + + let ranges = [ + &blob_descriptions[0..2], + &blob_descriptions[2..4], + &blob_descriptions[4..6], + &blob_descriptions[6..], + ]; + + let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 2); + let mut reads = Vec::new(); + for (key, lsn, offset, _) in blob_descriptions.clone() { + reads.extend(planner.handle(key, lsn, offset)); + } + reads.extend(planner.handle_range_end(652 * 1024)); + + assert_eq!(reads.len(), ranges.len()); + + for (idx, read) in reads.iter().enumerate() { + validate_read(read, ranges[idx]); + } + } + + #[test] + fn streaming_planner_edge_test() { + let max_read_size = 1024 * 1024; + let key = Key::MIN; + let lsn = Lsn(0); + { + let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1); + let mut reads = Vec::new(); + reads.extend(planner.handle_range_end(652 * 1024)); + assert!(reads.is_empty()); + } + { + let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1); + let mut reads = Vec::new(); + reads.extend(planner.handle(key, 
lsn, 0)); + reads.extend(planner.handle_range_end(652 * 1024)); + assert_eq!(reads.len(), 1); + validate_read(&reads[0], &[(key, lsn, 0, BlobFlag::None)]); + } + { + let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1); + let mut reads = Vec::new(); + reads.extend(planner.handle(key, lsn, 0)); + reads.extend(planner.handle(key, lsn, 128 * 1024)); + reads.extend(planner.handle_range_end(652 * 1024)); + assert_eq!(reads.len(), 2); + validate_read(&reads[0], &[(key, lsn, 0, BlobFlag::None)]); + validate_read(&reads[1], &[(key, lsn, 128 * 1024, BlobFlag::None)]); + } + { + let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 2); + let mut reads = Vec::new(); + reads.extend(planner.handle(key, lsn, 0)); + reads.extend(planner.handle(key, lsn, 128 * 1024)); + reads.extend(planner.handle_range_end(652 * 1024)); + assert_eq!(reads.len(), 1); + validate_read( + &reads[0], + &[ + (key, lsn, 0, BlobFlag::None), + (key, lsn, 128 * 1024, BlobFlag::None), + ], + ); + } + } + + async fn round_trip_test_compressed(blobs: &[Vec], compression: bool) -> Result<(), Error> { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let (_temp_dir, pathbuf, offsets) = + write_maybe_compressed::(blobs, compression, &ctx).await?; + + let file = VirtualFile::open(&pathbuf, &ctx).await?; + let file_len = std::fs::metadata(&pathbuf)?.len(); + + // Multiply by two (compressed data might need more space), and add a few bytes for the header + let reserved_bytes = blobs.iter().map(|bl| bl.len()).max().unwrap() * 2 + 16; + let mut buf = BytesMut::with_capacity(reserved_bytes); + + let vectored_blob_reader = VectoredBlobReader::new(&file); + let meta = BlobMeta { + key: Key::MIN, + lsn: Lsn(0), + }; + + for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() { + let end = offsets.get(idx + 1).unwrap_or(&file_len); + if idx + 1 == offsets.len() { + continue; + } + let read_builder = VectoredReadBuilder::new(*offset, *end, meta, 
16 * 4096); + let read = read_builder.build(); + let result = vectored_blob_reader.read_blobs(&read, buf, &ctx).await?; + assert_eq!(result.blobs.len(), 1); + let read_blob = &result.blobs[0]; + let read_buf = &result.buf[read_blob.start..read_blob.end]; + assert_eq!(blob, read_buf, "mismatch for idx={idx} at offset={offset}"); + buf = result.buf; + } + Ok(()) + } + + #[tokio::test] + async fn test_really_big_array() -> Result<(), Error> { + let blobs = &[ + b"test".to_vec(), + random_array(10 * PAGE_SZ), + b"hello".to_vec(), + random_array(66 * PAGE_SZ), + vec![0xf3; 24 * PAGE_SZ], + b"foobar".to_vec(), + ]; + round_trip_test_compressed(blobs, false).await?; + round_trip_test_compressed(blobs, true).await?; + Ok(()) + } + + #[tokio::test] + async fn test_arrays_inc() -> Result<(), Error> { + let blobs = (0..PAGE_SZ / 8) + .map(|v| random_array(v * 16)) + .collect::>(); + round_trip_test_compressed(&blobs, false).await?; + round_trip_test_compressed(&blobs, true).await?; + Ok(()) + } } diff --git a/pageserver/src/trace.rs b/pageserver/src/trace.rs deleted file mode 100644 index 18ec269198..0000000000 --- a/pageserver/src/trace.rs +++ /dev/null @@ -1,36 +0,0 @@ -use bytes::Bytes; -use camino::Utf8PathBuf; -use std::{ - fs::{create_dir_all, File}, - io::{BufWriter, Write}, -}; - -pub struct Tracer { - writer: BufWriter, -} - -impl Drop for Tracer { - fn drop(&mut self) { - self.flush() - } -} - -impl Tracer { - pub fn new(path: Utf8PathBuf) -> Self { - let parent = path.parent().expect("failed to parse parent path"); - create_dir_all(parent).expect("failed to create trace dir"); - - let file = File::create(path).expect("failed to create trace file"); - Tracer { - writer: BufWriter::new(file), - } - } - - pub fn trace(&mut self, msg: &Bytes) { - self.writer.write_all(msg).expect("failed to write trace"); - } - - pub fn flush(&mut self) { - self.writer.flush().expect("failed to flush trace file"); - } -} diff --git a/pageserver/src/utilization.rs 
b/pageserver/src/utilization.rs index e6c835aa75..3c48c84598 100644 --- a/pageserver/src/utilization.rs +++ b/pageserver/src/utilization.rs @@ -5,12 +5,17 @@ use anyhow::Context; use std::path::Path; +use utils::serde_percent::Percent; use pageserver_api::models::PageserverUtilization; -pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result { - // TODO: currently the http api ratelimits this to 1Hz at most, which is probably good enough +use crate::{config::PageServerConf, tenant::mgr::TenantManager}; +pub(crate) fn regenerate( + conf: &PageServerConf, + tenants_path: &Path, + tenant_manager: &TenantManager, +) -> anyhow::Result { let statvfs = nix::sys::statvfs::statvfs(tenants_path) .map_err(std::io::Error::from) .context("statvfs tenants directory")?; @@ -34,16 +39,31 @@ pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result e.max_usage_pct, + None => Percent::new(100).unwrap(), + }; + + // Express a static value for how many shards we may schedule on one node + const MAX_SHARDS: u32 = 20000; + + let mut doc = PageserverUtilization { disk_usage_bytes: used, free_space_bytes: free, - // lower is better; start with a constant - // - // note that u64::MAX will be output as i64::MAX as u64, but that should not matter - utilization_score: u64::MAX, + disk_wanted_bytes, + disk_usable_pct, + shard_count, + max_shard_count: MAX_SHARDS, + utilization_score: 0, captured_at: utils::serde_system_time::SystemTime(captured_at), }; + doc.refresh_score(); + // TODO: make utilization_score into a metric Ok(doc) diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 04d9386fab..c0017280fd 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -13,10 +13,11 @@ use crate::context::RequestContext; use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC}; -use crate::page_cache::PageWriteGuard; +use crate::page_cache::{PageWriteGuard, PAGE_SZ}; use crate::tenant::TENANTS_SEGMENT_NAME; 
use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; +use owned_buffers_io::io_buf_ext::FullSlice; use pageserver_api::shard::TenantShardId; use std::fs::File; use std::io::{Error, ErrorKind, Seek, SeekFrom}; @@ -30,10 +31,12 @@ use tokio::time::Instant; pub use pageserver_api::models::virtual_file as api; pub(crate) mod io_engine; pub use io_engine::feature_test as io_engine_feature_test; +pub use io_engine::io_engine_for_bench; pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult; mod metadata; mod open_options; use self::owned_buffers_io::write::OwnedAsyncWriter; +pub(crate) use api::DirectIoMode; pub(crate) use io_engine::IoEngineKind; pub(crate) use metadata::Metadata; pub(crate) use open_options::*; @@ -48,6 +51,8 @@ pub(crate) mod owned_buffers_io { //! but for the time being we're proving out the primitives in the neon.git repo //! for faster iteration. + pub(crate) mod io_buf_ext; + pub(crate) mod slice; pub(crate) mod write; pub(crate) mod util { pub(crate) mod size_tracking_writer; @@ -143,16 +148,17 @@ struct SlotInner { /// Impl of [`tokio_epoll_uring::IoBuf`] and [`tokio_epoll_uring::IoBufMut`] for [`PageWriteGuard`]. struct PageWriteGuardBuf { page: PageWriteGuard<'static>, - init_up_to: usize, } // Safety: the [`PageWriteGuard`] gives us exclusive ownership of the page cache slot, // and the location remains stable even if [`Self`] or the [`PageWriteGuard`] is moved. +// Page cache pages are zero-initialized, so, wrt uninitialized memory we're good. +// (Page cache tracks separately whether the contents are valid, see `PageWriteGuard::mark_valid`.) 
unsafe impl tokio_epoll_uring::IoBuf for PageWriteGuardBuf { fn stable_ptr(&self) -> *const u8 { self.page.as_ptr() } fn bytes_init(&self) -> usize { - self.init_up_to + self.page.len() } fn bytes_total(&self) -> usize { self.page.len() @@ -166,8 +172,8 @@ unsafe impl tokio_epoll_uring::IoBufMut for PageWriteGuardBuf { } unsafe fn set_init(&mut self, pos: usize) { + // There shouldn't really be any reason to call this API since bytes_init() == bytes_total(). assert!(pos <= self.page.len()); - self.init_up_to = pos; } } @@ -585,37 +591,37 @@ impl VirtualFile { Ok(self.pos) } - pub async fn read_exact_at( + /// Read the file contents in range `offset..(offset + slice.bytes_total())` into `slice[0..slice.bytes_total()]`. + /// + /// The returned `Slice` is equivalent to the input `slice`, i.e., it's the same view into the same buffer. + pub async fn read_exact_at( &self, - buf: B, + slice: Slice, offset: u64, ctx: &RequestContext, - ) -> Result + ) -> Result, Error> where - B: IoBufMut + Send, + Buf: IoBufMut + Send, { - let (buf, res) = read_exact_at_impl(buf, offset, None, |buf, offset| { - self.read_at(buf, offset, ctx) - }) - .await; - res.map(|()| buf) - } + let assert_we_return_original_bounds = if cfg!(debug_assertions) { + Some((slice.stable_ptr() as usize, slice.bytes_total())) + } else { + None + }; - pub async fn read_exact_at_n( - &self, - buf: B, - offset: u64, - count: usize, - ctx: &RequestContext, - ) -> Result - where - B: IoBufMut + Send, - { - let (buf, res) = read_exact_at_impl(buf, offset, Some(count), |buf, offset| { - self.read_at(buf, offset, ctx) - }) - .await; - res.map(|()| buf) + let original_bounds = slice.bounds(); + let (buf, res) = + read_exact_at_impl(slice, offset, |buf, offset| self.read_at(buf, offset, ctx)).await; + let res = res.map(|_| buf.slice(original_bounds)); + + if let Some(original_bounds) = assert_we_return_original_bounds { + if let Ok(slice) = &res { + let returned_bounds = (slice.stable_ptr() as usize, 
slice.bytes_total()); + assert_eq!(original_bounds, returned_bounds); + } + } + + res } /// Like [`Self::read_exact_at`] but for [`PageWriteGuard`]. @@ -625,34 +631,32 @@ impl VirtualFile { offset: u64, ctx: &RequestContext, ) -> Result, Error> { - let buf = PageWriteGuardBuf { - page, - init_up_to: 0, - }; - let res = self.read_exact_at(buf, offset, ctx).await; - res.map(|PageWriteGuardBuf { page, .. }| page) - .map_err(|e| Error::new(ErrorKind::Other, e)) + let buf = PageWriteGuardBuf { page }.slice_full(); + debug_assert_eq!(buf.bytes_total(), PAGE_SZ); + self.read_exact_at(buf, offset, ctx) + .await + .map(|slice| slice.into_inner().page) } // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235 - pub async fn write_all_at, Buf: IoBuf + Send>( + pub async fn write_all_at( &self, - buf: B, + buf: FullSlice, mut offset: u64, ctx: &RequestContext, - ) -> (B::Buf, Result<(), Error>) { - let buf_len = buf.bytes_init(); - if buf_len == 0 { - return (Slice::into_inner(buf.slice_full()), Ok(())); - } - let mut buf = buf.slice(0..buf_len); + ) -> (FullSlice, Result<(), Error>) { + let buf = buf.into_raw_slice(); + let bounds = buf.bounds(); + let restore = + |buf: Slice<_>| FullSlice::must_new(Slice::from_buf_bounds(buf.into_inner(), bounds)); + let mut buf = buf; while !buf.is_empty() { - let res; - (buf, res) = self.write_at(buf, offset, ctx).await; + let (tmp, res) = self.write_at(FullSlice::must_new(buf), offset, ctx).await; + buf = tmp.into_raw_slice(); match res { Ok(0) => { return ( - Slice::into_inner(buf), + restore(buf), Err(Error::new( std::io::ErrorKind::WriteZero, "failed to write whole buffer", @@ -664,33 +668,33 @@ impl VirtualFile { offset += n as u64; } Err(e) if e.kind() == std::io::ErrorKind::Interrupted => {} - Err(e) => return (Slice::into_inner(buf), Err(e)), + Err(e) => return (restore(buf), Err(e)), } } - (Slice::into_inner(buf), Ok(())) + (restore(buf), Ok(())) } - /// Writes `buf.slice(0..buf.bytes_init())`. 
- /// Returns the IoBuf that is underlying the BoundedBuf `buf`. - /// I.e., the returned value's `bytes_init()` method returns something different than the `bytes_init()` that was passed in. - /// It's quite brittle and easy to mis-use, so, we return the size in the Ok() variant. - pub async fn write_all, Buf: IoBuf + Send>( + /// Writes `buf` to the file at the current offset. + /// + /// Panics if there is an uninitialized range in `buf`, as that is most likely a bug in the caller. + pub async fn write_all( &mut self, - buf: B, + buf: FullSlice, ctx: &RequestContext, - ) -> (B::Buf, Result) { - let nbytes = buf.bytes_init(); - if nbytes == 0 { - return (Slice::into_inner(buf.slice_full()), Ok(0)); - } - let mut buf = buf.slice(0..nbytes); + ) -> (FullSlice, Result) { + let buf = buf.into_raw_slice(); + let bounds = buf.bounds(); + let restore = + |buf: Slice<_>| FullSlice::must_new(Slice::from_buf_bounds(buf.into_inner(), bounds)); + let nbytes = buf.len(); + let mut buf = buf; while !buf.is_empty() { - let res; - (buf, res) = self.write(buf, ctx).await; + let (tmp, res) = self.write(FullSlice::must_new(buf), ctx).await; + buf = tmp.into_raw_slice(); match res { Ok(0) => { return ( - Slice::into_inner(buf), + restore(buf), Err(Error::new( std::io::ErrorKind::WriteZero, "failed to write whole buffer", @@ -701,17 +705,17 @@ impl VirtualFile { buf = buf.slice(n..); } Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} - Err(e) => return (Slice::into_inner(buf), Err(e)), + Err(e) => return (restore(buf), Err(e)), } } - (Slice::into_inner(buf), Ok(nbytes)) + (restore(buf), Ok(nbytes)) } async fn write( &mut self, - buf: Slice, + buf: FullSlice, ctx: &RequestContext, - ) -> (Slice, Result) { + ) -> (FullSlice, Result) { let pos = self.pos; let (buf, res) = self.write_at(buf, pos, ctx).await; let n = match res { @@ -722,14 +726,14 @@ impl VirtualFile { (buf, Ok(n)) } - pub(crate) async fn read_at( + pub(crate) async fn read_at( &self, - buf: B, + buf: 
tokio_epoll_uring::Slice, offset: u64, _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */ - ) -> (B, Result) + ) -> (tokio_epoll_uring::Slice, Result) where - B: tokio_epoll_uring::BoundedBufMut + Send, + Buf: tokio_epoll_uring::IoBufMut + Send, { let file_guard = match self.lock_file().await { Ok(file_guard) => file_guard, @@ -752,12 +756,24 @@ impl VirtualFile { }) } + /// The function aborts the process if the error is fatal. async fn write_at( &self, - buf: Slice, + buf: FullSlice, offset: u64, _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */ - ) -> (Slice, Result) { + ) -> (FullSlice, Result) { + let (slice, result) = self.write_at_inner(buf, offset, _ctx).await; + let result = result.maybe_fatal_err("write_at"); + (slice, result) + } + + async fn write_at_inner( + &self, + buf: FullSlice, + offset: u64, + _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */ + ) -> (FullSlice, Result) { let file_guard = match self.lock_file().await { Ok(file_guard) => file_guard, Err(e) => return (buf, Err(e)), @@ -781,26 +797,16 @@ impl VirtualFile { } // Adapted from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135 -pub async fn read_exact_at_impl( - buf: B, +pub async fn read_exact_at_impl( + mut buf: tokio_epoll_uring::Slice, mut offset: u64, - count: Option, mut read_at: F, -) -> (B, std::io::Result<()>) +) -> (Buf, std::io::Result<()>) where - B: IoBufMut + Send, - F: FnMut(tokio_epoll_uring::Slice, u64) -> Fut, - Fut: std::future::Future, std::io::Result)>, + Buf: IoBufMut + Send, + F: FnMut(tokio_epoll_uring::Slice, u64) -> Fut, + Fut: std::future::Future, std::io::Result)>, { - let mut buf: tokio_epoll_uring::Slice = match count { - Some(count) => { - assert!(count <= buf.bytes_total()); - assert!(count > 0); - buf.slice(..count) // may include uninitialized memory - } - None => buf.slice_full(), // 
includes all the uninitialized memory - }; - while buf.bytes_total() != 0 { let res; (buf, res) = read_at(buf, offset).await; @@ -882,7 +888,7 @@ mod test_read_exact_at_impl { #[tokio::test] async fn test_basic() { - let buf = Vec::with_capacity(5); + let buf = Vec::with_capacity(5).slice_full(); let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { expectations: VecDeque::from(vec![Expectation { offset: 0, @@ -890,7 +896,7 @@ mod test_read_exact_at_impl { result: Ok(vec![b'a', b'b', b'c', b'd', b'e']), }]), })); - let (buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| { + let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { let mock_read_at = Arc::clone(&mock_read_at); async move { mock_read_at.lock().await.read_at(buf, offset).await } }) @@ -899,33 +905,13 @@ mod test_read_exact_at_impl { assert_eq!(buf, vec![b'a', b'b', b'c', b'd', b'e']); } - #[tokio::test] - async fn test_with_count() { - let buf = Vec::with_capacity(5); - let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { - expectations: VecDeque::from(vec![Expectation { - offset: 0, - bytes_total: 3, - result: Ok(vec![b'a', b'b', b'c']), - }]), - })); - - let (buf, res) = read_exact_at_impl(buf, 0, Some(3), |buf, offset| { - let mock_read_at = Arc::clone(&mock_read_at); - async move { mock_read_at.lock().await.read_at(buf, offset).await } - }) - .await; - assert!(res.is_ok()); - assert_eq!(buf, vec![b'a', b'b', b'c']); - } - #[tokio::test] async fn test_empty_buf_issues_no_syscall() { - let buf = Vec::new(); + let buf = Vec::new().slice_full(); let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { expectations: VecDeque::new(), })); - let (_buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| { + let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { let mock_read_at = Arc::clone(&mock_read_at); async move { mock_read_at.lock().await.read_at(buf, offset).await } }) @@ -935,7 +921,7 @@ mod test_read_exact_at_impl { #[tokio::test] async fn 
test_two_read_at_calls_needed_until_buf_filled() { - let buf = Vec::with_capacity(4); + let buf = Vec::with_capacity(4).slice_full(); let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { expectations: VecDeque::from(vec![ Expectation { @@ -950,7 +936,7 @@ mod test_read_exact_at_impl { }, ]), })); - let (buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| { + let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { let mock_read_at = Arc::clone(&mock_read_at); async move { mock_read_at.lock().await.read_at(buf, offset).await } }) @@ -961,7 +947,7 @@ mod test_read_exact_at_impl { #[tokio::test] async fn test_eof_before_buffer_full() { - let buf = Vec::with_capacity(3); + let buf = Vec::with_capacity(3).slice_full(); let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { expectations: VecDeque::from(vec![ Expectation { @@ -981,7 +967,7 @@ mod test_read_exact_at_impl { }, ]), })); - let (_buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| { + let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { let mock_read_at = Arc::clone(&mock_read_at); async move { mock_read_at.lock().await.read_at(buf, offset).await } }) @@ -1051,27 +1037,29 @@ impl VirtualFile { ctx: &RequestContext, ) -> Result, std::io::Error> { use crate::page_cache::PAGE_SZ; - let buf = vec![0; PAGE_SZ]; - let buf = self - .read_exact_at(buf, blknum as u64 * (PAGE_SZ as u64), ctx) + let slice = Vec::with_capacity(PAGE_SZ).slice_full(); + assert_eq!(slice.bytes_total(), PAGE_SZ); + let slice = self + .read_exact_at(slice, blknum as u64 * (PAGE_SZ as u64), ctx) .await?; - Ok(crate::tenant::block_io::BlockLease::Vec(buf)) + Ok(crate::tenant::block_io::BlockLease::Vec(slice.into_inner())) } async fn read_to_end(&mut self, buf: &mut Vec, ctx: &RequestContext) -> Result<(), Error> { let mut tmp = vec![0; 128]; loop { - let res; - (tmp, res) = self.read_at(tmp, self.pos, ctx).await; + let slice = tmp.slice(..128); + let (slice, res) = self.read_at(slice, self.pos, 
ctx).await; match res { Ok(0) => return Ok(()), Ok(n) => { self.pos += n as u64; - buf.extend_from_slice(&tmp[..n]); + buf.extend_from_slice(&slice[..n]); } Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} Err(e) => return Err(e), } + tmp = slice.into_inner(); } } } @@ -1119,11 +1107,11 @@ impl Drop for VirtualFile { impl OwnedAsyncWriter for VirtualFile { #[inline(always)] - async fn write_all, Buf: IoBuf + Send>( + async fn write_all( &mut self, - buf: B, + buf: FullSlice, ctx: &RequestContext, - ) -> std::io::Result<(usize, B::Buf)> { + ) -> std::io::Result<(usize, FullSlice)> { let (buf, res) = VirtualFile::write_all(self, buf, ctx).await; res.map(move |v| (v, buf)) } @@ -1185,6 +1173,8 @@ mod tests { use crate::task_mgr::TaskKind; use super::*; + use owned_buffers_io::io_buf_ext::IoBufExt; + use owned_buffers_io::slice::SliceMutExt; use rand::seq::SliceRandom; use rand::thread_rng; use rand::Rng; @@ -1206,18 +1196,21 @@ mod tests { impl MaybeVirtualFile { async fn read_exact_at( &self, - mut buf: Vec, + mut slice: tokio_epoll_uring::Slice>, offset: u64, ctx: &RequestContext, - ) -> Result, Error> { + ) -> Result>, Error> { match self { - MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(buf, offset, ctx).await, - MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf), + MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(slice, offset, ctx).await, + MaybeVirtualFile::File(file) => { + let rust_slice: &mut [u8] = slice.as_mut_rust_slice_full_zeroed(); + file.read_exact_at(rust_slice, offset).map(|()| slice) + } } } - async fn write_all_at, Buf: IoBuf + Send>( + async fn write_all_at( &self, - buf: B, + buf: FullSlice, offset: u64, ctx: &RequestContext, ) -> Result<(), Error> { @@ -1226,13 +1219,7 @@ mod tests { let (_buf, res) = file.write_all_at(buf, offset, ctx).await; res } - MaybeVirtualFile::File(file) => { - let buf_len = buf.bytes_init(); - if buf_len == 0 { - return Ok(()); - } - 
file.write_all_at(&buf.slice(0..buf_len), offset) - } + MaybeVirtualFile::File(file) => file.write_all_at(&buf[..], offset), } } async fn seek(&mut self, pos: SeekFrom) -> Result { @@ -1241,9 +1228,9 @@ mod tests { MaybeVirtualFile::File(file) => file.seek(pos), } } - async fn write_all, Buf: IoBuf + Send>( + async fn write_all( &mut self, - buf: B, + buf: FullSlice, ctx: &RequestContext, ) -> Result<(), Error> { match self { @@ -1251,13 +1238,7 @@ mod tests { let (_buf, res) = file.write_all(buf, ctx).await; res.map(|_| ()) } - MaybeVirtualFile::File(file) => { - let buf_len = buf.bytes_init(); - if buf_len == 0 { - return Ok(()); - } - file.write_all(&buf.slice(0..buf_len)) - } + MaybeVirtualFile::File(file) => file.write_all(&buf[..]), } } @@ -1286,9 +1267,12 @@ mod tests { len: usize, ctx: &RequestContext, ) -> Result { - let buf = vec![0; len]; - let buf = self.read_exact_at(buf, pos, ctx).await?; - Ok(String::from_utf8(buf).unwrap()) + let slice = Vec::with_capacity(len).slice_full(); + assert_eq!(slice.bytes_total(), len); + let slice = self.read_exact_at(slice, pos, ctx).await?; + let vec = slice.into_inner(); + assert_eq!(vec.len(), len); + Ok(String::from_utf8(vec).unwrap()) } } @@ -1366,7 +1350,9 @@ mod tests { &ctx, ) .await?; - file_a.write_all(b"foobar".to_vec(), &ctx).await?; + file_a + .write_all(b"foobar".to_vec().slice_len(), &ctx) + .await?; // cannot read from a file opened in write-only mode let _ = file_a.read_string(&ctx).await.unwrap_err(); @@ -1375,7 +1361,10 @@ mod tests { let mut file_a = A::open(path_a, OpenOptions::new().read(true).to_owned(), &ctx).await?; // cannot write to a file opened in read-only mode - let _ = file_a.write_all(b"bar".to_vec(), &ctx).await.unwrap_err(); + let _ = file_a + .write_all(b"bar".to_vec().slice_len(), &ctx) + .await + .unwrap_err(); // Try simple read assert_eq!("foobar", file_a.read_string(&ctx).await?); @@ -1418,8 +1407,12 @@ mod tests { &ctx, ) .await?; - file_b.write_all_at(b"BAR".to_vec(), 3, 
&ctx).await?; - file_b.write_all_at(b"FOO".to_vec(), 0, &ctx).await?; + file_b + .write_all_at(b"BAR".to_vec().slice_len(), 3, &ctx) + .await?; + file_b + .write_all_at(b"FOO".to_vec().slice_len(), 0, &ctx) + .await?; assert_eq!(file_b.read_string_at(2, 3, &ctx).await?, "OBA"); @@ -1507,7 +1500,11 @@ mod tests { let mut rng = rand::rngs::OsRng; for _ in 1..1000 { let f = &files[rng.gen_range(0..files.len())]; - buf = f.read_exact_at(buf, 0, &ctx).await.unwrap(); + buf = f + .read_exact_at(buf.slice_full(), 0, &ctx) + .await + .unwrap() + .into_inner(); assert!(buf == SAMPLE); } }); diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index 7a27be2ca1..faef1ba9ff 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -12,7 +12,7 @@ #[cfg(target_os = "linux")] pub(super) mod tokio_epoll_uring_ext; -use tokio_epoll_uring::{IoBuf, Slice}; +use tokio_epoll_uring::IoBuf; use tracing::Instrument; pub(crate) use super::api::IoEngineKind; @@ -107,7 +107,10 @@ use std::{ sync::atomic::{AtomicU8, Ordering}, }; -use super::{FileGuard, Metadata}; +use super::{ + owned_buffers_io::{io_buf_ext::FullSlice, slice::SliceMutExt}, + FileGuard, Metadata, +}; #[cfg(target_os = "linux")] fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error) -> std::io::Error { @@ -120,38 +123,29 @@ fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error) -> std: } impl IoEngine { - pub(super) async fn read_at( + pub(super) async fn read_at( &self, file_guard: FileGuard, offset: u64, - mut buf: B, - ) -> ((FileGuard, B), std::io::Result) + mut slice: tokio_epoll_uring::Slice, + ) -> ( + (FileGuard, tokio_epoll_uring::Slice), + std::io::Result, + ) where - B: tokio_epoll_uring::BoundedBufMut + Send, + Buf: tokio_epoll_uring::IoBufMut + Send, { match self { IoEngine::NotSet => panic!("not initialized"), IoEngine::StdFs => { - // SAFETY: `dst` only lives at most as long as this match arm, during which buf remains 
valid memory. - let dst = unsafe { - std::slice::from_raw_parts_mut(buf.stable_mut_ptr(), buf.bytes_total()) - }; - let res = file_guard.with_std_file(|std_file| std_file.read_at(dst, offset)); - if let Ok(nbytes) = &res { - assert!(*nbytes <= buf.bytes_total()); - // SAFETY: see above assertion - unsafe { - buf.set_init(*nbytes); - } - } - #[allow(dropping_references)] - drop(dst); - ((file_guard, buf), res) + let rust_slice = slice.as_mut_rust_slice_full_zeroed(); + let res = file_guard.with_std_file(|std_file| std_file.read_at(rust_slice, offset)); + ((file_guard, slice), res) } #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = tokio_epoll_uring_ext::thread_local_system().await; - let (resources, res) = system.read(file_guard, offset, buf).await; + let (resources, res) = system.read(file_guard, offset, slice).await; (resources, res.map_err(epoll_uring_error_to_std)) } } @@ -215,8 +209,8 @@ impl IoEngine { &self, file_guard: FileGuard, offset: u64, - buf: Slice, - ) -> ((FileGuard, Slice), std::io::Result) { + buf: FullSlice, + ) -> ((FileGuard, FullSlice), std::io::Result) { match self { IoEngine::NotSet => panic!("not initialized"), IoEngine::StdFs => { @@ -226,8 +220,12 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = tokio_epoll_uring_ext::thread_local_system().await; - let (resources, res) = system.write(file_guard, offset, buf).await; - (resources, res.map_err(epoll_uring_error_to_std)) + let ((file_guard, slice), res) = + system.write(file_guard, offset, buf.into_raw_slice()).await; + ( + (file_guard, FullSlice::must_new(slice)), + res.map_err(epoll_uring_error_to_std), + ) } } } @@ -337,3 +335,29 @@ pub fn feature_test() -> anyhow::Result { .join() .unwrap() } + +/// For use in benchmark binaries only. 
+/// +/// Benchmarks which initialize `virtual_file` need to know what engine to use, but we also +/// don't want to silently fall back to slower I/O engines in a benchmark: this could waste +/// developer time trying to figure out why it's slow. +/// +/// In practice, this method will either return IoEngineKind::TokioEpollUring, or panic. +pub fn io_engine_for_bench() -> IoEngineKind { + #[cfg(not(target_os = "linux"))] + { + panic!("This benchmark does I/O and can only give a representative result on Linux"); + } + #[cfg(target_os = "linux")] + { + match feature_test().unwrap() { + FeatureTestResult::PlatformPreferred(engine) => engine, + FeatureTestResult::Worse { + engine: _engine, + remark, + } => { + panic!("This benchmark does I/O can requires the preferred I/O engine: {remark}"); + } + } + } +} diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs new file mode 100644 index 0000000000..7c773b6b21 --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs @@ -0,0 +1,78 @@ +//! See [`FullSlice`]. + +use bytes::{Bytes, BytesMut}; +use std::ops::{Deref, Range}; +use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; + +/// The true owned equivalent for Rust [`slice`]. Use this for the write path. +/// +/// Unlike [`tokio_epoll_uring::Slice`], which we unfortunately inherited from `tokio-uring`, +/// [`FullSlice`] is guaranteed to have all its bytes initialized. This means that +/// [`>::len`] is equal to [`Slice::bytes_init`] and [`Slice::bytes_total`]. 
+/// +pub struct FullSlice { + slice: Slice, +} + +impl FullSlice +where + B: IoBuf, +{ + pub(crate) fn must_new(slice: Slice) -> Self { + assert_eq!(slice.bytes_init(), slice.bytes_total()); + FullSlice { slice } + } + pub(crate) fn into_raw_slice(self) -> Slice { + let FullSlice { slice: s } = self; + s + } +} + +impl Deref for FullSlice +where + B: IoBuf, +{ + type Target = [u8]; + + fn deref(&self) -> &[u8] { + let rust_slice = &self.slice[..]; + assert_eq!(rust_slice.len(), self.slice.bytes_init()); + assert_eq!(rust_slice.len(), self.slice.bytes_total()); + rust_slice + } +} + +pub(crate) trait IoBufExt { + /// Get a [`FullSlice`] for the entire buffer, i.e., `self[..]` or `self[0..self.len()]`. + fn slice_len(self) -> FullSlice + where + Self: Sized; +} + +macro_rules! impl_io_buf_ext { + ($T:ty) => { + impl IoBufExt for $T { + #[inline(always)] + fn slice_len(self) -> FullSlice { + let len = self.len(); + let s = if len == 0 { + // `BoundedBuf::slice(0..len)` or `BoundedBuf::slice(..)` has an incorrect assertion, + // causing a panic if len == 0. + // The Slice::from_buf_bounds has the correct assertion (<= instead of <). 
+ // => https://github.com/neondatabase/tokio-epoll-uring/issues/46 + let slice = self.slice_full(); + let mut bounds: Range<_> = slice.bounds(); + bounds.end = bounds.start; + Slice::from_buf_bounds(slice.into_inner(), bounds) + } else { + self.slice(0..len) + }; + FullSlice::must_new(s) + } + } + }; +} + +impl_io_buf_ext!(Bytes); +impl_io_buf_ext!(BytesMut); +impl_io_buf_ext!(Vec); diff --git a/pageserver/src/virtual_file/owned_buffers_io/slice.rs b/pageserver/src/virtual_file/owned_buffers_io/slice.rs new file mode 100644 index 0000000000..6100593663 --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/slice.rs @@ -0,0 +1,121 @@ +use tokio_epoll_uring::BoundedBuf; +use tokio_epoll_uring::BoundedBufMut; +use tokio_epoll_uring::IoBufMut; +use tokio_epoll_uring::Slice; + +pub(crate) trait SliceMutExt { + /// Get a `&mut[0..self.bytes_total()`] slice, for when you need to do borrow-based IO. + /// + /// See the test case `test_slice_full_zeroed` for the difference to just doing `&slice[..]` + fn as_mut_rust_slice_full_zeroed(&mut self) -> &mut [u8]; +} + +impl SliceMutExt for Slice +where + B: IoBufMut, +{ + #[inline(always)] + fn as_mut_rust_slice_full_zeroed(&mut self) -> &mut [u8] { + // zero-initialize the uninitialized parts of the buffer so we can create a Rust slice + // + // SAFETY: we own `slice`, don't write outside the bounds + unsafe { + let to_init = self.bytes_total() - self.bytes_init(); + self.stable_mut_ptr() + .add(self.bytes_init()) + .write_bytes(0, to_init); + self.set_init(self.bytes_total()); + }; + let bytes_total = self.bytes_total(); + &mut self[0..bytes_total] + } +} + +#[cfg(test)] +mod tests { + use std::io::Read; + + use super::*; + use bytes::Buf; + use tokio_epoll_uring::Slice; + + #[test] + fn test_slice_full_zeroed() { + let make_fake_file = || bytes::BytesMut::from(&b"12345"[..]).reader(); + + // before we start the test, let's make sure we have a shared understanding of what slice_full does + { + let buf = 
Vec::with_capacity(3); + let slice: Slice<_> = buf.slice_full(); + assert_eq!(slice.bytes_init(), 0); + assert_eq!(slice.bytes_total(), 3); + let rust_slice = &slice[..]; + assert_eq!( + rust_slice.len(), + 0, + "Slice only derefs to a &[u8] of the initialized part" + ); + } + + // and also let's establish a shared understanding of .slice() + { + let buf = Vec::with_capacity(3); + let slice: Slice<_> = buf.slice(0..2); + assert_eq!(slice.bytes_init(), 0); + assert_eq!(slice.bytes_total(), 2); + let rust_slice = &slice[..]; + assert_eq!( + rust_slice.len(), + 0, + "Slice only derefs to a &[u8] of the initialized part" + ); + } + + // the above leads to the easy mistake of using slice[..] for borrow-based IO like so: + { + let buf = Vec::with_capacity(3); + let mut slice: Slice<_> = buf.slice_full(); + assert_eq!(slice[..].len(), 0); + let mut file = make_fake_file(); + file.read_exact(&mut slice[..]).unwrap(); // one might think this reads 3 bytes but it reads 0 + assert_eq!(&slice[..] as &[u8], &[][..] as &[u8]); + } + + // With owned buffers IO like with VirtualFile, you could totally + // pass in a `Slice` with bytes_init()=0 but bytes_total()=5 + // and it will read 5 bytes into the slice, and return a slice that has bytes_init()=5. + { + // TODO: demo + } + + // + // Ok, now that we have a shared understanding let's demo how to use the extension trait. + // + + // slice_full() + { + let buf = Vec::with_capacity(3); + let mut slice: Slice<_> = buf.slice_full(); + let rust_slice = slice.as_mut_rust_slice_full_zeroed(); + assert_eq!(rust_slice.len(), 3); + assert_eq!(rust_slice, &[0, 0, 0]); + let mut file = make_fake_file(); + file.read_exact(rust_slice).unwrap(); + assert_eq!(rust_slice, b"123"); + assert_eq!(&slice[..], b"123"); + } + + // .slice(..) 
+ { + let buf = Vec::with_capacity(3); + let mut slice: Slice<_> = buf.slice(0..2); + let rust_slice = slice.as_mut_rust_slice_full_zeroed(); + assert_eq!(rust_slice.len(), 2); + assert_eq!(rust_slice, &[0, 0]); + let mut file = make_fake_file(); + file.read_exact(rust_slice).unwrap(); + assert_eq!(rust_slice, b"12"); + assert_eq!(&slice[..], b"12"); + } + } +} diff --git a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs index 55b1d0b46b..efcb61ba65 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs @@ -1,5 +1,8 @@ -use crate::{context::RequestContext, virtual_file::owned_buffers_io::write::OwnedAsyncWriter}; -use tokio_epoll_uring::{BoundedBuf, IoBuf}; +use crate::{ + context::RequestContext, + virtual_file::owned_buffers_io::{io_buf_ext::FullSlice, write::OwnedAsyncWriter}, +}; +use tokio_epoll_uring::IoBuf; pub struct Writer { dst: W, @@ -35,11 +38,11 @@ where W: OwnedAsyncWriter, { #[inline(always)] - async fn write_all, Buf: IoBuf + Send>( + async fn write_all( &mut self, - buf: B, + buf: FullSlice, ctx: &RequestContext, - ) -> std::io::Result<(usize, B::Buf)> { + ) -> std::io::Result<(usize, FullSlice)> { let (nwritten, buf) = self.dst.write_all(buf, ctx).await?; self.bytes_amount += u64::try_from(nwritten).unwrap(); Ok((nwritten, buf)) diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 885a9221c5..f8f37b17e3 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -1,16 +1,18 @@ use bytes::BytesMut; -use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; +use tokio_epoll_uring::IoBuf; use crate::context::RequestContext; +use super::io_buf_ext::{FullSlice, IoBufExt}; + /// A trait for doing 
owned-buffer write IO. /// Think [`tokio::io::AsyncWrite`] but with owned buffers. pub trait OwnedAsyncWriter { - async fn write_all, Buf: IoBuf + Send>( + async fn write_all( &mut self, - buf: B, + buf: FullSlice, ctx: &RequestContext, - ) -> std::io::Result<(usize, B::Buf)>; + ) -> std::io::Result<(usize, FullSlice)>; } /// A wrapper aorund an [`OwnedAsyncWriter`] that uses a [`Buffer`] to batch @@ -33,6 +35,7 @@ pub struct BufferedWriter { /// invariant: always remains Some(buf) except /// - while IO is ongoing => goes back to Some() once the IO completed successfully /// - after an IO error => stays `None` forever + /// /// In these exceptional cases, it's `None`. buf: Option, } @@ -78,9 +81,11 @@ where #[cfg_attr(target_os = "macos", allow(dead_code))] pub async fn write_buffered( &mut self, - chunk: Slice, + chunk: FullSlice, ctx: &RequestContext, - ) -> std::io::Result<(usize, S)> { + ) -> std::io::Result<(usize, FullSlice)> { + let chunk = chunk.into_raw_slice(); + let chunk_len = chunk.len(); // avoid memcpy for the middle of the chunk if chunk.len() >= self.buf().cap() { @@ -93,7 +98,10 @@ where .pending(), 0 ); - let (nwritten, chunk) = self.writer.write_all(chunk, ctx).await?; + let (nwritten, chunk) = self + .writer + .write_all(FullSlice::must_new(chunk), ctx) + .await?; assert_eq!(nwritten, chunk_len); return Ok((nwritten, chunk)); } @@ -113,7 +121,7 @@ where } } assert!(slice.is_empty(), "by now we should have drained the chunk"); - Ok((chunk_len, chunk.into_inner())) + Ok((chunk_len, FullSlice::must_new(chunk))) } /// Strictly less performant variant of [`Self::write_buffered`] that allows writing borrowed data. 
@@ -149,9 +157,12 @@ where self.buf = Some(buf); return Ok(()); } - let (nwritten, io_buf) = self.writer.write_all(buf.flush(), ctx).await?; + let slice = buf.flush(); + let (nwritten, slice) = self.writer.write_all(slice, ctx).await?; assert_eq!(nwritten, buf_len); - self.buf = Some(Buffer::reuse_after_flush(io_buf)); + self.buf = Some(Buffer::reuse_after_flush( + slice.into_raw_slice().into_inner(), + )); Ok(()) } } @@ -171,9 +182,9 @@ pub trait Buffer { /// Number of bytes in the buffer. fn pending(&self) -> usize; - /// Turns `self` into a [`tokio_epoll_uring::Slice`] of the pending data + /// Turns `self` into a [`FullSlice`] of the pending data /// so we can use [`tokio_epoll_uring`] to write it to disk. - fn flush(self) -> Slice; + fn flush(self) -> FullSlice; /// After the write to disk is done and we have gotten back the slice, /// [`BufferedWriter`] uses this method to re-use the io buffer. @@ -197,12 +208,8 @@ impl Buffer for BytesMut { self.len() } - fn flush(self) -> Slice { - if self.is_empty() { - return self.slice_full(); - } - let len = self.len(); - self.slice(0..len) + fn flush(self) -> FullSlice { + self.slice_len() } fn reuse_after_flush(mut iobuf: BytesMut) -> Self { @@ -212,18 +219,13 @@ impl Buffer for BytesMut { } impl OwnedAsyncWriter for Vec { - async fn write_all, Buf: IoBuf + Send>( + async fn write_all( &mut self, - buf: B, + buf: FullSlice, _: &RequestContext, - ) -> std::io::Result<(usize, B::Buf)> { - let nbytes = buf.bytes_init(); - if nbytes == 0 { - return Ok((0, Slice::into_inner(buf.slice_full()))); - } - let buf = buf.slice(0..nbytes); + ) -> std::io::Result<(usize, FullSlice)> { self.extend_from_slice(&buf[..]); - Ok((buf.len(), Slice::into_inner(buf))) + Ok((buf.len(), buf)) } } @@ -240,19 +242,13 @@ mod tests { writes: Vec>, } impl OwnedAsyncWriter for RecorderWriter { - async fn write_all, Buf: IoBuf + Send>( + async fn write_all( &mut self, - buf: B, + buf: FullSlice, _: &RequestContext, - ) -> std::io::Result<(usize, 
B::Buf)> { - let nbytes = buf.bytes_init(); - if nbytes == 0 { - self.writes.push(vec![]); - return Ok((0, Slice::into_inner(buf.slice_full()))); - } - let buf = buf.slice(0..nbytes); + ) -> std::io::Result<(usize, FullSlice)> { self.writes.push(Vec::from(&buf[..])); - Ok((buf.len(), Slice::into_inner(buf))) + Ok((buf.len(), buf)) } } @@ -263,7 +259,7 @@ mod tests { macro_rules! write { ($writer:ident, $data:literal) => {{ $writer - .write_buffered(::bytes::Bytes::from_static($data).slice_full(), &test_ctx()) + .write_buffered(::bytes::Bytes::from_static($data).slice_len(), &test_ctx()) .await?; }}; } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 4f26f2f6d1..8425528740 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -343,7 +343,33 @@ impl WalIngest { xlog_checkpoint.oldestActiveXid, self.checkpoint.oldestActiveXid ); - self.checkpoint.oldestActiveXid = xlog_checkpoint.oldestActiveXid; + + // A shutdown checkpoint has `oldestActiveXid == InvalidTransactionid`, + // because at shutdown, all in-progress transactions will implicitly + // end. Postgres startup code knows that, and allows hot standby to start + // immediately from a shutdown checkpoint. + // + // In Neon, Postgres hot standby startup always behaves as if starting from + // an online checkpoint. It needs a valid `oldestActiveXid` value, so + // instead of overwriting self.checkpoint.oldestActiveXid with + // InvalidTransactionid from the checkpoint WAL record, update it to a + // proper value, knowing that there are no in-progress transactions at this + // point, except for prepared transactions. + // + // See also the neon code changes in the InitWalRecovery() function. + if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID + && info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN + { + let mut oldest_active_xid = self.checkpoint.nextXid.value as u32; + for xid in modification.tline.list_twophase_files(lsn, ctx).await? 
{ + if (xid.wrapping_sub(oldest_active_xid) as i32) < 0 { + oldest_active_xid = xid; + } + } + self.checkpoint.oldestActiveXid = oldest_active_xid; + } else { + self.checkpoint.oldestActiveXid = xlog_checkpoint.oldestActiveXid; + } // Write a new checkpoint key-value pair on every checkpoint record, even // if nothing really changed. Not strictly required, but it seems nice to @@ -375,6 +401,7 @@ impl WalIngest { if info == pg_constants::XLOG_RUNNING_XACTS { let xlrec = crate::walrecord::XlRunningXacts::decode(&mut buf); self.checkpoint.oldestActiveXid = xlrec.oldest_running_xid; + self.checkpoint_modified = true; } } pg_constants::RM_REPLORIGIN_ID => { @@ -488,7 +515,7 @@ impl WalIngest { && (decoded.xl_info == pg_constants::XLOG_FPI || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) // compression of WAL is not yet supported: fall back to storing the original WAL record - && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, modification.tline.pg_version)? + && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, modification.tline.pg_version) // do not materialize null pages because them most likely be soon replaced with real data && blk.bimg_len != 0 { @@ -591,7 +618,7 @@ impl WalIngest { // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set 0 } else { - std::mem::size_of::() * xlrec.ntuples as usize + size_of::() * xlrec.ntuples as usize }; assert_eq!(offset_array_len, buf.remaining()); @@ -658,7 +685,7 @@ impl WalIngest { // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set 0 } else { - std::mem::size_of::() * xlrec.ntuples as usize + size_of::() * xlrec.ntuples as usize }; assert_eq!(offset_array_len, buf.remaining()); @@ -725,7 +752,7 @@ impl WalIngest { // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set 0 } else { - std::mem::size_of::() * xlrec.ntuples as usize + size_of::() * xlrec.ntuples as usize }; assert_eq!(offset_array_len, buf.remaining()); @@ -893,7 +920,7 @@ impl WalIngest { // the offsets array is omitted if 
XLOG_HEAP_INIT_PAGE is set 0 } else { - std::mem::size_of::() * xlrec.ntuples as usize + size_of::() * xlrec.ntuples as usize }; assert_eq!(offset_array_len, buf.remaining()); @@ -1277,13 +1304,10 @@ impl WalIngest { xlrec.pageno, xlrec.oldest_xid, xlrec.oldest_xid_db ); - // Here we treat oldestXid and oldestXidDB - // differently from postgres redo routines. - // In postgres checkpoint.oldestXid lags behind xlrec.oldest_xid - // until checkpoint happens and updates the value. - // Here we can use the most recent value. - // It's just an optimization, though and can be deleted. - // TODO Figure out if there will be any issues with replica. + // In Postgres, oldestXid and oldestXidDB are updated in memory when the CLOG is + // truncated, but a checkpoint record with the updated values isn't written until + // later. In Neon, a server can start at any LSN, not just on a checkpoint record, + // so we keep the oldestXid and oldestXidDB up-to-date. self.checkpoint.oldestXid = xlrec.oldest_xid; self.checkpoint.oldestXidDB = xlrec.oldest_xid_db; self.checkpoint_modified = true; @@ -1384,14 +1408,31 @@ impl WalIngest { // Note: The multixact members can wrap around, even within one WAL record. offset = offset.wrapping_add(n_this_page as u32); } - if xlrec.mid >= self.checkpoint.nextMulti { - self.checkpoint.nextMulti = xlrec.mid + 1; - self.checkpoint_modified = true; - } - if xlrec.moff + xlrec.nmembers > self.checkpoint.nextMultiOffset { - self.checkpoint.nextMultiOffset = xlrec.moff + xlrec.nmembers; + let next_offset = offset; + assert!(xlrec.moff.wrapping_add(xlrec.nmembers) == next_offset); + + // Update next-multi-xid and next-offset + // + // NB: In PostgreSQL, the next-multi-xid stored in the control file is allowed to + // go to 0, and it's fixed up by skipping to FirstMultiXactId in functions that + // read it, like GetNewMultiXactId(). This is different from how nextXid is + // incremented! 
nextXid skips over < FirstNormalTransactionId when the value + // is stored, so it's never 0 in a checkpoint. + // + // I don't know why it's done that way, it seems less error-prone to skip over 0 + // when the value is stored rather than when it's read. But let's do it the same + // way here. + let next_multi_xid = xlrec.mid.wrapping_add(1); + + if self + .checkpoint + .update_next_multixid(next_multi_xid, next_offset) + { self.checkpoint_modified = true; } + + // Also update the next-xid with the highest member. According to the comments in + // multixact_redo(), this shouldn't be necessary, but let's do the same here. let max_mbr_xid = xlrec.members.iter().fold(None, |acc, mbr| { if let Some(max_xid) = acc { if mbr.xid.wrapping_sub(max_xid) as i32 > 0 { @@ -1661,7 +1702,7 @@ async fn get_relsize( modification: &DatadirModification<'_>, rel: RelTag, ctx: &RequestContext, -) -> anyhow::Result { +) -> Result { let nblocks = if !modification .tline .get_rel_exists(rel, Version::Modified(modification), ctx) @@ -1713,7 +1754,7 @@ mod tests { #[tokio::test] async fn test_relsize() -> Result<()> { - let (tenant, ctx) = TenantHarness::create("test_relsize")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_relsize").await?.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) .await?; @@ -1934,7 +1975,10 @@ mod tests { // and then created it again within the same layer. #[tokio::test] async fn test_drop_extend() -> Result<()> { - let (tenant, ctx) = TenantHarness::create("test_drop_extend")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_drop_extend") + .await? + .load() + .await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) .await?; @@ -2005,7 +2049,10 @@ mod tests { // and then extended it again within the same layer. 
#[tokio::test] async fn test_truncate_extend() -> Result<()> { - let (tenant, ctx) = TenantHarness::create("test_truncate_extend")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_truncate_extend") + .await? + .load() + .await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) .await?; @@ -2147,7 +2194,7 @@ mod tests { /// split into multiple 1 GB segments in Postgres. #[tokio::test] async fn test_large_rel() -> Result<()> { - let (tenant, ctx) = TenantHarness::create("test_large_rel")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_large_rel").await?.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) .await?; @@ -2255,7 +2302,7 @@ mod tests { let startpoint = Lsn::from_hex("14AEC08").unwrap(); let _endpoint = Lsn::from_hex("1FFFF98").unwrap(); - let harness = TenantHarness::create("test_ingest_real_wal").unwrap(); + let harness = TenantHarness::create("test_ingest_real_wal").await.unwrap(); let (tenant, ctx) = harness.load().await; let remote_initdb_path = diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index 62a3a91b0b..edddcefbe1 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -1018,7 +1018,7 @@ pub fn decode_wal_record( ); let blk_img_is_compressed = - postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version)?; + postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version); if blk_img_is_compressed { debug!("compressed block image , pg_version = {}", pg_version); diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index d562540bde..82585f9ed8 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -40,6 +40,7 @@ use std::time::Duration; use std::time::Instant; use tracing::*; use utils::lsn::Lsn; +use utils::sync::gate::GateError; use utils::sync::heavier_once_cell; /// @@ -53,10 +54,18 @@ pub struct PostgresRedoManager { tenant_shard_id: 
TenantShardId, conf: &'static PageServerConf, last_redo_at: std::sync::Mutex>, - /// The current [`process::WalRedoProcess`] that is used by new redo requests. - /// We use [`heavier_once_cell`] for coalescing the spawning, but the redo - /// requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the + /// We use [`heavier_once_cell`] for + /// + /// 1. coalescing the lazy spawning of walredo processes ([`ProcessOnceCell::Spawned`]) + /// 2. prevent new processes from being spawned on [`Self::shutdown`] (=> [`ProcessOnceCell::ManagerShutDown`]). + /// + /// # Spawning + /// + /// Redo requests use the once cell to coalesce onto one call to [`process::WalRedoProcess::launch`]. + /// + /// Notably, requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the /// their process object; we use [`Arc::clone`] for that. + /// /// This is primarily because earlier implementations that didn't use [`heavier_once_cell`] /// had that behavior; it's probably unnecessary. /// The only merit of it is that if one walredo process encounters an error, @@ -65,7 +74,65 @@ pub struct PostgresRedoManager { /// still be using the old redo process. But, those other tasks will most likely /// encounter an error as well, and errors are an unexpected condition anyway. /// So, probably we could get rid of the `Arc` in the future. - redo_process: heavier_once_cell::OnceCell>, + /// + /// # Shutdown + /// + /// See [`Self::launched_processes`]. + redo_process: heavier_once_cell::OnceCell, + + /// Gate that is entered when launching a walredo process and held open + /// until the process has been `kill()`ed and `wait()`ed upon. + /// + /// Manager shutdown waits for this gate to close after setting the + /// [`ProcessOnceCell::ManagerShutDown`] state in [`Self::redo_process`]. + /// + /// This type of usage is a bit unusual because gates usually keep track of + /// concurrent operations, e.g., every [`Self::request_redo`] that is inflight. 
+ /// But we use it here to keep track of the _processes_ that we have launched, + /// which may outlive any individual redo request because + /// - we keep walredo process around until it's quiesced to amortize spawn cost and + /// - the Arc may be held by multiple concurrent redo requests, so, just because + /// you replace the [`Self::redo_process`] cell's content doesn't mean the + /// process gets killed immediately. + /// + /// We could simplify this by getting rid of the [`Arc`]. + /// See the comment on [`Self::redo_process`] for more details. + launched_processes: utils::sync::gate::Gate, +} + +/// See [`PostgresRedoManager::redo_process`]. +enum ProcessOnceCell { + Spawned(Arc), + ManagerShutDown, +} + +struct Process { + process: process::WalRedoProcess, + /// This field is last in this struct so the guard gets dropped _after_ [`Self::process`]. + /// (Reminder: dropping [`Self::process`] synchronously sends SIGKILL and then `wait()`s for it to exit). + _launched_processes_guard: utils::sync::gate::GateGuard, +} + +impl std::ops::Deref for Process { + type Target = process::WalRedoProcess; + + fn deref(&self) -> &Self::Target { + &self.process + } +} + +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error("cancelled")] + Cancelled, + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +macro_rules! bail { + ($($arg:tt)*) => { + return Err($crate::walredo::Error::Other(::anyhow::anyhow!($($arg)*))); + } } /// @@ -88,9 +155,9 @@ impl PostgresRedoManager { base_img: Option<(Lsn, Bytes)>, records: Vec<(Lsn, NeonWalRecord)>, pg_version: u32, - ) -> anyhow::Result { + ) -> Result { if records.is_empty() { - anyhow::bail!("invalid WAL redo request with no records"); + bail!("invalid WAL redo request with no records"); } let base_img_lsn = base_img.as_ref().map(|p| p.0).unwrap_or(Lsn::INVALID); @@ -148,10 +215,10 @@ impl PostgresRedoManager { chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?) 
}) }, - process: self - .redo_process - .get() - .map(|p| WalRedoManagerProcessStatus { pid: p.id() }), + process: self.redo_process.get().and_then(|p| match &*p { + ProcessOnceCell::Spawned(p) => Some(WalRedoManagerProcessStatus { pid: p.id() }), + ProcessOnceCell::ManagerShutDown => None, + }), } } } @@ -170,9 +237,52 @@ impl PostgresRedoManager { conf, last_redo_at: std::sync::Mutex::default(), redo_process: heavier_once_cell::OnceCell::default(), + launched_processes: utils::sync::gate::Gate::default(), } } + /// Shut down the WAL redo manager. + /// + /// Returns `true` if this call was the one that initiated shutdown. + /// `true` may be observed by no caller if the first caller stops polling. + /// + /// After this future completes + /// - no redo process is running + /// - no new redo process will be spawned + /// - redo requests that need walredo process will fail with [`Error::Cancelled`] + /// - [`apply_neon`]-only redo requests may still work, but this may change in the future + /// + /// # Cancel-Safety + /// + /// This method is cancellation-safe. + pub async fn shutdown(&self) -> bool { + // prevent new processes from being spawned + let maybe_permit = match self.redo_process.get_or_init_detached().await { + Ok(guard) => { + if matches!(&*guard, ProcessOnceCell::ManagerShutDown) { + None + } else { + let (proc, permit) = guard.take_and_deinit(); + drop(proc); // this just drops the Arc, its refcount may not be zero yet + Some(permit) + } + } + Err(permit) => Some(permit), + }; + let it_was_us = if let Some(permit) = maybe_permit { + self.redo_process + .set(ProcessOnceCell::ManagerShutDown, permit); + true + } else { + false + }; + // wait for ongoing requests to drain and the refcounts of all Arc that + // we ever launched to drop to zero, which when it happens synchronously kill()s & wait()s + // for the underlying process. 
+ self.launched_processes.close().await; + it_was_us + } + /// This type doesn't have its own background task to check for idleness: we /// rely on our owner calling this function periodically in its own housekeeping /// loops. @@ -203,38 +313,51 @@ impl PostgresRedoManager { records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, pg_version: u32, - ) -> anyhow::Result { + ) -> Result { *(self.last_redo_at.lock().unwrap()) = Some(Instant::now()); let (rel, blknum) = key.to_rel_block().context("invalid record")?; const MAX_RETRY_ATTEMPTS: u32 = 1; let mut n_attempts = 0u32; loop { - let proc: Arc = - match self.redo_process.get_or_init_detached().await { - Ok(guard) => Arc::clone(&guard), - Err(permit) => { - // don't hold poison_guard, the launch code can bail - let start = Instant::now(); - let proc = Arc::new( - process::WalRedoProcess::launch( - self.conf, - self.tenant_shard_id, - pg_version, - ) - .context("launch walredo process")?, - ); - let duration = start.elapsed(); - WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); - info!( - duration_ms = duration.as_millis(), - pid = proc.id(), - "launched walredo process" - ); - self.redo_process.set(Arc::clone(&proc), permit); - proc + let proc: Arc = match self.redo_process.get_or_init_detached().await { + Ok(guard) => match &*guard { + ProcessOnceCell::Spawned(proc) => Arc::clone(proc), + ProcessOnceCell::ManagerShutDown => { + return Err(Error::Cancelled); } - }; + }, + Err(permit) => { + let start = Instant::now(); + // acquire guard before spawning process, so that we don't spawn new processes + // if the gate is already closed. 
+ let _launched_processes_guard = match self.launched_processes.enter() { + Ok(guard) => guard, + Err(GateError::GateClosed) => unreachable!( + "shutdown sets the once cell to `ManagerShutDown` state before closing the gate" + ), + }; + let proc = Arc::new(Process { + process: process::WalRedoProcess::launch( + self.conf, + self.tenant_shard_id, + pg_version, + ) + .context("launch walredo process")?, + _launched_processes_guard, + }); + let duration = start.elapsed(); + WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); + info!( + duration_ms = duration.as_millis(), + pid = proc.id(), + "launched walredo process" + ); + self.redo_process + .set(ProcessOnceCell::Spawned(Arc::clone(&proc)), permit); + proc + } + }; let started_at = std::time::Instant::now(); @@ -299,12 +422,17 @@ impl PostgresRedoManager { match self.redo_process.get() { None => (), Some(guard) => { - if Arc::ptr_eq(&proc, &*guard) { - // We're the first to observe an error from `proc`, it's our job to take it out of rotation. - guard.take_and_deinit(); - } else { - // Another task already spawned another redo process (further up in this method) - // and put it into `redo_process`. Do nothing, our view of the world is behind. + match &*guard { + ProcessOnceCell::ManagerShutDown => {} + ProcessOnceCell::Spawned(guard_proc) => { + if Arc::ptr_eq(&proc, guard_proc) { + // We're the first to observe an error from `proc`, it's our job to take it out of rotation. + guard.take_and_deinit(); + } else { + // Another task already spawned another redo process (further up in this method) + // and put it into `redo_process`. Do nothing, our view of the world is behind. 
+ } + } } } } @@ -315,7 +443,7 @@ impl PostgresRedoManager { } n_attempts += 1; if n_attempts > MAX_RETRY_ATTEMPTS || result.is_ok() { - return result; + return result.map_err(Error::Other); } } } @@ -329,7 +457,7 @@ impl PostgresRedoManager { lsn: Lsn, base_img: Option, records: &[(Lsn, NeonWalRecord)], - ) -> anyhow::Result { + ) -> Result { let start_time = Instant::now(); let mut page = BytesMut::new(); @@ -338,7 +466,7 @@ impl PostgresRedoManager { page.extend_from_slice(&fpi[..]); } else { // All the current WAL record types that we can handle require a base image. - anyhow::bail!("invalid neon WAL redo request with no base image"); + bail!("invalid neon WAL redo request with no base image"); } // Apply all the WAL records in the batch diff --git a/pageserver/test_data/indices/mixed_workload/README.md b/pageserver/test_data/indices/mixed_workload/README.md new file mode 100644 index 0000000000..724274fcd9 --- /dev/null +++ b/pageserver/test_data/indices/mixed_workload/README.md @@ -0,0 +1,7 @@ + +# This was captured from one shard of a large tenant in staging. + +# It has a mixture of deltas and image layers, >1000 layers in total. + +# This is suitable for general smoke tests that want an index which is not +# trivially small, but doesn't contain weird/pathological cases. 
diff --git a/pageserver/test_data/indices/mixed_workload/index_part.json b/pageserver/test_data/indices/mixed_workload/index_part.json new file mode 100644 index 0000000000..cb4bfc4726 --- /dev/null +++ b/pageserver/test_data/indices/mixed_workload/index_part.json @@ -0,0 +1 @@ +{"version":7,"layer_metadata":{"000000067F00004005000060F300069883DB-000000067F00004005000060F300069D13FA__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300039A4000-000000067F00004005000060F300039C0000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300039FC000-000000067F00004005000060F30003A0F066__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB43000082C0F1-000000067F000040050081DB43000086E169__000000A583FBFB91-000000A9EB8C4489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000478000-000000067F00004005000060F3000047C000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000012C000-000000067F00004005000060F300001F0000__0000018624969468":{"file_size":134422528,"generation":7,"shard":"0008"},"000000067F00004005000060F700019E8000-000000067F00004005000060F700019EC000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300018E0FE6-000000067F00004005000060F3000193A10B__00000075CC373F31-00000079F2A2F311":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016E85370000004000-030000000000000000000000000000000002__0000018613F0A050":{"file_size":14172160,"generation":3,"shard":"0008"},"000000067F00004005000060F300034847BD-000000067F00004005000060F300034BD86C__000000EBC9213D59-000000EFA7EAA9E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000C80000-000000067F000040050081DB430000C84000__000000BDAFECFC00":{"file_size":134422528,"generation":2,"shard":"0
008"},"000000067F00004005000060F100000CCBA0-000000067F00004005000060F20100000000__0000000D80565628":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000CA4000-000000067F00004005016EA00C0000CE0000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060FB00013BC000-000000067F00004005000060FB0001400000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001240000-000000067F00004005016EA00C0001244000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30004EC52E9-000000067F00004005000060F30004F1638A__000001440D3D0C69-0000014784964B91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000E10000-000000067F000040050081DB430000E14000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000007F0F-000000067F0000400500EB4A480000037E20__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30004FE8000-000000067F00004005000060F3000502905D__0000014784964B91-0000014B000D1821":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB43000072C000-000000067F000040050081DB430000768000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005E3B48F-000000067F00004005000060F30005EF454F__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500E3A2A100000B7E04-030000000000000000000000000000000002__000000E7C2F1B249-000000EBC9213D59":{"file_size":30146560,"generation":2,"shard":"0008"},"000000067F0000400501025D90000009029B-000000067F0000400501025D950100000000__0000011B688FEDC8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A10000-000000067F000040050081DB430000A14000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":
"0008"},"000000067F00004005000060F30002F5105E-000000067F00004005000060F30002F9A0EB__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000187FE22-000000067F000040050081D80C0100000000__00000075E5D2A930":{"file_size":59138048,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001E8000-000000067F000040050081DB4300001EC000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000184C000-000000067F00004005000060FB000187FE22__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005A16504-000000067F00004005000060F30005A57691__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100005C0000-000000067F00004005000060F100005C821A__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-000000067F00004005000060F00300000000__000001BCB572A4E0":{"file_size":2310144,"generation":17,"shard":"0008"},"000000067F00004005000060F30002214000-000000067F00004005000060F30002264247__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500E3A2A10000110000-000000067F0000400500E3A2A10000114000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006864000-000000067F00004005000060F30006868000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000D0000-000000067F0000400500DBCED500000D4000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000274C000-000000067F00004005000060F30002790000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00009274AB-030000000000000000000000000000000002__000001935283F9B9-00000196C9018F59":{"file_size":60104704,"generation":11,"shard":"0008"},"0
00000067F0000400500C782E4000023D359-000000067F0000400500C782E400002A5E4B__000000D31E48D7C9-000000D74E29AAD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001780DB7-000000067F00004005000060F700017E1391__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F000040050081DB4300004E4000-000000067F000040050081DB4300004F8000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00018C0000-000000067F00004005016EA00C00018C4000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F300056DC000-000000067F00004005000060F300056E0000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001F14230-000000067F000040050081D80C0100000000__0000018613F0A050":{"file_size":59138048,"generation":3,"shard":"0008"},"000000067F00004005010F9F120000004000-030000000000000000000000000000000002__0000012E77D3BF00":{"file_size":105775104,"generation":2,"shard":"0008"},"000000067F00004005000060F30002D80000-000000067F00004005000060F30002D84000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000122BBF-000000067F00004005000060F7000013B18E__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002B10000-000000067F00004005000060F30002B88FF2__000000C462B3C2A9-000000C824C09619":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006320C60-000000067F00004005000060F30006349DA2__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000079E393-000000067F00004005016EA00C00009BF728__00000196C9018F59-0000019A2EAFE7A9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500F67839000005C000-000000067F0000400500F67839000006AEF4__0000010D77B487A0":{"file_size":134422528,"gener
ation":2,"shard":"0008"},"000000067F00004005016EA00C0001D7F71A-030000000000000000000000000000000002__000001BA93C39481-000001BCB572A4E1":{"file_size":50880512,"generation":17,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB572C481-000001BCB572C5D9":{"file_size":24576,"generation":20,"shard":"0008"},"000000067F00004005000060F70001570000-000000067F00004005000060F70001574000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000042C000-000000067F00004005000060F30000478000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB572C5D9-000001BCB572DFF9":{"file_size":24576,"generation":22,"shard":"0008"},"000000067F00004005000060FB00015FCD31-030000000000000000000000000000000002__000000698F2C3A38":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30005C841ED-000000067F00004005000060F30005C95225__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30001B4A119-000000067F00004005000060F30100000000__0000008196C976A1-0000008625CF2891":{"file_size":200990720,"generation":2,"shard":"0008"},"000000067F00004005000060F300019790A2-000000067F00004005000060F300019C2056__00000079F2A2F311-0000007E3A9BFD29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001838000-000000067F00004005000060FB000183C000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001C00FE1-000000067F00004005000060F30001C0A0A3__0000008625CF2891-00000089F4693119":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300056E0000-000000067F00004005000060F300056E4000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000BBD532-000000067F00004005000060F80100000000__000000AFD23C2
7B9-000000B2B5C4E8F9":{"file_size":96477184,"generation":2,"shard":"0008"},"000000067F00004005000060F30000F9B026-000000067F00004005000060F30100000000__00000047E31D98D1-0000004C49155071":{"file_size":173834240,"generation":2,"shard":"0008"},"000000067F000040050081DB430000500000-000000067F000040050081DB430000504000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004971675-000000067F00004005000060F300049B26A8__000001398B56A519-0000013C9C0E3339":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003102107-000000067F00004005000060F300031130BC__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300048A4000-000000067F00004005000060F30004900000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00004B8000-000000067F00004005016EA00C00004BC000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001A71688-000000067F00004005000060FB0001A8A1CD__0000007E3A9BFD29-0000008196C976A1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E60000-000000067F00004005000060F30000E64000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300023B0FF7-000000067F00004005000060F300024020ED__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00003F8000-000000067F00004005016EA00C00003FC000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30004B2B250-000000067F00004005000060F30004B5431C__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000050000-000000067F00004005000060F700000885C5__000000044854EBD1-00000008B6B51879":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB0
00097168A-030000000000000000000000000000000002__00000028C365FBE1-0000002D2A8E0B81":{"file_size":120299520,"generation":2,"shard":"0008"},"000000067F00004005000060F3000625C000-000000067F00004005000060F30006270000__0000017171761D90":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001BA8000-000000067F00004005000060FB0001BC0B44__0000008625CF2891-00000089F4693119":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003344134-000000067F00004005000060F3000336D193__000000E4C63CFA21-000000E7C2F1B249":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006B10FFF-000000067F00004005000060F30006B22072__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006E34000-000000067F00004005000060F30006E70000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000008238C-000000067F00004005000060F60100000000__00000139CF156B58":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000A30000-000000067F00004005000060F70100000000__0000009DF02C1241-000000A173C00489":{"file_size":269688832,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001CE16ED-000000067F000040050081D80C0100000000__0000008DDCD70B68":{"file_size":59138048,"generation":2,"shard":"0008"},"000000067F000040050081DB4300011B0000-000000067F000040050081DB4300011B4000__000000DBD29DC248":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C000010C0D1-000000067F0000400500F3A25C000011E137__000001048B25A8E9-0000010779A7F551":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000004000-000000067F00004005000060F70000029ED0__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F00004005000060F60000058F73-000000067F00004005000060F60100000000__000000E4D847F4E0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F
00004005016EA00C0001C3F636-000000067F00004005016EA00C0001CC74D7__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500EB4A480000101089-000000067F0000400500EB4A48000012798C__000000F6661C9241-000000F901689359":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300007A8000-000000067F000040050081DB4300007AC000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F1000010043F-000000067F00004005000060F20100000000__0000000D55A212C9-000000114A805939":{"file_size":182878208,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001EAC000-000000067F00004005000060FB0001F14230__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000616F6B2-000000067F00004005000060F300061B8705__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005C9E3C4-000000067F00004005000060F30005CCF3C5__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001AA0000-000000067F00004005000060F70001AB05CB__0000015304A396B9-0000015670D6AFD9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F3000073C000-000000067F00004005000060F30000775A02__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300003AE21D-000000067F000040050081DB43000045029C__0000008DBE2855F9-000000923719A971":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001B04000-000000067F00004005000060F70001B18000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E74000-000000067F00004005000060F30000E78000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000182C000-000000067F00004005000060F700018871D6__000001444EB7FC10":{"file_size":13442252
8,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000DE8B45-000000067F00004005000060FB0000DF968A__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E78000-000000067F00004005000060F30000E7C000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000140C000-030000000000000000000000000000000002__000000603CA8F2F0":{"file_size":89522176,"generation":2,"shard":"0008"},"000000067F00004005000060FB00011CA1CD-000000067F00004005000060FB00011F2D11__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000144FB4E-000000067F00004005016EA00C00014B79E7__000001A931C135B1-000001AC25760149":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F700015A195C-000000067F00004005000060F80100000000__0000012E77D3BF00":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000FC0000-000000067F00004005000060F70000FC4000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000012798C-000000067F0000400500EB4A48000013F89B__000000F6661C9241-000000F901689359":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001CE4000-000000067F00004005016EA00C0001D18000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F30005FC519A-000000067F00004005000060F30005FE621A__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000370000-000000067F00004005016EA00C0000374000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0001760000-000000067F00004005016EA00C0001764000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F100003A0000-000000067F00004005000060F100003B8214__0000003D03FCCDB9-000000417D21AC
F9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300006B0000-000000067F00004005000060F300006B4000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00004E1FF6-030000000000000000000000000000000002__000000174479FC18":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F3000502905D-000000067F00004005000060F300050321C0__0000014784964B91-0000014B000D1821":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001AB05CB-000000067F00004005000060F70001AB8B97__0000015304A396B9-0000015670D6AFD9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000151F7C5-000000067F00004005016EA00C000158F667__000001AC25760149-000001AFC313C819":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F70000B9C000-000000067F00004005000060F80100000000__000000AFE87558B0":{"file_size":83533824,"generation":2,"shard":"0008"},"000000067F00004005000060F7000141882A-000000067F00004005000060F80100000000__00000122E1129DA0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000018F5CD-000000067F0000400500EB4A48000019F4DD__000000F6661C9241-000000F901689359":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000196C000-000000067F00004005000060F70001990000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300029C623C-000000067F00004005000060F30100000000__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":81313792,"generation":2,"shard":"0008"},"000000067F00004005000060F300027C0000-000000067F00004005000060F300027C4000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FB3D300000001487-000000067F0000400500FB3D300100000000__0000010FB1BE19B9-00000113456156F1":{"file_size":24428544,"generation":2,"shard":"0008"},"000000067F00004005000060F300056D8000-000000067F00004005000060F300056DC00
0__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700003C0000-000000067F00004005000060F700003C4000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000664E3CA-000000067F00004005000060F30100000000__000001715E483C79-000001751A7D7589":{"file_size":288645120,"generation":2,"shard":"0008"},"000000067F000040050100D04D000004B5AD-000000067F000040050100D04D00000634BB__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500DBCED5000002C000-000000067F0000400500DBCED50000078000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000C20000-000000067F00004005016EA00C0000C24000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70001B30000-000000067F00004005000060F70001B34000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700009C035C-000000067F00004005000060F80100000000__0000009A1ABDE921-0000009DF02C1241":{"file_size":264159232,"generation":2,"shard":"0008"},"000000067F00004005000060F30003B33945-000000067F00004005000060F30100000000__0000010FB1BE19B9-00000113456156F1":{"file_size":155344896,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000079FCFA-000000067F00004005016EA00C00007C7B9C__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500EB4A480000218000-000000067F0000400500EB4A48000021C000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005D1D0DC-000000067F00004005000060F30005D76250__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000149B774-000000067F00004005000060FB00014A42B8__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30
003D0B155-000000067F00004005000060F30003D14206__00000117EDA82C11-0000011B632CC319":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300020FC052-000000067F00004005000060F300021050B0__0000009DF02C1241-000000A173C00489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002268000-000000067F00004005000060F300022B9050__000000A583FBFB91-000000A9EB8C4489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300004FC000-000000067F000040050081DB430000500000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300060A93B5-000000067F00004005000060F300060C2210__0000016834A3FC91-0000016B49A934C1":{"file_size":263479296,"generation":2,"shard":"0008"},"000000067F00004005000060F3000674C000-000000067F00004005000060F30006798000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300007F913A-030000000000000000000000000000000002__000000A5A3F27398":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000F4000-030000000000000000000000000000000002__000000E4D847F4E0":{"file_size":103907328,"generation":2,"shard":"0008"},"000000067F00004005000060F70001348000-000000067F00004005000060F70100000000__0000011B632CC319-0000011F1A40FA69":{"file_size":270753792,"generation":2,"shard":"0008"},"000000067F00004005000060F10000030000-000000067F00004005000060F20100000000__000000021DC73119-000000044854EBD1":{"file_size":267771904,"generation":2,"shard":"0008"},"000000067F000040050107B54701FFFFFFFF-000000067F000040050107B5470300000000__0000011F1A40FA69-00000122A7BB7B29":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30006674000-000000067F00004005000060F30006690000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050107B54701FFFFFFFF-000000067F000040050107B5470300000000__0000011B632CC319-0000011F1A40FA69":{"file_size":24576,"generatio
n":2,"shard":"0008"},"000000067F00004005000060F30000298000-000000067F00004005000060F3000029C000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000F185D4-000000067F00004005000060F80100000000__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":249135104,"generation":2,"shard":"0008"},"000000067F00004005000060F300049CB712-000000067F00004005000060F30004A048A8__000001398B56A519-0000013C9C0E3339":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700004B1E77-000000067F00004005000060F80100000000__00000047F1F2B800":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30004B00000-000000067F00004005000060F30004B1111A__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006D14000-000000067F00004005000060F30006D30000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00002D77AE-030000000000000000000000000000000002__000001880F984A29-0000018C496B6DB1":{"file_size":81018880,"generation":11,"shard":"0008"},"000000067F00004005000060F300002D0000-000000067F00004005000060F30000370FD1__0000000D55A212C9-000000114A805939":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D69D790000028000-000000067F0000400500D69D79000002C000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002170000-000000067F00004005000060F30002174000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000F59017-000000067F00004005000060F30000F91FFF__00000047E31D98D1-0000004C49155071":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F6000006A37A-000000067F00004005000060F60100000000__000001180B3FF408":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F6000002F012-000000067F00004005000060F60100000000__00000081AA3C40F0":{"file_size":24
576,"generation":2,"shard":"0008"},"000000067F00004005000060F30005614000-000000067F00004005000060F30005688000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300036C8000-000000067F00004005000060F300036F91FE__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001ADF63C-030000000000000000000000000000000002__000001B3E1B95181-000001B6FFE46BC9":{"file_size":64421888,"generation":11,"shard":"0008"},"000000067F0000400500EB4A480000057D31-000000067F0000400500EB4A48000008FC41__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000F58000-000000067F00004005016EA00C0000F5C000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB430000908000-000000067F000040050081DB43000094A076__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000471200E-000000067F00004005000060F3000474302B__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300000403DA-030000000000000000000000000000000002__00000075E5D2A930":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F60000079C4E-000000067F00004005000060F60100000000__0000012E77D3BF00":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500F67839000003C000-000000067F0000400500F678390000058000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001C80000-000000067F00004005000060FB0001C84000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300047F5138-000000067F00004005000060F3000480620C__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006B5C09E-000000067F00004005000060F30006BAD108__0000017C9F5597E1-
0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001410F57-000000067F00004005000060F70001429534__00000122A7BB7B29-0000012694E36301":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00006B4000-000000067F00004005016EA00C00006E0000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F700009605D8-000000067F00004005000060F80100000000__000000923719A971-00000096262826C9":{"file_size":251338752,"generation":2,"shard":"0008"},"000000067F00004005000060F70000C8CD0C-000000067F00004005000060F80100000000__000000BAC0041E18":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F700012B8000-000000067F00004005000060F80100000000__00000113456156F1-00000117EDA82C11":{"file_size":265781248,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000049C000-000000067F00004005016EA00C00004A8000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70000C78000-000000067F00004005000060F70000C7C000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006B4B0BB-000000067F00004005000060F30006B5C09E__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001844000-000000067F00004005000060FB0001848000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300067F0000-000000067F00004005000060F300067F4000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004C80000-000000067F00004005000060F30004C84000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A4C000-000000067F00004005000060F30002A98000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002480000-000000067F00004005000060F30002484000__000000AFE8
7558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000306A02D-000000067F00004005000060F30100000000__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":191299584,"generation":2,"shard":"0008"},"000000067F00004005000060F70001510000-000000067F00004005000060F70001514000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005BDB15B-000000067F00004005000060F30005C841ED__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001E98000-000000067F00004005000060FB0001E9C000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300057942F4-000000067F00004005000060F300057DD292__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005698000-000000067F00004005000060F3000569C000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002983166-000000067F00004005000060F3000299C28F__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000C24000-000000067F00004005016EA00C0000CA0000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F300033D7D7C-000000067F00004005000060F30003458D42__000000E7C2F1B249-000000EBC9213D59":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A1C000-000000067F000040050081DB430000A30379__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002D93639-000000067F00004005000060F50100000000__000000D037B2DBD0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000029C195-000000067F00004005016EA00C000029C196__000001BA93C39481-000001BCB572A4E1":{"file_size":32768,"generation":17,"shard":"0008"},"000000067F00004005000060F30000A5F9BB-000000067F00004005000060F601
00000000__000000321AA80270":{"file_size":81657856,"generation":2,"shard":"0008"},"000000067F00004005000060F30002D84000-000000067F00004005000060F30002D93639__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005D1C000-000000067F00004005000060F30005D70000__000001684518AF20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010C8000-000000067F000040050081DB4300010E2072__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB43000058AF5E-000000067F000040050081DB4300005BCFD7__0000009A1ABDE921-0000009DF02C1241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000034611E-000000067F00004005000060F80100000000__000000321AA80270":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300000C1095-000000067F00004005000060F60100000000__000000021DC73119-000000044854EBD1":{"file_size":220635136,"generation":2,"shard":"0008"},"000000067F00004005000060FB000183C000-000000067F00004005000060FB0001840000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006C8729E-000000067F00004005000060F30006C98340__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005138000-000000067F00004005000060F3000513C000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300053E30C3-000000067F00004005000060F300053F40CC__0000014EC58A4A79-0000015304A396B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB43000002C000-000000067F000040050081DB4300000403DA__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004970000-000000067F00004005000060F30004974000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003C08000-000000067F00004005000060
F30003C0C000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000103AD12-000000067F00004005000060FB000104B856__0000004C49155071-0000004F31878919":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00004AC000-000000067F00004005016EA00C00004B8000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000DB7D33-000000067F00004005016EA00C0000E47BD2__0000019E2C5DCEE1-000001A1DD8B4481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30001F30000-000000067F00004005000060F30001F34000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050109FFA2000000C000-030000000000000000000000000000000002__000001180B3FF408":{"file_size":70516736,"generation":2,"shard":"0008"},"000000067F00004005000060F700017405D4-000000067F00004005000060F70001758B92__000001398B56A519-0000013C9C0E3339":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300030B0000-000000067F00004005000060F300030C0FE5__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005010660F501FFFFFFFF-000000067F00004005010660F50300000000__00000122A7BB7B29-0000012694E36301":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30002168000-000000067F00004005000060F3000216C000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F60000046A83-000000067F00004005000060F60100000000__000000BAC0041E18":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001368000-000000067F00004005000060FB000136C000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000184000-000000067F00004005000060F80100000000__000000174479FC18":{"file_size":93143040,"generation":2,"shard":"0008"},"000000067F00004005000060FB00012A8000-000000067F0000400500006
0FB0100000000__00000057593D8169-0000005C01565329":{"file_size":273711104,"generation":2,"shard":"0008"},"000000067F00004005000060F700007B0000-000000067F00004005000060F700007D05C8__00000075CC373F31-00000079F2A2F311":{"file_size":268468224,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001680B45-000000067F00004005000060FB000169968A__000000698AF6E809-0000006DDB29D589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300050CC000-000000067F00004005000060F300050E8000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-000000067F00004005000060F00300000000__0000018613F0A050":{"file_size":2310144,"generation":3,"shard":"0008"},"000000067F00004005000060F70001B1C000-000000067F00004005000060F70001B30000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000F50000-000000067F00004005000060F70000F705D6__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F000040050109CD330100000000-000000067F000040050109FFA2000000C000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800001FC000-000000067F0000400500EB4A480000200000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000240B12A-000000067F00004005000060F300024440AE__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000008228D-000000067F00004005000060F60100000000__000000027AF9D7D0":{"file_size":24576,"generation":1,"shard":"0008"},"000000067F00004005016EA00C000042C000-000000067F00004005016EA00C0000478000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060FB0000FF8000-000000067F00004005000060FB0001000B44__0000004C49155071-0000004F31878919":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000169968A-00
0000067F00004005000060FB00016D21CF__000000698AF6E809-0000006DDB29D589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100005F821C-000000067F00004005000060F20100000000__000000636DE92159-000000663565F8C9":{"file_size":149954560,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001D7C000-000000067F00004005016EA00C0001E03DD8__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F0000400500F678390000058000-000000067F0000400500F67839000005C000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800003A7E20-000000067F0000400500EB4A4800003BFD31__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001228000-000000067F00004005016EA00C000122C000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB430000F0C0E9-000000067F000040050081DB430000F4E15B__000000C462B3C2A9-000000C824C09619":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000758000-000000067F00004005000060F80100000000__0000006DDB29D589-000000722F474369":{"file_size":264781824,"generation":2,"shard":"0008"},"000000067F00004005000060F300068640AF-000000067F00004005000060F3000686D0DE__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000047C000-000000067F00004005016EA00C0000498000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30006166575-000000067F00004005000060F3000616F6B2__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001B18000-000000067F00004005000060F70001B1C000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700016EC000-000000067F00004005000060F70001708000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0
008"},"000000067F00004005000060F30005CCF3C5-000000067F00004005000060F30005D184F6__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002848000-000000067F00004005000060F3000285901B__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300039C0000-000000067F00004005000060F300039C4000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002464000-000000067F00004005000060F30002480000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00011D0000-000000067F00004005016EA00C00011D4000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30003D44283-000000067F00004005000060F30003D952B0__0000011B632CC319-0000011F1A40FA69":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480100000000-000000067F0000400500EE16BC0000044000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000533205E-000000067F00004005000060F300053E30C3__0000014EC58A4A79-0000015304A396B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F6000009A255-000000067F00004005000060F60300000000__0000017CC2FD7288":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70001B00000-000000067F00004005000060F70001B04000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004958000-000000067F00004005000060F3000495C000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000518000-000000067F00004005000060F80100000000__0000004C49155071-0000004F31878919":{"file_size":262373376,"generation":2,"shard":"0008"},"000000067F00004005000060F300064D8000-000000067F00004005000060F3000658113F__000001715E483C79-000001751A7D7589":{"file_size":268451840,"g
eneration":2,"shard":"0008"},"000000067F0000400500FDA1F80000014000-000000067F0000400500FDA1F80000020D42__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000284000-000000067F00004005000060FB00002D4B6A__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000CDBB9C-000000067F00004005000060F80100000000__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":148865024,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001298000-000000067F00004005016EA00C000129C000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001DD8000-000000067F00004005000060FB0001DF0B43__000000923719A971-00000096262826C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001220000-000000067F00004005000060F70001224000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002908000-000000067F00004005000060F30002920FA0__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000F5C000-000000067F00004005016EA00C0000F90000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0001E03DD8-030000000000000000000000000000000002__000001BCB572A4E0":{"file_size":139264,"generation":17,"shard":"0008"},"000000067F00004005000060F30003998000-000000067F00004005000060F3000399C000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00014E75C6-030000000000000000000000000000000002__000001A931C135B1-000001AC25760149":{"file_size":51486720,"generation":11,"shard":"0008"},"000000067F00004005010660F500000F44CB-000000067F00004005010660F70100000000__000001180B3FF408":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00003FC000-000000067F00004005016EA00C0000400000__000001936E73D028":{"file_size":134422528,"generation":1
1,"shard":"0008"},"000000067F00004005000060F30003810000-000000067F00004005000060F30003849093__000001048B25A8E9-0000010779A7F551":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006B00000-000000067F00004005000060F30006B10FFF__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001541688-000000067F00004005000060FB000154A1CD__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001098000-000000067F00004005000060FB000109C000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700011912D4-000000067F00004005000060F80100000000__00000104BD37F348":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A40000-000000067F00004005000060F30002A44000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001448000-000000067F00004005000060F300014B0F7B__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001009688-000000067F00004005000060FB000102A1CE__0000004C49155071-0000004F31878919":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC00001A4000-000000067F0000400500EE16BC00001E0000__00000104BD37F348":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000B58B45-000000067F00004005000060FB0000B6168A__0000003579F03331-0000003959DA2DE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D69D7900000AC000-000000067F0000400500D69D7900000BDAF5__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000193A10B-000000067F00004005000060F30100000000__00000075CC373F31-00000079F2A2F311":{"file_size":198148096,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00005A0000-000000067F00004005016EA00C00005A4000__000001936E73D02
8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F700000E0000-000000067F00004005000060F80100000000__0000000D80565628":{"file_size":112009216,"generation":2,"shard":"0008"},"000000067F00004005000060F3000690F2FD-000000067F00004005000060F300069883DB__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300004C6B83-000000067F00004005000060F60100000000__000000174479FC18":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30001E18000-000000067F00004005000060F30001E50FF3__000000923719A971-00000096262826C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300043B4000-000000067F00004005000060F300043B8000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100006C0000-000000067F00004005000060F20100000000__000000722F474369-00000075CC373F31":{"file_size":267665408,"generation":2,"shard":"0008"},"000000067F00004005000060F70000A78000-000000067F00004005000060F70000A7C000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00011C1688-000000067F00004005000060FB00011CA1CD__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00004E8000-000000067F00004005016EA00C00004EC000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000257A6F-000000067F00004005016EA00C000029F90B__000001880F984A29-0000018C496B6DB1":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001590000-000000067F00004005000060FB0001594000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000193189A-030000000000000000000000000000000002__000001B3F17FE4E0":{"file_size":139264,"generation":11,"shard":"0008"},"000000067F00004005000060F300027C4000-000000067F00004005000060F30002828000__000000BAC
0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000B40000-000000067F00004005016EA00C0000B44000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30006694000-000000067F00004005000060F300066F0000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00015C8000-000000067F00004005000060FB00015CC000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003B84000-000000067F00004005000060F30003B90000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006704000-000000067F00004005000060F30006748000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000793506-030000000000000000000000000000000002__0000002427BD8BD0":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30004F1638A-000000067F00004005000060F30100000000__000001440D3D0C69-0000014784964B91":{"file_size":93708288,"generation":2,"shard":"0008"},"000000067F00004005000060F80100000000-000000067F00004005000060FB0000014000__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F00004005000060F70000180000-000000067F00004005000060F70000184000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004A2693B-000000067F00004005000060F30004A7F98F__000001398B56A519-0000013C9C0E3339":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002C71F27-000000067F00004005000060F30002C9AFB8__000000C824C09619-000000CC13D2E549":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300038075AF-000000067F00004005000060F30100000000__000000FF8B261599-000001048B25A8E9":{"file_size":49823744,"generation":2,"shard":"0008"},"000000067F0000400500DBCED50000028000-000000067F0000400500DBCED5000002C000__000000E4D847F4E0":{"f
ile_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004188000-000000067F00004005000060F300041D9101__0000012694E36301-0000012A3F140591":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30006868000-000000067F00004005000060F50100000000__00000178C5D5D3A8":{"file_size":116645888,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A789A0-000000067F00004005000060F30003AB9907__0000010FB1BE19B9-00000113456156F1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000368000-000000067F0000400500EB4A48000036FF11__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300047EC0CA-000000067F00004005000060F300047F5138__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001AB8B97-000000067F00004005000060F70001AC115C__0000015304A396B9-0000015670D6AFD9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D61283-000000067F00004005000060F70000D8985C__000000C462B3C2A9-000000C824C09619":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300011D1111-000000067F00004005000060F3000122A1D5__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001967D34-000000067F00004005016EA00C000197FBD0__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500FA2AD3000004D85C-000000067F0000400500FB3D300100000000__0000010D77B487A0":{"file_size":31309824,"generation":2,"shard":"0008"},"000000067F000040050081DB4300005BCFD7-000000067F000040050081DB4300005D704F__0000009A1ABDE921-0000009DF02C1241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F10000004000-000000067F00004005000060F100000260F2__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F000040050
0EE16BC00000F8000-000000067F0000400500EE16BC000014158C__000000F901689359-000000FCCD5238B1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000921E8A-000000067F00004005000060F60100000000__00000028C365FBE1-0000002D2A8E0B81":{"file_size":228564992,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001190000-000000067F00004005000060FB0001198B44__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300067A0000-000000067F00004005000060F300067A4000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10000200000-000000067F00004005000060F10000204000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FF0FBB-000000067F00004005000060F3000407201D__00000122A7BB7B29-0000012694E36301":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000001C000-000000067F00004005000060F3000008228D__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F00004005016EA00C0001CD7376-030000000000000000000000000000000002__000001B6FFE46BC9-000001BA93C39481":{"file_size":70238208,"generation":11,"shard":"0008"},"000000067F00004005000060FB0000EBC000-000000067F00004005000060FB0000EC8000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000293210E-000000067F00004005000060F30002983166__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000151F271-000000067F00004005000060F30100000000__000000636DE92159-000000663565F8C9":{"file_size":41271296,"generation":2,"shard":"0008"},"000000067F00004005000060F30004880000-000000067F00004005000060F30004884000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10000518222-000000067F00004005000060F20100000000__0000005413AB3641-00000057593D8169":{"file_size":16
9492480,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00003E0000-000000067F00004005016EA00C00003E4000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30000775A02-000000067F00004005000060F60100000000__0000002427BD8BD0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000197FBD0-000000067F00004005016EA00C00019C7A6A__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F3000067114B-000000067F00004005000060F60100000000__0000001B59EEB909-0000001FFBC01501":{"file_size":232669184,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001408000-000000067F00004005000060FB000140C000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800001F8000-000000067F0000400500EB4A4800001FC000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000290000-000000067F0000400500EB4A480000294000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003061089-000000067F00004005000060F3000306A02D__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30001CE4000-000000067F00004005000060F30001CF0197__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000E20000-000000067F00004005000060F70000E24000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001D0000-000000067F000040050081DB4300001D4000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005D184F6-000000067F00004005000060F30100000000__0000016143292911-00000164DEE06671":{"file_size":200163328,"generation":2,"shard":"0008"},"000000067F00004005000060F300066F4000-000000067F00004005000060F30006700000__00000178C5D5D3A8":{"file_size":134422528,"g
eneration":2,"shard":"0008"},"000000067F000040050081DB430000A38000-000000067F000040050081DB430000A4A074__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000F38000-000000067F00004005000060F30000F59017__00000047E31D98D1-0000004C49155071":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C0C000-000000067F00004005000060FB0000C18000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006D34000-000000067F00004005000060F30006D60000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005010660F501FFFFFFFF-000000067F00004005010660F50300000000__0000011F1A40FA69-00000122A7BB7B29":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F700013E85D1-000000067F00004005000060F70001410BBC__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000538B44-000000067F00004005000060FB0000551689__0000001737D88379-0000001B59EEB909":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001410000-000000067F00004005000060F70001414000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300032F1113-000000067F00004005000060F3000330A1C8__000000E4C63CFA21-000000E7C2F1B249":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30004974000-000000067F00004005000060F3000498DC49__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000625EB45-000000067F00004005000060F30006277C61__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700019E8E81-000000067F00004005000060F80100000000__0000014EC58A4A79-0000015304A396B9":{"file_size":246792192,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFF
FFFFFFFFFFF__000001BCB5730259-000001BCB5732691":{"file_size":24576,"generation":187,"shard":"0008"},"000000067F000040050081DB4300001CC000-000000067F000040050081DB4300001D0000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002C00000-000000067F00004005000060F30002C18FAE__000000C824C09619-000000CC13D2E549":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000FC4000-000000067F00004005000060F70000FCD85E__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000107C39B-030000000000000000000000000000000002__0000004C49155071-0000004F31878919":{"file_size":133349376,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000F90000-000000067F00004005016EA00C0000F94000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000F98000-000000067F00004005016EA00C0000F9C000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F700019EC000-000000067F00004005000060F80100000000__0000014EDD256548":{"file_size":7421952,"generation":2,"shard":"0008"},"000000067F00004005000060F300069FA3F6-000000067F00004005000060F30006A0B44C__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300003AC000-000000067F000040050081DB4300003B27DA__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005A57691-000000067F00004005000060F30005B00697__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300060CB2C8-000000067F00004005000060F300060D4415__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000495C000-000000067F00004005000060F30004970000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500D69D7900000D1C5F-0
00000067F0000400500D69D7900000F1B5B__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001358000-030000000000000000000000000000000002__000001A95031E5B8":{"file_size":21110784,"generation":11,"shard":"0008"},"000000067F00004005000060F3000430C000-000000067F00004005000060F30004370000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004904000-000000067F00004005000060F30004958000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000008000-000000067F00004005000060F30000378000__00000186146441F1-0000018624969469":{"file_size":33357824,"generation":6,"shard":"0008"},"000000067F00004005000060F700005C0000-000000067F00004005000060F700005C85CE__00000057593D8169-0000005C01565329":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000B04000-000000067F00004005016EA00C0000B40000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30002920FA0-000000067F00004005000060F3000293210E__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002058000-000000067F00004005000060F30002070F71__0000009DF02C1241-000000A173C00489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000686D0DE-000000067F00004005000060F3000689E295__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FA2AD30000004000-000000067F0000400500FA2AD30000030000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00009BF728-000000067F00004005016EA00C0000A575C7__00000196C9018F59-0000019A2EAFE7A9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30004374000-000000067F00004005000060F300043B0000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"00
08"},"000000067F00004005000060F300051F0000-000000067F00004005000060F300051F4000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006B22072-000000067F00004005000060F30006B4B0BB__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000328FA4E-000000067F00004005000060F50100000000__000000E4D847F4E0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000000FEA0-000000067F00004005016EA00C000001FD3E__0000018624969469-000001880F984A29":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500EB4A48000019F4DD-030000000000000000000000000000000002__000000F6661C9241-000000F901689359":{"file_size":59498496,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00003EC000-000000067F00004005016EA00C00003F8000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C000073C000-000000067F00004005016EA00C000074F43B__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30003542BFF-000000067F00004005000060F50100000000__000000EFDE07FFD8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70001771169-000000067F00004005000060F80100000000__000001398B56A519-0000013C9C0E3339":{"file_size":263454720,"generation":2,"shard":"0008"},"000000067F000040050081DB4300003B27DA-030000000000000000000000000000000002__0000008DDCD70B68":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F3000542AFB0-000000067F00004005000060F30005474062__0000015304A396B9-0000015670D6AFD9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000057C94F-000000067F00004005000060F80100000000__00000054161C34B8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300055861F2-000000067F00004005000060F30100000000__0000015304A396B9-0000015670D6AFD9":{"file_size":127393792,"generation":
2,"shard":"0008"},"000000067F00004005000060F30001D79136-000000067F00004005000060F30100000000__0000008DBE2855F9-000000923719A971":{"file_size":227958784,"generation":2,"shard":"0008"},"000000067F00004005000060F10000218000-000000067F00004005000060F1000021C000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001CD4000-000000067F00004005016EA00C0001CE0000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F300017EC000-000000067F00004005000060F30001886B2A__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001188000-000000067F00004005000060F300011D1111__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000ECC000-000000067F00004005000060FB0000F050F2__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300018C0000-000000067F00004005000060F300018E0FE6__00000075CC373F31-00000079F2A2F311":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00006E4000-000000067F00004005016EA00C0000738000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30002790000-000000067F00004005000060F30002794000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C00001B850B-000000067F0000400500F56D510100000000__0000011B688FEDC8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F100001F8000-000000067F00004005000060F100001FC000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000810000-000000067F00004005000060F80100000000__00000079F2A2F311-0000007E3A9BFD29":{"file_size":263454720,"generation":2,"shard":"0008"},"000000067F00004005000060F100006CBF87-000000067F00004005000060F20100000000__000000A5A3F27398":{"file_size":15851520,"generation":2,"shard":"
0008"},"000000067F0000400500F7D2DD0100000000-000000067F0000400500F8E3A50000014000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700010AABC7-000000067F00004005000060F80100000000__000000EFDE07FFD8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30003B80000-000000067F00004005000060F30003B84000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000078000-000000067F000040050081DB4300000AA080__00000075CC373F31-00000079F2A2F311":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002618000-000000067F00004005000060F30002680F9D__000000B2B5C4E8F9-000000B768469051":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A48000-000000067F00004005000060F30002A4C000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001994000-000000067F00004005000060F700019E8000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000B6168A-000000067F00004005000060FB0000B6A1D0__0000003579F03331-0000003959DA2DE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000147A0EC-000000067F00004005000060FB000148AC30__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC0000060000-000000067F0000400500EE16BC0000064000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003458D42-000000067F00004005000060F30003481DDB__000000E7C2F1B249-000000EBC9213D59":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006E30000-000000067F00004005000060F30006E34000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700017F8000-000000067F00004005000060F700017FC000__000001444EB7FC10":{"file_size":134422528,"generation":2,"sha
rd":"0008"},"000000067F00004005000060F30004C50000-000000067F00004005000060F30004C54000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001720000-000000067F00004005000060F80100000000__00000139CF156B58":{"file_size":63463424,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A8E15E-000000067F000040050081DB430000A98000__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":265404416,"generation":2,"shard":"0008"},"000000067F00004005000060F30004BAE526-000000067F00004005000060F30004BE7584__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001ADF97B-000000067F00004005016EA00C0001B0FD2A__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F60000014000-000000067F00004005000060F60100000000__0000003D2AB09B68":{"file_size":83329024,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C1C000-000000067F00004005000060FB0000C70000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005240000-000000067F00004005000060F30005244000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB43000077C000-000000067F000040050081DB430000790000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006D60000-000000067F00004005000060F30006D64000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004C54000-000000067F00004005000060F30004C60000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000000000000000001-000000067F0000400500000A690000000002__00000186146441F1-0000018624969469":{"file_size":57344,"generation":6,"shard":"0008"},"000000067F00004005000060F30005688000-000000067F00004005000060F3000568C000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"
000000067F00004005000060F30004370000-000000067F00004005000060F30004374000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300051F4000-000000067F00004005000060F30005210000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004DD8000-000000067F00004005000060F30004DDC000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500C782E400001AFD31-000000067F0000400500C782E400001B7C41__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000BB103B-000000067F00004005000060F60000014C3A__0000003579F03331-0000003959DA2DE9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F0000400500D19D030100000000-000000067F0000400500D69D790000024000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB43000028B253-030000000000000000000000000000000002__0000008196C976A1-0000008625CF2891":{"file_size":151224320,"generation":2,"shard":"0008"},"000000067F00004005000060F30004DD8000-000000067F00004005000060F30004E40FFC__000001440D3D0C69-0000014784964B91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005010F44EB0100000000-000000067F00004005010F57CB000000C000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003BCC000-000000067F00004005000060F30003C08000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005B80000-000000067F00004005000060F30005B89170__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000135FCAD-000000067F00004005016EA00C000144FB4E__000001A931C135B1-000001AC25760149":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005010660F500000B0000-000000067F00004005010660F500000B4000__000001180B3FF408":{"file_size":134422528,"gene
ration":2,"shard":"0008"},"000000067F00004005000060F30000D31030-000000067F00004005000060F30100000000__0000003D03FCCDB9-000000417D21ACF9":{"file_size":233791488,"generation":2,"shard":"0008"},"000000067F00004005000060F30002C18FAE-000000067F00004005000060F30002C71F27__000000C824C09619-000000CC13D2E549":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000041FB53-000000067F0000400500EB4A480000447A64__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC0000048000-000000067F0000400500EE16BC000004C000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00009D0000-000000067F00004005000060FB00009D4000__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100004365FE-000000067F00004005000060F20100000000__00000047F1F2B800":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30006BAD108-000000067F00004005000060F30006C0E146__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300006B4000-000000067F00004005000060F300006E0000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000327C000-000000067F00004005000060F3000328FA4E__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003B94000-000000067F00004005000060F30003BC8000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003CB8FCF-000000067F00004005000060F30003CCA0B9__00000117EDA82C11-0000011B632CC319":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003EA902F-000000067F00004005000060F30003F72201__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30004C64000-000000067F00004005000060F30004C80000__000001444EB7FC10":{"file
_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000194000-000000067F000040050081DB4300001C8000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB01FFFFFFFF-000000067F00004005000060FB0300000000__0000018613A0DEA9-00000186146441F1":{"file_size":73728,"generation":5,"shard":"0008"},"000000067F00004005000060F300038B5F5B-000000067F00004005000060F300038FF04F__0000010779A7F551-0000010A5E65DF39":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001C8000-000000067F000040050081DB4300001CC000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500C782E40000137F10-000000067F0000400500C782E40000177E20__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000139C000-000000067F00004005000060FB00013B8000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000447A64-000000067F0000400500EB4A480100000000__000000FCCD5238B1-000000FF8B261599":{"file_size":40550400,"generation":2,"shard":"0008"},"000000067F00004005000060F70000418000-000000067F00004005000060F700004405CF__0000003D03FCCDB9-000000417D21ACF9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F000040050081DB430000728000-000000067F000040050081DB43000072C000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300014B0F7B-000000067F00004005000060F30100000000__000000601F43CF09-000000636DE92159":{"file_size":83951616,"generation":2,"shard":"0008"},"000000067F00004005000060F30005F3303F-000000067F00004005000060F30005FA40AD__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300012442A9-000000067F00004005000060F3000129D29A__00000057593D8169-0000005C01565329":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010B14AB-00000006
7F000040050081DB430100000000__000000D037B2DBD0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00014CF88D-000000067F00004005016EA00C00014D7727__000001A931C135B1-000001AC25760149":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30006A0B44C-000000067F00004005000060F30006A7C566__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F1000062EE46-000000067F00004005000060F20100000000__000000698F2C3A38":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001CE0000-000000067F00004005016EA00C0001CE4000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F30000250000-000000067F00004005000060F30000254000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300050E8000-000000067F00004005000060F300050EC000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000259F4A3-000000067F00004005000060F30100000000__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":44433408,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A640EA-000000067F000040050081DB430000A8E15E__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003050000-000000067F00004005000060F30003061089__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C0000158000-000000067F0000400500F3A25C000016A065__0000010779A7F551-0000010A5E65DF39":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010A4000-000000067F000040050081DB4300010B14AB__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC00001E0000-000000067F0000400500EE16BC00001E4000__00000104BD37F348":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3
00055B8000-000000067F00004005000060F300055BC000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000CE4000-000000067F00004005016EA00C0000D30000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30003640000-000000067F00004005000060F30003644000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000014F7AC-000000067F0000400500EB4A4800001876BD__000000F6661C9241-000000F901689359":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001CD338E-000000067F00004005016EA00C0001CE79E0__000001BA93C39481-000001BCB572A4E1":{"file_size":268451840,"generation":17,"shard":"0008"},"000000067F00004005000060FB0001530B44-000000067F00004005000060FB0001541688__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300031D516C-000000067F00004005000060F30100000000__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":137863168,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00019C7A6A-000000067F00004005016EA00C00019F7907__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000E7F7A7-000000067F00004005016EA00C0000F3F647__0000019E2C5DCEE1-000001A1DD8B4481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F300032C0000-000000067F00004005000060F300032F1113__000000E4C63CFA21-000000E7C2F1B249":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00006E0000-000000067F00004005016EA00C00006E4000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F7000019EA78-000000067F00004005000060F80100000000__0000001737D88379-0000001B59EEB909":{"file_size":50946048,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001B4FBC9-000000067F00004005016EA00C0001BBFA66__000001B6FFE46BC9-000001BA93C39481":
{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001660000-000000067F00004005000060FB0001680B45__000000698AF6E809-0000006DDB29D589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002BAA1DD-000000067F00004005000060F30100000000__000000C462B3C2A9-000000C824C09619":{"file_size":203554816,"generation":2,"shard":"0008"},"000000067F00004005000060F300049B26A8-000000067F00004005000060F300049CB712__000001398B56A519-0000013C9C0E3339":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000CCB5CD-000000067F00004005000060F70000CDBB9C__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000EEA075-000000067F000040050081DB430000F0C0E9__000000C462B3C2A9-000000C824C09619":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300003E0000-000000067F00004005000060F300003E8FBC__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006C9C000-000000067F00004005000060F30006CA0000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000C7C000-000000067F00004005000060F70000C8CD0C__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001148000-000000067F00004005000060FB000114C000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001232ACF-000000067F00004005000060F80100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000FE8000-000000067F00004005000060F700010105DB__000000E4C63CFA21-000000E7C2F1B249":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000355928-000000067F0000400500EB4A480100000000__000000FCD84FE628":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F700003FE341-000000067F0000400500
0060F80100000000__0000003D2AB09B68":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000244D189-000000067F00004005000060F30100000000__000000A9EB8C4489-000000ACA44C8E99":{"file_size":212566016,"generation":2,"shard":"0008"},"000000067F00004005000060F700003B85C7-000000067F00004005000060F80100000000__0000003579F03331-0000003959DA2DE9":{"file_size":208945152,"generation":2,"shard":"0008"},"000000067F00004005000060F100005A2B80-000000067F00004005000060F20100000000__000000603CA8F2F0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB000070C000-000000067F00004005000060FB0000718000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB01FFFFFFFF-000000067F00004005000060FB0300000000__00000186146441F1-0000018624969469":{"file_size":24576,"generation":6,"shard":"0008"},"000000067F00004005000060FB000180C000-000000067F00004005000060FB0001838000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC0000044000-000000067F0000400500EE16BC0000048000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10100000000-000000067F00004005000060F10300000000__000000A583FBFB91-000000A9EB8C4489":{"file_size":483328,"generation":2,"shard":"0008"},"000000067F00004005000060F30004EA41A5-000000067F00004005000060F30004EC52E9__000001440D3D0C69-0000014784964B91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003AB9907-000000067F00004005000060F30003AF28CB__0000010FB1BE19B9-00000113456156F1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000974000-000000067F00004005000060FB00009D0000__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300038720A2-000000067F00004005000060F300038A3082__000001048B25A8E9-0000010779A7F551":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB43000
0452BA1-000000067F000040050081DB4300004C4C1E__000000923719A971-00000096262826C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300017AA0CE-000000067F00004005000060F30100000000__0000006DDB29D589-000000722F474369":{"file_size":202719232,"generation":2,"shard":"0008"},"000000067F000040050081DB430000504000-000000067F000040050081DB430000560000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004B5431C-000000067F00004005000060F30004B654F6__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000C20000-000000067F00004005000060F30000C24000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300028920E4-000000067F00004005000060F30100000000__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":200351744,"generation":2,"shard":"0008"},"000000067F000040050081DB4300004C4C1E-030000000000000000000000000000000002__000000923719A971-00000096262826C9":{"file_size":192356352,"generation":2,"shard":"0008"},"000000067F000040050081DB430000190000-000000067F000040050081DB430000194000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000E88000-000000067F000040050081DB430000E8C000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000738000-000000067F00004005016EA00C000073C000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB430000578EE6-000000067F000040050081DB43000058AF5E__0000009A1ABDE921-0000009DF02C1241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30001C38000-000000067F00004005000060F30001C3C000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000B7C0EA-030000000000000000000000000000000002__000000B2B5C4E8F9-000000B768469051":{"file_size":133464064,"generation":2,"s
hard":"0008"},"000000067F00004005000060F3000625B8F0-000000067F00004005000060F30100000000__0000016B49A934C1-0000016E1FBB7B99":{"file_size":139640832,"generation":2,"shard":"0008"},"000000067F00004005000060FB000109C000-000000067F00004005000060FB0001110000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB572DFF9-000001BCB5730259":{"file_size":24576,"generation":41,"shard":"0008"},"000000067F00004005000060FB0000AA8000-000000067F00004005000060FB0000AD0B45__0000003203FB5749-0000003579F03331":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300043F8000-000000067F00004005000060F300043FC000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800003C7C42-000000067F0000400500EB4A48000041FB53__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005BA213F-000000067F00004005000060F30005BDB15B__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300063FE10E-000000067F00004005000060F30100000000__0000016E1FBB7B99-000001715E483C79":{"file_size":111067136,"generation":2,"shard":"0008"},"000000067F00004005000060F30000F91FFF-000000067F00004005000060F30000F9B026__00000047E31D98D1-0000004C49155071":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003650000-000000067F00004005000060F30003654000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300050A412B-000000067F00004005000060F300050B5199__0000014784964B91-0000014B000D1821":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001D78000-000000067F00004005016EA00C0001D7C000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005016EA00C0001244000-000000067F00004005016EA00C0001298000__
000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F100001FC000-000000067F00004005000060F10000200000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000CA0000-000000067F00004005016EA00C0000CA4000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F3000498DC49-000000067F00004005000060F50100000000__00000139CF156B58":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F60000036EA0-000000067F00004005000060F60100000000__0000009A24DF6768":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000928B45-000000067F00004005000060FB000097168A__00000028C365FBE1-0000002D2A8E0B81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006854000-000000067F00004005000060F30006858000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050109FFA2000000C3F5-030000000000000000000000000000000002__00000117EDA82C11-0000011B632CC319":{"file_size":226066432,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A6D1B3-000000067F00004005000060F30100000000__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":117620736,"generation":2,"shard":"0008"},"000000067F00004005000060F30002D2C000-000000067F00004005000060F30002D80000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A31FB6-000000067F00004005000060F30003A3B020__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000160723E-000000067F00004005016EA00C00016570D9__000001AC25760149-000001AFC313C819":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500FB3D310000018000-000000067F0000400500FB3D31000001C000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001708000-000000067F00004005000060F7000170C0
00__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000283C3E7-000000067F00004005000060F50100000000__000000BAC0041E18":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB00018F0000-000000067F00004005000060FB0100000000__00000075CC373F31-00000079F2A2F311":{"file_size":268959744,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000EC8000-000000067F00004005000060FB0000ECC000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000F9C000-000000067F00004005016EA00C0000FF0000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30002680F9D-000000067F00004005000060F3000274A080__000000B2B5C4E8F9-000000B768469051":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000679C000-000000067F00004005000060F300067A0000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000428313F-000000067F00004005000060F300042CC1BD__0000012694E36301-0000012A3F140591":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00000FFFFFFFF-030000000000000000000000000000000002__00000186146441F1-0000018624969469":{"file_size":24576,"generation":6,"shard":"0008"},"000000067F00004005000060FB00017D8000-000000067F00004005000060FB00017DC000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700017FC000-000000067F00004005000060F70001828000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002FD317C-000000067F00004005000060F30002FF427D__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001701588-000000067F00004005000060FB00017120CE__0000006DDB29D589-000000722F474369":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500000A3000FFFFFFFF-000000067F00004
00500000A690000000002__000001BA93C39481-000001BCB572A4E1":{"file_size":40960,"generation":17,"shard":"0008"},"000000067F00004005000060FB0000638B45-030000000000000000000000000000000002__0000001B59EEB909-0000001FFBC01501":{"file_size":252010496,"generation":2,"shard":"0008"},"000000067F000040050081DB430000394000-000000067F000040050081DB4300003A8000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001CF0197-000000067F00004005000060F50100000000__0000008DDCD70B68":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800000DFB51-000000067F0000400500EB4A4800000E7A62__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000014C000-000000067F00004005000060F70000180000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005948000-000000067F00004005000060F300059790CD__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000853115-000000067F00004005000060F60100000000__00000023FEF9F321-00000028C365FBE1":{"file_size":176136192,"generation":2,"shard":"0008"},"000000067F00004005000060F30004884000-000000067F00004005000060F30004888000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000513C000-000000067F00004005000060F30005160000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C000017C000-000000067F0000400500F3A25C00001B850B__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006382F14-000000067F00004005000060F3000638C06D__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500E3A2A10000017F02-000000067F0000400500E3A2A100000B7E04__000000E7C2F1B249-000000EBC9213D59":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040
05000060FB0001000B44-000000067F00004005000060FB0001009688__0000004C49155071-0000004F31878919":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D69D790100000000-000000067F0000400500DBCED50000024000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010A0000-000000067F000040050081DB4300010A4000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000310000-000000067F00004005000060FB0000348B45__0000000D55A212C9-000000114A805939":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F60000060038-000000067F00004005000060F60100000000__000000F91FE84F08":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30001CE0000-000000067F00004005000060F30001CE4000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300000AA080-000000067F000040050081DB4300000D40FF__00000075CC373F31-00000079F2A2F311":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000551689-030000000000000000000000000000000002__0000001737D88379-0000001B59EEB909":{"file_size":227418112,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000D90000-000000067F00004005000060FB0100000000__0000003D03FCCDB9-000000417D21ACF9":{"file_size":272769024,"generation":2,"shard":"0008"},"000000067F00004005000060F300059CC403-000000067F00004005000060F300059F53C6__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30001F2C000-000000067F00004005000060F30001F30000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000014000-000000067F00004005000060FB0000084772__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F00004005000060F30004B654F6-000000067F00004005000060F30004BAE526__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generati
on":2,"shard":"0008"},"000000067F00004005000060F30002450000-000000067F00004005000060F30002454000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A0F066-000000067F00004005000060F50100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F60000032EBE-000000067F00004005000060F60100000000__0000008DDCD70B68":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB00001D8000-000000067F00004005000060FB00001DC000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000670000-000000067F00004005016EA00C0000674000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0001344000-000000067F00004005016EA00C0001358000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000D30000-000000067F00004005016EA00C0000D34000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C000012FE9A-000000067F00004005016EA00C00001F7D38__000001880F984A29-0000018C496B6DB1":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F70000BF0000-000000067F00004005000060F70100000000__000000B2B5C4E8F9-000000B768469051":{"file_size":273809408,"generation":2,"shard":"0008"},"000000067F00004005000060F300005A0000-000000067F00004005000060F3000067114B__0000001B59EEB909-0000001FFBC01501":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000021C000-000000067F0000400500EB4A480000290000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000F3C000-000000067F00004005016EA00C0000F58000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C000074F43B-030000000000000000000000000000000002__000001936E73D028":{"file_size":139264,"generation":11,"shard":"0008"},"0000000
67F00004005010F57CB000000C000-000000067F00004005010F99A50100000000__00000126C3C69FC0":{"file_size":22978560,"generation":2,"shard":"0008"},"000000067F00004005000060F700017E1391-000000067F00004005000060F80100000000__0000013C9C0E3339-0000013FEFA7D709":{"file_size":232677376,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001CC74D7-000000067F00004005016EA00C0001CD7376__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F700005C85CE-000000067F00004005000060F700005E8B9D__00000057593D8169-0000005C01565329":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FCD352-000000067F00004005000060F30100000000__0000011F1A40FA69-00000122A7BB7B29":{"file_size":124788736,"generation":2,"shard":"0008"},"000000067F0000400500C782E400002A5E4B-000000067F0000400500C782E400002CDD5C__000000D31E48D7C9-000000D74E29AAD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700018871D6-000000067F00004005000060F80100000000__000001444EB7FC10":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30003D252C8-000000067F00004005000060F30100000000__00000117EDA82C11-0000011B632CC319":{"file_size":205963264,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001408A62-000000067F00004005000060FB00014195A7__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500C782E400001B7C41-000000067F0000400500C782E400001C7B51__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000110000-000000067F00004005000060FB0100000000__000000044854EBD1-00000008B6B51879":{"file_size":272613376,"generation":2,"shard":"0008"},"000000067F00004005000060F300004E8000-000000067F00004005000060F60100000000__0000001737D88379-0000001B59EEB909":{"file_size":260579328,"generation":2,"shard":"0008"},"000000067F00004005000060F30006DF4000-000000067F00004005000060F300
06E30000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000C84000-030000000000000000000000000000000002__000000BAC0041E18":{"file_size":59998208,"generation":2,"shard":"0008"},"000000067F00004005000060F30002B88FF2-000000067F00004005000060F30002BAA1DD__000000C462B3C2A9-000000C824C09619":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000434000-000000067F00004005000060FB00004A0000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004DA8000-000000067F00004005000060F30004DAC000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300004E0000-000000067F000040050081DB4300004E4000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC00001E4000-000000067F0000400500EE16BC0000201716__00000104BD37F348":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000C440EA-000000067F000040050081DB430000C5E15B__000000B768469051-000000BAB1E56C91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D69D7900000BDAF5-000000067F0000400500D69D790100000000__000000EFDE07FFD8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A9C000-000000067F00004005000060F30002AEED02__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004DAC000-000000067F00004005000060F30004DD8000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B94000-000000067F00004005000060F70000B98000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002454000-000000067F00004005000060F30002460000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100001059CB-000000067F00004005000060F10000125BF2__000000114A805939-00000013FB921C81":{"
file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000D362CA-000000067F00004005016EA00C0000DB7D33__0000019E2C5DCEE1-000001A1DD8B4481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30001C0A0A3-000000067F00004005000060F30100000000__0000008625CF2891-00000089F4693119":{"file_size":203063296,"generation":2,"shard":"0008"},"000000067F00004005000060F300066F0000-000000067F00004005000060F300066F4000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001414000-000000067F00004005000060F70001428000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300014CC16D-000000067F00004005000060F300014D5280__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000172AC12-030000000000000000000000000000000002__0000006DDB29D589-000000722F474369":{"file_size":186875904,"generation":2,"shard":"0008"},"000000067F000040050081DB430000E4C000-000000067F000040050081DB430000E88000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300063A50CD-000000067F00004005000060F300063FE10E__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005419E9C-000000067F00004005000060F3000542AFB0__0000015304A396B9-0000015670D6AFD9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC000014158C-030000000000000000000000000000000002__000000F901689359-000000FCCD5238B1":{"file_size":67854336,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00015FF3A0-000000067F00004005016EA00C000160723E__000001AC25760149-000001AFC313C819":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005016EA00C00008E760F-000000067F00004005016EA00C00009274AB__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F000040
05000060F70000B98000-000000067F00004005000060F70000B9C000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00004A4000-000000067F00004005000060FB00004E1FF6__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006670000-000000067F00004005000060F30006674000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000185EE9-000000067F00004005000060F7000018E4B6__0000001737D88379-0000001B59EEB909":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D19D030000067CA9-030000000000000000000000000000000002__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":29319168,"generation":2,"shard":"0008"},"000000067F0000400500FF2A51000000BFFB-030000000000000000000000000000000002__0000010D77B487A0":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F30004A048A8-000000067F00004005000060F30004A1D870__000001398B56A519-0000013C9C0E3339":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300004BC000-000000067F00004005000060F300004C6B83__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005290FC9-000000067F00004005000060F3000533205E__0000014EC58A4A79-0000015304A396B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300031130BC-000000067F00004005000060F300031C40D1__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D19D030000047EE2-000000067F0000400500D19D03000004FDC6__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A44000-000000067F00004005000060F30002A48000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003DAE2DC-000000067F00004005000060F30003DD734C__0000011B632CC319-0000011F1A40FA69":{"file_size":268451840,"generati
on":2,"shard":"0008"},"000000067F0000400500F8E3A50000014000-000000067F0000400500F8E3A5000004A25C__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100002F03E9-000000067F00004005000060F20100000000__000000321AA80270":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70001138000-000000067F00004005000060F80100000000__000000FCCD5238B1-000000FF8B261599":{"file_size":72695808,"generation":2,"shard":"0008"},"000000067F00004005000060F300056E4000-000000067F00004005000060F50100000000__00000159B010F6C0":{"file_size":13393920,"generation":2,"shard":"0008"},"000000067F00004005000060F70000A7C000-000000067F00004005000060F70000ABD9C4__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000CC6E51-030000000000000000000000000000000002__0000003D2AB09B68":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F60000091EFF-000000067F00004005000060F60100000000__0000014EDD256548":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000008FC41-000000067F0000400500EB4A4800000DFB51__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30001F363B4-000000067F00004005000060F30001F574A6__0000009A1ABDE921-0000009DF02C1241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001CD0000-000000067F00004005016EA00C0001CD4000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F300059B324D-000000067F00004005000060F300059CC403__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002530000-000000067F00004005000060F30002534000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000004B633-000000067F00004005000060F60100000000__000000C483D0D6B8":{"file_size":24576,"generation":2,"shard":"0008"},"0
00000067F00004005000060F700011E0000-000000067F00004005000060F80100000000__0000010779A7F551-0000010A5E65DF39":{"file_size":262922240,"generation":2,"shard":"0008"},"000000067F00004005000060F30006690000-000000067F00004005000060F30006694000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000100E18-000000067F00004005000060F700001213F2__0000000D55A212C9-000000114A805939":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F0000400500FF2A510000004000-000000067F0000400500FF2A51000000BFFB__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000EB8000-000000067F00004005000060FB0000EBC000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000674000-000000067F00004005016EA00C00006B0000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70000EF85D6-000000067F00004005000060F80100000000__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":262897664,"generation":2,"shard":"0008"},"000000067F00004005000060F700005E8B9D-000000067F00004005000060F700005F9158__00000057593D8169-0000005C01565329":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30004E40FFC-000000067F00004005000060F30004E7A062__000001440D3D0C69-0000014784964B91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000037E20-000000067F0000400500EB4A480000057D31__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400501101C0901FFFFFFFF-030000000000000000000000000000000002__0000012E71CF31F9-000001334140FC21":{"file_size":65060864,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B10000-000000067F00004005000060F70100000000__000000A583FBFB91-000000A9EB8C4489":{"file_size":272646144,"generation":2,"shard":"0008"},"000000067F00004005000060F300056E104B-000000067F00004005000060F3000570A19E__00000159A7EC8C
B9-0000015DD1D3C809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300059790CD-000000067F00004005000060F300059AA115__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B54000-000000067F00004005000060F70000B90000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300041D9101-000000067F00004005000060F3000424A099__0000012694E36301-0000012A3F140591":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700000E085E-000000067F00004005000060F70000100E18__0000000D55A212C9-000000114A805939":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300051B0000-000000067F00004005000060F300051B4000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB572A4E1-000001BCB572C329":{"file_size":24576,"generation":17,"shard":"0008"},"000000067F00004005000060F30006D30000-000000067F00004005000060F30006D34000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FDA1F80000020D42-000000067F0000400500FDA1F80100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F000040050081D80C0100000000-000000067F000040050081DB430000024000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F600000235B4-000000067F00004005000060F60100000000__000000603CA8F2F0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500C782E400000A0000-000000067F0000400500C782E400000A4000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002264247-000000067F00004005000060F50100000000__000000A5A3F27398":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000302C2D6-000000067F00004005000060F50100000000__000000DBD29DC248":{"
file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000129C000-000000067F00004005016EA00C0001340000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F700016E8000-000000067F00004005000060F700016EC000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300023A0000-000000067F00004005000060F300023B0FF7__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F20100000000-000000067F00004005000060F3000000C000__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F00004005016EA00C0000374000-000000067F00004005016EA00C00003E0000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70000368000-000000067F00004005000060F80100000000__0000003203FB5749-0000003579F03331":{"file_size":263249920,"generation":2,"shard":"0008"},"000000067F000040050081DB4300006310C9-030000000000000000000000000000000002__0000009A1ABDE921-0000009DF02C1241":{"file_size":208953344,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000DC8000-000000067F00004005000060FB0000DE8B45__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000530000-000000067F00004005000060FB0000538B44__0000001737D88379-0000001B59EEB909":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000024000-000000067F000040050081DB430000028000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000488C000-000000067F00004005000060F30004898000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300044D3639-000000067F00004005000060F50100000000__0000012E77D3BF00":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005010450640000000570-000000067F0000400501046F39000000BDD2__0000010FB1BE19B9-
00000113456156F1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300021050B0-000000067F00004005000060F3000212E160__0000009DF02C1241-000000A173C00489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700010DD440-000000067F00004005000060F80100000000__000000F309FCDD19-000000F6661C9241":{"file_size":91758592,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000AD0B45-000000067F00004005000060FB0000AE168A__0000003203FB5749-0000003579F03331":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000013B18E-000000067F00004005000060F7000014B73D__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001938000-000000067F00004005016EA00C000193FE9D__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500C782E400000A4000-000000067F0000400500C782E4000012A71E__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001A40000-000000067F00004005000060F30001A44000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00008578D4-000000067F00004005016EA00C00008CF772__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30001CC0000-000000067F00004005000060F30001CC4000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004D20000-000000067F00004005000060F30004D24000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00003E8000-000000067F00004005016EA00C00003EC000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F300039C4000-000000067F00004005000060F300039F8000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005164000-000000067F
00004005000060F300051B0000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300039F8000-000000067F00004005000060F300039FC000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010F46BD-000000067F000040050081DB430100000000__000000D31E48D7C9-000000D74E29AAD1":{"file_size":113999872,"generation":2,"shard":"0008"},"000000067F00004005000060F30002E630CF-000000067F00004005000060F30100000000__000000D31E48D7C9-000000D74E29AAD1":{"file_size":171999232,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000ACF305-000000067F00004005016EA00C0000ADF1AB__00000196C9018F59-0000019A2EAFE7A9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30006748000-000000067F00004005000060F3000674C000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003810000-000000067F00004005000060F50100000000__00000104BD37F348":{"file_size":11739136,"generation":2,"shard":"0008"},"000000067F00004005000060F1000021C000-000000067F00004005000060F20100000000__0000002427BD8BD0":{"file_size":132448256,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00017EC000-000000067F00004005016EA00C00018C0000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F7000025DA3C-000000067F00004005000060F80100000000__0000002427BD8BD0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB00007F0000-000000067F00004005000060FB0000860B45__00000023FEF9F321-00000028C365FBE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FF0000-000000067F00004005000060F30003FF4000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000E0AD15-000000067F00004005000060FB0000E1B859__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005010ADFA80000004000-0000
00067F00004005010F2BD40100000000__00000126C3C69FC0":{"file_size":13369344,"generation":2,"shard":"0008"},"000000067F00004005000060F30004898000-000000067F00004005000060F3000489C000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003D2B1B0-000000067F00004005000060F30003D44283__0000011B632CC319-0000011F1A40FA69":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000FF4000-000000067F00004005016EA00C0001188000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005010F99A50100000000-000000067F00004005010F9F120000004000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001F34000-000000067F00004005000060F30001F38F48__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700018A0000-000000067F00004005000060F700018D85CA__000001440D3D0C69-0000014784964B91":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300029A526C-000000067F00004005000060F300029C623C__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00017DC000-000000067F00004005000060FB0001808000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED50000024000-000000067F0000400500DBCED50000028000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC0000201716-000000067F0000400500EE16C40100000000__0000012A77C1B0B0":{"file_size":32768,"generation":2,"shard":"0008"},"000000067F00004005000060F30006D10000-000000067F00004005000060F30006D14000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430001064000-000000067F000040050081DB4300010A0000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C01FFFFFFFF-000000067F0000400500F3A25C0300000
000__0000011B632CC319-0000011F1A40FA69":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30001340000-000000067F00004005000060F30001344000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003E98000-000000067F00004005000060F30003EA902F__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006C0E146-000000067F00004005000060F30006C8729E__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F600000166C4-000000067F00004005000060F60100000000__0000003D03FCCDB9-000000417D21ACF9":{"file_size":54165504,"generation":2,"shard":"0008"},"000000067F00004005000060F10000180000-000000067F00004005000060F1000018821A__0000001737D88379-0000001B59EEB909":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000193FE9D-000000067F00004005016EA00C0001967D34__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F000040050081DB43000076C000-000000067F000040050081DB430000778000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300050321C0-000000067F00004005000060F30005063187__0000014784964B91-0000014B000D1821":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000D4000-000000067F0000400500DBCED500000F0000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300004B8000-000000067F00004005000060F300004BC000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000022C000-000000067F00004005000060FB0000280000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000DF968A-000000067F00004005000060FB0000E021D0__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000
4005000060FB0000228000-000000067F00004005000060FB000022C000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00015D8000-000000067F00004005000060FB00015DC000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005B89170-000000067F00004005000060F30005BA213F__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300043B0000-000000067F00004005000060F300043B4000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300004F8000-000000067F000040050081DB4300004FC000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006860000-000000067F00004005000060F30006864000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000ADA0D0-000000067F00004005000060F30000B0300C__0000003203FB5749-0000003579F03331":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FF2A510000000000-000000067F000040050100D04D000004369C__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C00000BB439-030000000000000000000000000000000002__00000104BD37F348":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001C078FA-000000067F00004005016EA00C0001C0F79A__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F000040050081DB430000B4A075-000000067F000040050081DB430000B7C0EA__000000B2B5C4E8F9-000000B768469051":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000117C10C-000000067F00004005000060F50100000000__00000054161C34B8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000E47BD2-000000067F00004005016EA00C0000E67A6E__0000019E2C5DCEE1-000001A1DD8B4481":{"file_size":268451840,"generation":11,"shard":"0
008"},"000000067F00004005000060F30005D23BB5-000000067F00004005000060F50100000000__00000164EA9EC9A8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000336D193-000000067F00004005000060F3000337DCF3__000000E4C63CFA21-000000E7C2F1B249":{"file_size":259473408,"generation":2,"shard":"0008"},"000000067F00004005000060F300001F0000-000000067F00004005000060F300001F4000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000084772-030000000000000000000000000000000002__000000027AF9D7D0":{"file_size":147456,"generation":1,"shard":"0008"},"000000067F00004005016EA00C0001CE79E0-000000067F00004005016EA00C0001D1F87B__000001BA93C39481-000001BCB572A4E1":{"file_size":268451840,"generation":17,"shard":"0008"},"000000067F0000400500EB4A4800FFFFFFFF-000000067F0000400500EB4A480100000000__000000FF8B261599-000001048B25A8E9":{"file_size":1318912,"generation":2,"shard":"0008"},"000000067F00004005000060F70000488000-000000067F00004005000060F7000048C000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000ADF1AB-000000067F00004005016EA00C0100000000__00000196C9018F59-0000019A2EAFE7A9":{"file_size":282132480,"generation":11,"shard":"0008"},"000000067F00004005000060FB000071C000-000000067F00004005000060FB0000793506__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006850000-000000067F00004005000060F30006854000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000390000-000000067F000040050081DB430000394000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000020C000-000000067F00004005000060F30000250000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001398000-000000067F00004005000060FB000139C000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"00000006
7F00004005000060F30003648000-000000067F00004005000060F3000364C000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500C782E400001C7B51-000000067F0000400500C782E4000023FA62__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001788000-000000067F00004005016EA00C000178C000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB430000C3A075-000000067F000040050081DB430000C440EA__000000B768469051-000000BAB1E56C91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300036FE561-000000067F00004005000060F300038075AF__000000FF8B261599-000001048B25A8E9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D19D03000004FDC6-000000067F0000400500D19D030000067CA9__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C00000-000000067F00004005000060FB0000C04000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000282C000-000000067F00004005000060F3000283C3E7__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00006B0000-000000067F00004005016EA00C00006B4000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30001789027-000000067F00004005000060F300017AA0CE__0000006DDB29D589-000000722F474369":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30004558000-000000067F00004005000060F300045C1062__0000012E71CF31F9-000001334140FC21":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C08000-000000067F00004005000060FB0000C0C000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006DCC000-000000067F00004005000060F30006DF0000__000001848D082B20":{"file_size":134422528,"generation"
:2,"shard":"0008"},"000000067F00004005000060F30004B221FE-000000067F00004005000060F30004B2B250__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00018C4000-000000067F00004005016EA00C00018E0000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB430000564000-000000067F000040050081DB430000578000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000274A080-000000067F00004005000060F30100000000__000000B2B5C4E8F9-000000B768469051":{"file_size":199057408,"generation":2,"shard":"0008"},"000000067F00004005000060F300046D0EA8-000000067F00004005000060F3000471200E__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001114000-000000067F00004005000060FB0001120000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FEC000-000000067F00004005000060F30003FF0000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10000368000-000000067F00004005000060F10100000000__0000003959DA2DE9-0000003D03FCCDB9":{"file_size":269967360,"generation":2,"shard":"0008"},"000000067F0000400500C782E4000012A71E-030000000000000000000000000000000002__000000D037B2DBD0":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F30006C98000-000000067F00004005000060F30006C9C000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300055BC000-000000067F00004005000060F30005610000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000F050F2-030000000000000000000000000000000002__00000047F1F2B800":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30002484000-000000067F00004005000060F300024D8000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0
008"},"000000067F00004005000060F30003FE8000-000000067F00004005000060F30003FEC000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000A8000-000000067F0000400500DBCED500000AC000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700006C3D76-000000067F00004005000060F80100000000__000000663565F8C9-000000698AF6E809":{"file_size":139821056,"generation":2,"shard":"0008"},"000000067F00004005000060F30002534000-000000067F00004005000060F3000253B7A3__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000412D27C-000000067F00004005000060F30004156457__00000122A7BB7B29-0000012694E36301":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000910000-000000067F00004005000060F700009385D4__0000008DBE2855F9-000000923719A971":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30002510000-000000067F00004005000060F30002514000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002210000-000000067F00004005000060F30002214000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FF4000-000000067F00004005000060F30004070000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001BBFA66-000000067F00004005016EA00C0001C078FA__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F3000424A099-000000067F00004005000060F3000428313F__0000012694E36301-0000012A3F140591":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300036F91FE-000000067F00004005000060F30100000000__000000FCCD5238B1-000000FF8B261599":{"file_size":164118528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000718000-000000067F00004005000060FB000071C000__0000002427BD8BD0":{"file_size":13442252
8,"generation":2,"shard":"0008"},"000000067F00004005010F44EB000000C000-000000067F00004005010F44EB0100000000__00000126C3C69FC0":{"file_size":70696960,"generation":2,"shard":"0008"},"000000067F00004005000060F30005214000-000000067F00004005000060F30005240000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000A7AF6E-030000000000000000000000000000000002__000000321AA80270":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30005063187-000000067F00004005000060F300050A412B__0000014784964B91-0000014B000D1821":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100005E8000-000000067F00004005000060F100005F821C__000000636DE92159-000000663565F8C9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300020830BE-000000067F00004005000060F300020FC052__0000009DF02C1241-000000A173C00489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300065BB235-000000067F00004005000060F300065F42B4__000001715E483C79-000001751A7D7589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FA2AD30000034000-000000067F0000400500FA2AD3000004D85C__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00017A8000-000000067F00004005016EA00C00017AC000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060FB00008D8000-000000067F00004005000060FB0000928B45__00000028C365FBE1-0000002D2A8E0B81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000798000-000000067F00004005000060F300007C1007__00000023FEF9F321-00000028C365FBE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D19D030000040000-000000067F0000400500D19D030000047EE2__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30001AB1583-000000067F00004005000060F50100000000_
_00000081AA3C40F0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30001AD8000-000000067F00004005000060F30001B09104__0000008196C976A1-0000008625CF2891":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000E1B859-030000000000000000000000000000000002__000000417D21ACF9-00000044B4679349":{"file_size":156844032,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001E9C000-000000067F00004005000060FB0001EA8000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001374000-000000067F00004005000060FB0001398000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000155C000-000000067F00004005000060FB0001590000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C00000EA069-000000067F0000400500F3A25C000010C0D1__000001048B25A8E9-0000010779A7F551":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000568C000-000000067F00004005000060F30005698000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C74000-000000067F00004005000060FB0000C98000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700004F0000-000000067F00004005000060F80100000000__00000047E31D98D1-0000004C49155071":{"file_size":264921088,"generation":2,"shard":"0008"},"000000067F00004005000060F30005598000-000000067F00004005000060F3000559C000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001429534-000000067F00004005000060F80100000000__00000122A7BB7B29-0000012694E36301":{"file_size":231964672,"generation":2,"shard":"0008"},"000000067F00004005000060F70000780000-000000067F00004005000060F80100000000__000000722F474369-00000075CC373F31":{"file_size":263340032,"generation":2,"shard":"0008"},"000000067F00004005000060F300019F31AA-000000067F00004
005000060F30100000000__00000079F2A2F311-0000007E3A9BFD29":{"file_size":168484864,"generation":2,"shard":"0008"},"000000067F000040050081DB430000822079-000000067F000040050081DB43000082C0F1__000000A583FBFB91-000000A9EB8C4489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300007AC000-000000067F000040050081DB4300007F913A__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005847319-000000067F00004005000060F300058C8000__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":261505024,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001E21687-000000067F00004005000060FB0100000000__000000923719A971-00000096262826C9":{"file_size":224403456,"generation":2,"shard":"0008"},"000000067F00004005000060F30003C98000-000000067F00004005000060F30003CB8FCF__00000117EDA82C11-0000011B632CC319":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB43000045029C-030000000000000000000000000000000002__0000008DBE2855F9-000000923719A971":{"file_size":89505792,"generation":2,"shard":"0008"},"000000067F00004005000060F3000559C000-000000067F00004005000060F300055B8000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000285901B-000000067F00004005000060F300028920E4__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E64000-000000067F00004005000060F30000E70000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300015FB022-000000067F00004005000060F3000160410C__000000698AF6E809-0000006DDB29D589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006FDA081-000000067F00004005000060F30100000000__00000184624E5741-000001860C80A151":{"file_size":202276864,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000107973-000000067F0000400500EE16BC0100000000__000000F309FCDD19-000000F6661C9241":{"file_size":2
75456000,"generation":2,"shard":"0008"},"000000067F00004005000060F300031C40D1-000000067F00004005000060F300031D516C__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00001F7D38-000000067F00004005016EA00C000020FBCF__000001880F984A29-0000018C496B6DB1":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500FDA1F80100000000-000000067F0000400500FF2A510000004000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001182EC9-000000067F00004005000060F80100000000__000000FF8B261599-000001048B25A8E9":{"file_size":174284800,"generation":2,"shard":"0008"},"000000067F00004005000060F700011528FB-000000067F00004005000060F70001182EC9__000000FF8B261599-000001048B25A8E9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300024DC000-000000067F00004005000060F30002510000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00000B0000-030000000000000000000000000000000002__000000021DC73119-000000044854EBD1":{"file_size":259375104,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001DF0B43-000000067F00004005000060FB0001E21687__000000923719A971-00000096262826C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F10000088000-000000067F00004005000060F10000090000__00000008B6B51879-0000000D55A212C9":{"file_size":264142848,"generation":2,"shard":"0008"},"000000067F00004005000060F30003968000-000000067F00004005000060F3000396C000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00017AC000-000000067F00004005016EA00C00017E8000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F1000019C73D-000000067F00004005000060F20100000000__0000001B59EEB909-0000001FFBC01501":{"file_size":124698624,"generation":2,"shard":"0008"},"000000067F00004005000060F700001F8000-00000006
7F00004005000060F700002005D2__0000001B59EEB909-0000001FFBC01501":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001110000-000000067F00004005000060FB0001114000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F1000019842A-000000067F00004005000060F20100000000__0000001737D88379-0000001B59EEB909":{"file_size":145137664,"generation":2,"shard":"0008"},"000000067F00004005000060F700003BC000-000000067F00004005000060F700003C0000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000280000-000000067F00004005000060FB0000284000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED5000007C000-000000067F0000400500DBCED500000A8000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB5732691-000001BCB5734CD9":{"file_size":24576,"generation":239,"shard":"0008"},"000000067F00004005010660F70100000000-000000067F000040050107B547000006C000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000C24000-000000067F00004005000060F30000CA0000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000569C000-000000067F00004005000060F300056D8000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00000C7A73-030000000000000000000000000000000002__0000018624969469-000001880F984A29":{"file_size":40566784,"generation":11,"shard":"0008"},"000000067F00004005000060F30001344000-000000067F00004005000060F30001358000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001F38F48-000000067F00004005000060F50100000000__0000009A24DF6768":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30001760000-000000067F000040050000
60F30001789027__0000006DDB29D589-000000722F474369":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F1000018821A-000000067F00004005000060F1000019842A__0000001737D88379-0000001B59EEB909":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300059AA115-000000067F00004005000060F300059B324D__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001400000-000000067F00004005000060FB0001404000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800000E7A62-000000067F0000400500EB4A480000107973__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000498000-000000067F00004005000060F3000049C000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D24000-000000067F00004005000060F70000D38000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB43000120E409-000000067F000040050081DB430300000000__0000018613F0A050":{"file_size":24576,"generation":3,"shard":"0008"},"000000067F00004005000060FB0001A8A1CD-000000067F00004005000060FB0100000000__0000007E3A9BFD29-0000008196C976A1":{"file_size":199622656,"generation":2,"shard":"0008"},"000000067F00004005000060F30006270000-000000067F00004005000060F50100000000__0000016E41E03CA0":{"file_size":71114752,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000BAAD15-030000000000000000000000000000000002__0000003579F03331-0000003959DA2DE9":{"file_size":182321152,"generation":2,"shard":"0008"},"000000067F00004005000060F700016205B5-000000067F00004005000060F80100000000__0000012E71CF31F9-000001334140FC21":{"file_size":266862592,"generation":2,"shard":"0008"},"000000067F00004005000060F300030C0FE5-000000067F00004005000060F30003102107__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":268451840,"generation":2,"shard":"0008"},"0000
00067F00004005016EA00C00004BC000-000000067F00004005016EA00C00004E8000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F10000440000-000000067F00004005000060F1000046821B__00000047E31D98D1-0000004C49155071":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F000040050081DB4300009C8000-000000067F000040050081DB4300009CC000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000106C000-000000067F00004005000060F700010AABC7__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000367733F-000000067F00004005000060F50100000000__000000F91FE84F08":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000478000-000000067F00004005016EA00C000047C000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30002E4104A-000000067F00004005000060F30002E4A157__000000D31E48D7C9-000000D74E29AAD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001370000-000000067F00004005000060FB0001374000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004B1111A-000000067F00004005000060F30004B221FE__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000029C000-000000067F00004005016EA00C00002D0000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F30001C3C000-000000067F00004005000060F30001CC0000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000136C000-000000067F00004005000060FB0001370000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10000488000-000000067F00004005000060F10100000000__0000004C49155071-0000004F31878919":{"file_size":268754944,"generation":2,"shard":"0008"},"000000067F000
04005000060F30000B0300C-000000067F00004005000060F60100000000__0000003203FB5749-0000003579F03331":{"file_size":212885504,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001C0F79A-000000067F00004005016EA00C0001C3F636__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F3000399C000-000000067F00004005000060F300039A0000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001574000-000000067F00004005000060F700015A195C__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005B00697-000000067F00004005000060F30100000000__0000015DD1D3C809-0000016143292911":{"file_size":282025984,"generation":2,"shard":"0008"},"000000067F00004005000060F300050C8000-000000067F00004005000060F300050CC000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700000885C5-000000067F00004005000060F80100000000__000000044854EBD1-00000008B6B51879":{"file_size":253878272,"generation":2,"shard":"0008"},"000000067F00004005000060F30001407F7A-000000067F00004005000060F50100000000__000000603CA8F2F0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B90000-000000067F00004005000060F70000B94000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000560000-000000067F000040050081DB430000564000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001720000-000000067F00004005000060F700017405D4__000001398B56A519-0000013C9C0E3339":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300043CC000-000000067F00004005000060F300043F8000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000129D29A-000000067F00004005000060F30100000000__00000057593D8169-0000005C01565329":{"file_size":110788608,"generation":2,"shard"
:"0008"},"000000067F00004005000060F300003F9F83-000000067F00004005000060F30000402F4A__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001940000-000000067F00004005000060F700019685CE__0000014784964B91-0000014B000D1821":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300043B8000-000000067F00004005000060F300043BC000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000370FD1-000000067F00004005000060F60100000000__0000000D55A212C9-000000114A805939":{"file_size":232144896,"generation":2,"shard":"0008"},"000000067F00004005000060F30003849093-000000067F00004005000060F300038720A2__000001048B25A8E9-0000010779A7F551":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100003C0432-000000067F00004005000060F20100000000__0000003D03FCCDB9-000000417D21ACF9":{"file_size":262701056,"generation":2,"shard":"0008"},"000000067F00004005000060F700014F85DF-000000067F00004005000060F70001510BBE__0000012694E36301-0000012A3F140591":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F3000253B7A3-000000067F00004005000060F50100000000__000000AFE87558B0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001404000-000000067F00004005000060FB0001408000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003F942CF-000000067F00004005000060F30003FCD352__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000B38000-000000067F00004005000060FB0000B58B45__0000003579F03331-0000003959DA2DE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B505C8-000000067F00004005000060F80100000000__000000A9EB8C4489-000000ACA44C8E99":{"file_size":226459648,"generation":2,"shard":"0008"},"000000067F00004005000060F3000612D506-000000067F00004005000060F300
06166575__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700000DC000-000000067F00004005000060F700000E0000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FB3D31000000C000-000000067F0000400500FB3D310000018000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB572C329-000001BCB572C481":{"file_size":24576,"generation":19,"shard":"0008"},"000000067F00004005000060F30002828000-000000067F00004005000060F3000282C000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300015B0000-000000067F00004005000060F300015B4000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED50000078000-000000067F0000400500DBCED5000007C000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB43000086E169-030000000000000000000000000000000002__000000A583FBFB91-000000A9EB8C4489":{"file_size":77471744,"generation":2,"shard":"0008"},"000000067F0000400501046F39000000BDD2-000000067F00004005010660F500000161F7__0000010FB1BE19B9-00000113456156F1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FB3D3101FFFFFFFF-000000067F0000400500FB3D310300000000__00000122A7BB7B29-0000012694E36301":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC00000F28ED-030000000000000000000000000000000002__000000F91FE84F08":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30004E9307A-000000067F00004005000060F30004EA41A5__000001440D3D0C69-0000014784964B91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00016D21CF-030000000000000000000000000000000002__000000698AF6E809-0000006DDB29D589":{"file_size":226353152,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000187
6BD-000000067F0000400500EB4A48000018F5CD__000000F6661C9241-000000F901689359":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500C782E400002E5B84-030000000000000000000000000000000002__000000DBD29DC248":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D8985C-000000067F00004005000060F70000DA1E38__000000C462B3C2A9-000000C824C09619":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000C28000-000000067F000040050081DB430000C3A075__000000B768469051-000000BAB1E56C91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000407201D-000000067F00004005000060F300040E319D__00000122A7BB7B29-0000012694E36301":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F6000002B3CE-000000067F00004005000060F60100000000__00000075E5D2A930":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D60000-000000067F00004005000060F80100000000__000000C483D0D6B8":{"file_size":133947392,"generation":2,"shard":"0008"},"000000067F00004005000060F70000F705D6-000000067F00004005000060F80100000000__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":259842048,"generation":2,"shard":"0008"},"000000067F00004005000060F30004E7A062-000000067F00004005000060F30004E9307A__000001440D3D0C69-0000014784964B91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006810000-000000067F00004005000060F30006814000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700007D05C8-000000067F00004005000060F80100000000__00000075CC373F31-00000079F2A2F311":{"file_size":251740160,"generation":2,"shard":"0008"},"000000067F00004005000000000000000001-000000067F0000400500000A690000000002__0000018624969469-000001880F984A29":{"file_size":40960,"generation":11,"shard":"0008"},"000000067F00004005000060FB00014D8000-000000067F00004005000060FB0001530B44__000000636DE92159-000000663565F8C9":{"file_size":26845184
0,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001EA8000-000000067F00004005000060FB0001EAC000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000230A0C7-000000067F00004005000060F30100000000__000000A583FBFB91-000000A9EB8C4489":{"file_size":213680128,"generation":2,"shard":"0008"},"000000067F00004005000060F30000A98000-000000067F00004005000060F30000AC9024__0000003203FB5749-0000003579F03331":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003F72201-000000067F00004005000060F30003F7B254__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000498000-000000067F00004005016EA00C000049C000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30004CB8000-000000067F00004005000060F30004CBC000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300042CC1BD-000000067F00004005000060F300042D51D6__0000012694E36301-0000012A3F140591":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FB3D310000028681-000000067F0000400500FB3D320100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000474302B-000000067F00004005000060F300047EC0CA__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003204000-000000067F00004005000060F30003278000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300024020ED-000000067F00004005000060F3000240B12A__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000216C000-000000067F00004005000060F30002170000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000005DD43-000000067F00004005000060F60100000000__000000EFDE07FFD8
":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000348B45-000000067F00004005000060FB000037968A__0000000D55A212C9-000000114A805939":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000778000-000000067F000040050081DB43000077C000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300011B4000-000000067F000040050081DB43000120E409__000000DBD29DC248":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003CCA0B9-000000067F00004005000060F30003D0B155__00000117EDA82C11-0000011B632CC319":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00009D4000-000000067F00004005000060FB0000A7AF6E__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700008F0000-000000067F00004005000060F80100000000__00000089F4693119-0000008DBE2855F9":{"file_size":262905856,"generation":2,"shard":"0008"},"000000067F00004005000060F30006CA0000-000000067F00004005000060F30006CA4000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000E021D0-000000067F00004005000060FB0000E0AD15__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003654000-000000067F00004005000060F3000367733F__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000DC0000-000000067F00004005000060F70000DE05C8__000000C824C09619-000000CC13D2E549":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F700018D85CA-000000067F00004005000060F80100000000__000001440D3D0C69-0000014784964B91":{"file_size":260775936,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000EAC000-000000067F00004005000060FB0000EB8000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E70000-000000067F00004005000060F30000E7
4000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005FE621A-000000067F00004005000060F30005FFF23F__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D20000-000000067F00004005000060F70000D24000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005244000-000000067F00004005000060F3000525C065__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400501025D9001FFFFFFFF-000000067F0000400501025D900300000000__0000011B632CC319-0000011F1A40FA69":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30001CD4000-000000067F00004005000060F30001CE0000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000E77906-000000067F00004005016EA00C0000E7F7A7__0000019E2C5DCEE1-000001A1DD8B4481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F300046B41AA-000000067F00004005000060F30100000000__0000012E71CF31F9-000001334140FC21":{"file_size":199688192,"generation":2,"shard":"0008"},"000000067F000040050100D04D00000634BB-030000000000000000000000000000000002__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":173744128,"generation":2,"shard":"0008"},"000000067F00004005000060F30000CA4000-000000067F00004005000060F30000CB16B6__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004DDC000-000000067F00004005000060F30004DF086C__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005D7F2DE-000000067F00004005000060F30005DA03A8__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300048A0000-000000067F00004005000060F300048A4000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100003954D3-000000067
F00004005000060F20100000000__0000003D2AB09B68":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300043BC000-000000067F00004005000060F300043C8000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001D1C000-000000067F00004005016EA00C0001D78000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F100000D8000-000000067F00004005000060F100000E021B__0000000D55A212C9-000000114A805939":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300060A0282-000000067F00004005000060F300060A93B5__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F1000021D8F8-000000067F00004005000060F20100000000__00000023FEF9F321-00000028C365FBE1":{"file_size":88227840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000018000-000000067F00004005000060F3000001C000__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F000040050081DB430000E48000-000000067F000040050081DB430000E4C000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300003E8FBC-000000067F00004005000060F300003F9F83__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30004868000-000000067F00004005000060F3000486C000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700013D0000-000000067F00004005000060F700013E85D1__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001203856-030000000000000000000000000000000002__0000005413AB3641-00000057593D8169":{"file_size":157130752,"generation":2,"shard":"0008"},"000000067F00004005000060F3000029C000-000000067F00004005000060F300002C4887__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060
F30005160000-000000067F00004005000060F30005164000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FB3D31000001C000-000000067F0000400500FB3D310000028681__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000029F90B-000000067F00004005016EA00C00002D77AE__000001880F984A29-0000018C496B6DB1":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30003620000-000000067F00004005000060F30100000000__000000F309FCDD19-000000F6661C9241":{"file_size":249372672,"generation":2,"shard":"0008"},"000000067F00004005000060F30003B90000-000000067F00004005000060F30003B94000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300001F4000-000000067F00004005000060F30000208000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001BB8000-000000067F00004005000060F30001C00FE1__0000008625CF2891-00000089F4693119":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005210000-000000067F00004005000060F30005214000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002070F71-000000067F00004005000060F30002079FDE__0000009DF02C1241-000000A173C00489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000B40000-000000067F00004005000060F30000BB103B__0000003579F03331-0000003959DA2DE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F10000290000-000000067F00004005000060F10000298000__00000028C365FBE1-0000002D2A8E0B81":{"file_size":264134656,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00007C7B9C-000000067F00004005016EA00C0000807A34__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001548000-000000067F00004005000060FB000154C000__000000698F2C3A38":{"file_size":134422528,"generation
":2,"shard":"0008"},"000000067F00004005000060F100005FC000-000000067F00004005000060F1000062EE46__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC00001A0000-000000067F0000400500EE16BC00001A4000__00000104BD37F348":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000F94000-000000067F00004005016EA00C0000F98000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70000290000-000000067F00004005000060F80100000000__00000023FEF9F321-00000028C365FBE1":{"file_size":265764864,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001BC0B44-000000067F00004005000060FB0001BD1689__0000008625CF2891-00000089F4693119":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000337DCF2-000000067F00004005000060F30003386D10__000000E7C2F1B249-000000EBC9213D59":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300045C1062-000000067F00004005000060F3000460202F__0000012E71CF31F9-000001334140FC21":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006814000-000000067F00004005000060F30006850000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000073DFA8-000000067F00004005016EA00C000079FCFA__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005016EA00C000178C000-000000067F00004005016EA00C00017A8000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F1000051D1AE-000000067F00004005000060F20100000000__00000057593D8169-0000005C01565329":{"file_size":103145472,"generation":2,"shard":"0008"},"000000067F00004005000060F300034BD86C-000000067F00004005000060F30100000000__000000EBC9213D59-000000EFA7EAA9E1":{"file_size":95617024,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000008000-000000067F00004005016EA00C000000FEA0__0000018
624969469-000001880F984A29":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F1000014C000-000000067F00004005000060F1000015F545__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FB3D300000000EAB-000000067F0000400500FB3D300100000000__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":12976128,"generation":2,"shard":"0008"},"000000067F000040050081DB430000028000-000000067F000040050081DB43000002C000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001BD1689-000000067F00004005000060FB0100000000__0000008625CF2891-00000089F4693119":{"file_size":223690752,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000000000-000000067F0000400500EB4A480000000001__000000FF8B261599-000001048B25A8E9":{"file_size":32768,"generation":2,"shard":"0008"},"000000067F00004005000060F30003D952B0-000000067F00004005000060F30003DAE2DC__0000011B632CC319-0000011F1A40FA69":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B30000-000000067F00004005000060F70000B505C8__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F3000549D0A6-000000067F00004005000060F300055861F2__0000015304A396B9-0000015670D6AFD9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F1000046821B-000000067F00004005000060F20100000000__00000047E31D98D1-0000004C49155071":{"file_size":266969088,"generation":2,"shard":"0008"},"000000067F00004005000060F300043C8000-000000067F00004005000060F300043CC000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001E720A2-000000067F00004005000060F30100000000__000000923719A971-00000096262826C9":{"file_size":141344768,"generation":2,"shard":"0008"},"000000067F000040050081DB4300003A8000-000000067F000040050081DB4300003AC000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067
F00004005000060F700006AB7A6-000000067F00004005000060F700006C3D76__000000663565F8C9-000000698AF6E809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000570A19E-000000067F00004005000060F3000573B206__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003AF28CB-000000067F00004005000060F30003B33945__0000010FB1BE19B9-00000113456156F1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00015CC000-000000067F00004005000060FB00015D8000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500D69D7900000A9CFB-000000067F0000400500D69D7900000D1C5F__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A30000-000000067F00004005000060F30002A34000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000047C000-000000067F00004005000060F30000498000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005FFF23F-000000067F00004005000060F300060A0282__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000029C194-000000067F00004005016EA00C00004EF809__0000018EC67807C9-000001935283F9B9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30006D64000-000000067F00004005000060F30006DC8000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001340000-000000067F00004005016EA00C0001344000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000BB0000-000000067F00004005016EA00C0000BB4000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F0000400500EB4A480000000000-000000067F0000400500EB4A480000007F0F__000000F309FCDD19-000000F6661C9241":{"file_size":26845
1840,"generation":2,"shard":"0008"},"000000067F0000400500E3A2A10000114000-000000067F0000400500E3A2A1000016321A__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000578000-030000000000000000000000000000000002__0000009A24DF6768":{"file_size":107642880,"generation":2,"shard":"0008"},"000000067F00004005000060F30006798000-000000067F00004005000060F3000679C000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100000E021B-000000067F00004005000060F1000010043F__0000000D55A212C9-000000114A805939":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F000040050081DB430000DA8000-030000000000000000000000000000000002__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":233201664,"generation":2,"shard":"0008"},"000000067F00004005000060F100004EC079-000000067F00004005000060F20100000000__00000054161C34B8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F7000170C000-000000067F00004005000060F70001720000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000FCD85E-000000067F00004005000060F80100000000__000000E4D847F4E0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00015B74FF-000000067F00004005016EA00C00015FF3A0__000001AC25760149-000001AFC313C819":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30000AC9024-000000067F00004005000060F30000ADA0D0__0000003203FB5749-0000003579F03331":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16C40100000000-000000067F0000400500F3A25C000006C000__00000104BD37F348":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500D69D7900000F1B5B-000000067F0000400500D69D790100000000__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":233275392,"generation":2,"shard":"0008"},"000000067F00004005000060F30003C0C000-000000067F00004005000060F30003C257AD__000001180B3FF408":{"file_size":134
422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E44000-000000067F00004005000060F30000E60000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000018E4B6-000000067F00004005000060F7000019EA78__0000001737D88379-0000001B59EEB909":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00017E8000-000000067F00004005016EA00C00017EC000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30003A4C09C-000000067F00004005000060F30003A6D1B3__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100000260F2-000000067F00004005000060F20100000000__000000027AF9D7D0":{"file_size":24576,"generation":1,"shard":"0008"},"000000067F00004005016EA00C0000097BDA-000000067F00004005016EA00C00000C7A73__0000018624969469-000001880F984A29":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500C782E400002CDD5C-030000000000000000000000000000000002__000000D31E48D7C9-000000D74E29AAD1":{"file_size":90923008,"generation":2,"shard":"0008"},"000000067F00004005000060F3000685C000-000000067F00004005000060F30006860000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001C84000-000000067F00004005000060FB0001CE16ED__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000CC4BC2-000000067F000040050081DB430000CD6C36__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006349DA2-000000067F00004005000060F30006382F14__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000212E160-000000067F00004005000060F30100000000__0000009DF02C1241-000000A173C00489":{"file_size":224731136,"generation":2,"shard":"0008"},"000000067F00004005000060F30001FF8691-000000067F00004005000060F3010000
0000__0000009A1ABDE921-0000009DF02C1241":{"file_size":256114688,"generation":2,"shard":"0008"},"000000067F00004005000060F300067F4000-000000067F00004005000060F30006810000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700015A8000-000000067F00004005000060F700016205B5__0000012E71CF31F9-000001334140FC21":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F0000400500D69D790000024000-000000067F0000400500D69D790000028000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700007AE010-000000067F00004005000060F80100000000__00000075E5D2A930":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000428000-000000067F00004005016EA00C000042C000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30001E74000-000000067F00004005000060F30001F28000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300038FF04F-000000067F00004005000060F30100000000__0000010779A7F551-0000010A5E65DF39":{"file_size":45359104,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001B0FD2A-000000067F00004005016EA00C0001B4FBC9__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30006858000-000000067F00004005000060F3000685C000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002F9A0EB-000000067F00004005000060F30002FD317C__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000808000-000000067F000040050081DB430000822079__000000A583FBFB91-000000A9EB8C4489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00015DC000-000000067F00004005000060FB00015F0000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000021C000-000000067
F00004005000060F7000025DA3C__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500D69D79000007C000-000000067F0000400500D69D7900000A8000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000001EE3D-000000067F00004005000060F60100000000__00000054161C34B8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F000040050081DB430000F4E15B-030000000000000000000000000000000002__000000C462B3C2A9-000000C824C09619":{"file_size":73662464,"generation":2,"shard":"0008"},"000000067F00004005000060F30001F28000-000000067F00004005000060F30001F2C000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001F1DA6-030000000000000000000000000000000002__00000081AA3C40F0":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F70001758B92-000000067F00004005000060F70001771169__000001398B56A519-0000013C9C0E3339":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F0000400500E3A2A10000010000-000000067F0000400500E3A2A10000017F02__000000E7C2F1B249-000000EBC9213D59":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A98000-000000067F00004005000060F30002A9C000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000573B206-000000067F00004005000060F300057942F4__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000860B45-030000000000000000000000000000000002__00000023FEF9F321-00000028C365FBE1":{"file_size":252788736,"generation":2,"shard":"0008"},"000000067F00004005000060F7000090B929-000000067F00004005000060F80100000000__0000008DDCD70B68":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F7000014B73D-000000067F00004005000060F80100000000__000000114A805939-00000013FB921C81":{"file_size":146432000,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D3
C000-000000067F00004005000060F70000D60000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001514000-000000067F00004005000060F70001528000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001764000-000000067F00004005016EA00C0001788000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30001358000-000000067F00004005000060F3000135C000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001594000-000000067F00004005000060FB00015C8000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300004AC000-000000067F00004005000060F300004B8000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005610000-000000067F00004005000060F30005614000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002794000-000000067F00004005000060F300027C0000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004C60000-000000067F00004005000060F30004C64000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700003A0000-000000067F00004005000060F700003B85C7__0000003579F03331-0000003959DA2DE9":{"file_size":268468224,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000F1034-030000000000000000000000000000000002__000000E4C63CFA21-000000E7C2F1B249":{"file_size":247480320,"generation":2,"shard":"0008"},"000000067F00004005000060F300051B4000-000000067F00004005000060F300051F0000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000003C77D-000000067F00004005000060F60100000000__000000A5A3F27398":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005010660F500000161F7-030000000000000000000000000000000002__00
00010FB1BE19B9-00000113456156F1":{"file_size":64757760,"generation":2,"shard":"0008"},"000000067F00004005000060F30003F7B254-000000067F00004005000060F30003F942CF__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30004900000-000000067F00004005000060F30004904000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006F18000-000000067F00004005000060F30006F1C000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A21037-000000067F00004005000060F30003A31FB6__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000DB0000-000000067F00004005000060F30000E40F86__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001A60B43-000000067F00004005000060FB0001A71688__0000007E3A9BFD29-0000008196C976A1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006DC8000-000000067F00004005000060F30006DCC000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700006E38F6-000000067F00004005000060F80100000000__000000698F2C3A38":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000122B1C9-000000067F00004005000060F300012442A9__00000057593D8169-0000005C01565329":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000EA8000-000000067F00004005000060FB0000EAC000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001B5A072-000000067F00004005000060F80100000000__00000159B010F6C0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000144DCA3-000000067F00004005016EA00C000151F7C5__000001AC25760149-000001AFC313C819":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F600000711FF-00000
0067F00004005000060F60100000000__00000122E1129DA0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300050EC000-000000067F00004005000060F30005138000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005260000-000000067F00004005000060F30005290FC9__0000014EC58A4A79-0000015304A396B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700012DE407-000000067F00004005000060F80100000000__000001180B3FF408":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000F10000-000000067F00004005000060F70000F185D4__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D38000-000000067F00004005000060F70000D3C000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000006671F-000000067F00004005000060F60100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300059F53C6-000000067F00004005000060F30005A16504__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000B08000-000000067F000040050081DB430000B4A075__000000B2B5C4E8F9-000000B768469051":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000152C000-000000067F00004005000060F70001570000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000128000-000000067F00004005000060F3000012C000__0000018624969468":{"file_size":134422528,"generation":7,"shard":"0008"},"000000067F00004005000060F70000E24000-000000067F00004005000060F70000E387D6__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300002791D8-000000067F000040050081DB43000028B253__0000008196C976A1-0000008625CF2891":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F600000500F7-00000006
7F00004005000060F60100000000__000000D037B2DBD0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000ABD9C4-000000067F00004005000060F80100000000__000000A5A3F27398":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F000040050081DB4300009CC000-000000067F000040050081DB430000A10000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700002005D2-000000067F00004005000060F80100000000__0000001B59EEB909-0000001FFBC01501":{"file_size":261169152,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001AA656E-000000067F000040050081D80C0100000000__00000081AA3C40F0":{"file_size":59138048,"generation":2,"shard":"0008"},"000000067F000040050081DB430000E14000-000000067F000040050081DB430000E48000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003DD734C-000000067F00004005000060F30003E40000__0000011B632CC319-0000011F1A40FA69":{"file_size":261046272,"generation":2,"shard":"0008"},"000000067F0000400500D19D0300FFFFFFFF-030000000000000000000000000000000002__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":5373952,"generation":2,"shard":"0008"},"000000067F00004005000060F30001588000-000000067F00004005000060F3000158C000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000AC000-000000067F0000400500DBCED500000D0000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000013F89B-000000067F0000400500EB4A48000014F7AC__000000F6661C9241-000000F901689359":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300005D704F-000000067F000040050081DB4300006310C9__0000009A1ABDE921-0000009DF02C1241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A14000-000000067F000040050081DB430000A18000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001F574A6-000000067F
00004005000060F30001FF8691__0000009A1ABDE921-0000009DF02C1241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FB3D320100000000-000000067F0000400500FDA1F80000014000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001B09104-000000067F00004005000060F30001B4A119__0000008196C976A1-0000008625CF2891":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005011035750100000000-030000000000000000000000000000000002__00000159B010F6C0":{"file_size":78626816,"generation":2,"shard":"0008"},"000000067F00004005000060F1000015F545-000000067F00004005000060F20100000000__000000174479FC18":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000638C06D-000000067F00004005000060F300063A50CD__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000299C28F-000000067F00004005000060F300029A526C__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000364C000-000000067F00004005000060F30003650000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000CE0000-000000067F00004005016EA00C0000CE4000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB430000794000-000000067F000040050081DB4300007A8000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A18000-000000067F000040050081DB430000A1C000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000000C000-000000067F00004005000060F30000018000__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F000040050081DB4300000D40FF-030000000000000000000000000000000002__00000075CC373F31-00000079F2A2F311":{"file_size":78061568,"generation":2,"shard":"0008"},"000000067F00004005000060F60000099FD8-000000
067F00004005000060F60100000000__00000159B010F6C0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000330A1C8-000000067F00004005000060F3000332B1B6__000000E4C63CFA21-000000E7C2F1B249":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006FA900D-000000067F00004005000060F30006FDA081__00000184624E5741-000001860C80A151":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000148AC30-000000067F00004005000060FB000149B774__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C01FFFFFFFF-000000067F0000400500F3A25C0300000000__0000011F1A40FA69-00000122A7BB7B29":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30000EF1FC3-000000067F00004005000060F50100000000__00000047F1F2B800":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30006A7C566-000000067F00004005000060F30100000000__00000178B8B10551-0000017C9F5597E1":{"file_size":173072384,"generation":2,"shard":"0008"},"000000067F00004005000060FB000104B856-000000067F00004005000060FB000107C39B__0000004C49155071-0000004F31878919":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000030000-000000067F00004005000060F80100000000__000000021DC73119-000000044854EBD1":{"file_size":261341184,"generation":2,"shard":"0008"},"000000067F00004005000060F30003580FD3-000000067F00004005000060F30100000000__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":228188160,"generation":2,"shard":"0008"},"000000067F00004005000060F70001224000-000000067F00004005000060F70001232ACF__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300022B9050-000000067F00004005000060F3000230A0C7__000000A583FBFB91-000000A9EB8C4489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006654000-000000067F00004005000060F30006670000__00000178C5D5D3A8":{"file_size":134422528,"generatio
n":2,"shard":"0008"},"000000067F00004005000060F700010D0000-000000067F00004005000060F700010D85CF__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000FD8000-030000000000000000000000000000000002__000000C824C09619-000000CC13D2E549":{"file_size":237559808,"generation":2,"shard":"0008"},"000000067F00004005000060FB00015F0000-000000067F00004005000060FB00015F4000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F60100000000-000000067F00004005000060F70000004000__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F00004005000060F70000DA1E38-000000067F00004005000060F80100000000__000000C462B3C2A9-000000C824C09619":{"file_size":209821696,"generation":2,"shard":"0008"},"000000067F00004005000060F30005D76250-000000067F00004005000060F30005D7F2DE__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F10000418000-000000067F00004005000060F10100000000__00000044B4679349-00000047E31D98D1":{"file_size":269148160,"generation":2,"shard":"0008"},"000000067F00004005000060F70001B61000-000000067F00004005000060F80100000000__0000018613F0A050":{"file_size":65150976,"generation":3,"shard":"0008"},"000000067F00004005000060F300008C8000-000000067F00004005000060F300008E0F49__00000028C365FBE1-0000002D2A8E0B81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300002D8000-030000000000000000000000000000000002__0000008625CF2891-00000089F4693119":{"file_size":231907328,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C04000-000000067F00004005000060FB0000C08000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001808000-000000067F00004005000060FB000180C000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A30379-030000000000000000000000000000000002__000000AFE
87558B0":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F700010D85CF-000000067F00004005000060F80100000000__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":164970496,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C70000-000000067F00004005000060FB0000C74000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001188000-000000067F00004005016EA00C000118C000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70000CB85B3-000000067F00004005000060F70000CC8B74__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30004A1D870-000000067F00004005000060F30004A2693B__000001398B56A519-0000013C9C0E3339":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00008CF772-000000067F00004005016EA00C00008E760F__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000D34000-000000067F00004005016EA00C0000D5D1E9__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C00014B79E7-000000067F00004005016EA00C00014CF88D__000001A931C135B1-000001AC25760149":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F300040E319D-000000067F00004005000060F300040F41F4__00000122A7BB7B29-0000012694E36301":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002FF427D-000000067F00004005000060F30100000000__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":156073984,"generation":2,"shard":"0008"},"000000067F00004005000060F30005E0A466-000000067F00004005000060F30005E3B48F__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700005F9158-000000067F00004005000060F80100000000__00000057593D8169-0000005C01565329":{"file_size":230768640,"generation":2,"shard":"0008"},"000000
067F00004005016EA00C00018E4000-000000067F00004005016EA00C000193189A__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30005F0202C-000000067F00004005000060F30005F3303F__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F10000148000-000000067F00004005000060F1000014C000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300060C0000-000000067F00004005000060F300060C4000__0000016E41E03CA0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C9C000-000000067F00004005000060FB0000CC6E51__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050107B54700000A0EB1-000000067F000040050109CD330100000000__000001180B3FF408":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00004EC000-000000067F00004005016EA00C00005A0000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000A9F465-000000067F00004005016EA00C0000ACF305__00000196C9018F59-0000019A2EAFE7A9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30000208000-000000067F00004005000060F3000020C000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C000011E137-000000067F0000400500F67839000003E09B__000001048B25A8E9-0000010779A7F551":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30000402F4A-000000067F00004005000060F60100000000__000000114A805939-00000013FB921C81":{"file_size":166469632,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00004A8000-000000067F00004005016EA00C00004AC000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70001968000-000000067F00004005000060F7000196C000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000
4005000060F30006EF8000-000000067F00004005000060F30006EFC000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000BB4000-000000067F00004005016EA00C0000C20000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F700009C0000-000000067F00004005000060F80100000000__0000009A24DF6768":{"file_size":37371904,"generation":2,"shard":"0008"},"000000067F00004005000060F30004C84000-000000067F00004005000060F30004CB8000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002514000-000000067F00004005000060F30002530000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000DE05C8-000000067F00004005000060F80100000000__000000C824C09619-000000CC13D2E549":{"file_size":259473408,"generation":2,"shard":"0008"},"000000067F00004005000060F301FFFFFFFF-000000067F00004005000060F30300000000__00000186146441F1-0000018624969469":{"file_size":57344,"generation":6,"shard":"0008"},"000000067F00004005000060F30001886B2A-000000067F00004005000060F50100000000__00000075E5D2A930":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F700006A8000-000000067F00004005000060F80100000000__000000636DE92159-000000663565F8C9":{"file_size":117022720,"generation":2,"shard":"0008"},"000000067F00004005000060FB000154C000-000000067F00004005000060FB0001558000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300053F40CC-000000067F00004005000060F30100000000__0000014EC58A4A79-0000015304A396B9":{"file_size":223453184,"generation":2,"shard":"0008"},"000000067F00004005000060F30005C95225-000000067F00004005000060F30005C9E3C4__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000558C000-000000067F00004005000060F30005598000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000
04005000060F30003FFA699-000000067F00004005000060F50100000000__00000122E1129DA0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30006F1C000-000000067F00004005000060F50100000000__000001848D082B20":{"file_size":24117248,"generation":2,"shard":"0008"},"000000067F00004005000060F3000486C000-000000067F00004005000060F30004878000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300019C2056-000000067F00004005000060F300019F31AA__00000079F2A2F311-0000007E3A9BFD29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC000004C000-000000067F0000400500EE16BC0000060000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000046EAB9-000000067F00004005000060F80100000000__000000417D21ACF9-00000044B4679349":{"file_size":48717824,"generation":2,"shard":"0008"},"000000067F000040050081DB430000790000-000000067F000040050081DB430000794000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500D69D79000002C000-000000067F0000400500D69D790000078000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F60000026C90-000000067F00004005000060F60100000000__000000698F2C3A38":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30000738000-000000067F00004005000060F3000073C000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10000204000-000000067F00004005000060F10000218000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500C782E40000177E20-000000067F0000400500C782E400001AFD31__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000048C000-000000067F00004005000060F700004B1E77__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300015F8000-000000067F0
0004005000060F50100000000__000000698F2C3A38":{"file_size":131276800,"generation":2,"shard":"0008"},"000000067F00004005000060F30000428000-000000067F00004005000060F3000042C000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB43000038C000-000000067F000040050081DB430000390000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000102A1CE-000000067F00004005000060FB000103AD12__0000004C49155071-0000004F31878919":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001848000-000000067F00004005000060FB000184C000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00001DC000-000000067F00004005000060FB0000228000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00011D4000-000000067F00004005016EA00C0001228000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C000011775B-030000000000000000000000000000000002__0000018820A34650":{"file_size":139264,"generation":11,"shard":"0008"},"000000067F00004005000060F700011B8000-000000067F00004005000060F80100000000__000001048B25A8E9-0000010779A7F551":{"file_size":263897088,"generation":2,"shard":"0008"},"000000067F00004005000060F3000660D31F-000000067F00004005000060F3000664E3CA__000001715E483C79-000001751A7D7589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC0000064000-000000067F0000400500EE16BC00000F28ED__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000525C065-000000067F00004005000060F50100000000__0000014EDD256548":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30004A7F98F-000000067F00004005000060F30100000000__000001398B56A519-0000013C9C0E3339":{"file_size":47595520,"generation":2,"shard":"0008"},"000000067F000040050100D04D000004369C-000000067F000040050100D04
D000004B5AD__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F6000001A6E2-000000067F00004005000060F60100000000__00000047F1F2B800":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F700004405CF-000000067F00004005000060F80100000000__0000003D03FCCDB9-000000417D21ACF9":{"file_size":198836224,"generation":2,"shard":"0008"},"000000067F00004005000060F30002D28000-000000067F00004005000060F30002D2C000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F56D510100000000-000000067F0000400500F67839000003C000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000E387D6-000000067F00004005000060F80100000000__000000D037B2DBD0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000213C000-000000067F00004005000060F30002168000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300060D4415-000000067F00004005000060F3000612D506__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FB3D3100000546CB-000000067F0000400500FB3D320100000000__00000122E1129DA0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F000040050081DB430000D18CA9-030000000000000000000000000000000002__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":210288640,"generation":2,"shard":"0008"},"000000067F00004005000060F60000062E4F-000000067F00004005000060F60100000000__00000104BD37F348":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C000016A065-000000067F0000400500F3A25C000017C0CB__0000010779A7F551-0000010A5E65DF39":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001AD0000-000000067F00004005000060FB0001B28B44__0000008196C976A1-0000008625CF2891":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000254000-000000067F00004
005000060F30000298000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000E8C000-000000067F000040050081DB430000EA0000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300040F41F4-000000067F00004005000060F3000412D27C__00000122A7BB7B29-0000012694E36301":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00013B8000-000000067F00004005000060FB00013BC000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700000D8000-000000067F00004005000060F700000DC000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000958000-000000067F00004005000060F700009605D8__000000923719A971-00000096262826C9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB00004A0000-000000067F00004005000060FB00004A4000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700001213F2-000000067F00004005000060F80100000000__0000000D55A212C9-000000114A805939":{"file_size":55320576,"generation":2,"shard":"0008"},"000000067F00004005000060F30004156457-000000067F00004005000060F30100000000__00000122A7BB7B29-0000012694E36301":{"file_size":96927744,"generation":2,"shard":"0008"},"000000067F00004005000060F30003278000-000000067F00004005000060F3000327C000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000158F667-000000067F00004005016EA00C00015B74FF__000001AC25760149-000001AFC313C819":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001D50000-000000067F00004005000060FB0001D88B43__0000008DBE2855F9-000000923719A971":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F60000054AE8-000000067F00004005000060F60100000000__000000DBD29DC248":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300002
C4887-000000067F00004005000060F60100000000__0000000D80565628":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70001B34000-000000067F00004005000060F70001B5A072__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F600000416A8-000000067F00004005000060F60100000000__000000AFE87558B0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F10000050000-000000067F00004005000060F10000058000__000000044854EBD1-00000008B6B51879":{"file_size":264011776,"generation":2,"shard":"0008"},"000000067F00004005000060F300043FC000-000000067F00004005000060F300044D3639__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004878000-000000067F00004005000060F3000487C000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000396C000-000000067F00004005000060F30003998000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00019F7907-000000067F00004005016EA00C0001A477A4__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268443648,"generation":11,"shard":"0008"},"000000067F00004005016EA00C00014D7727-000000067F00004005016EA00C00014E75C6__000001A931C135B1-000001AC25760149":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005016EA00C00016570D9-030000000000000000000000000000000002__000001AC25760149-000001AFC313C819":{"file_size":86335488,"generation":11,"shard":"0008"},"000000067F00004005000060F70001270000-000000067F00004005000060F80100000000__0000010FB1BE19B9-00000113456156F1":{"file_size":265363456,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800003BFD31-000000067F0000400500EB4A4800003C7C42__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300014B31F8-000000067F00004005000060F300014CC16D__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"00
08"},"000000067F00004005016EA00C0000D5D1E9-030000000000000000000000000000000002__0000019E7001E460":{"file_size":139264,"generation":11,"shard":"0008"},"000000067F00004005000060F100003B8214-000000067F00004005000060F100003C0432__0000003D03FCCDB9-000000417D21ACF9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001346854-000000067F00004005016EA00C000135FCAD__000001A931C135B1-000001AC25760149":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F3000160410C-000000067F00004005000060F3000165515A__000000698AF6E809-0000006DDB29D589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000118B12B-030000000000000000000000000000000002__00000054161C34B8":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30006DF0000-000000067F00004005000060F30006DF4000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700003C4000-000000067F00004005000060F700003FE341__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000FF0000-000000067F00004005000060F30100000000__0000004C49155071-0000004F31878919":{"file_size":256286720,"generation":2,"shard":"0008"},"000000067F00004005000060FB00015F4000-000000067F00004005000060FB00015FCD31__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005816253-000000067F00004005000060F30005847319__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002460000-000000067F00004005000060F30002464000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000113A337-000000067F00004005000060F700011528FB__000000FF8B261599-000001048B25A8E9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000037968A-030000000000000000000000000000000002__0000000D55A212C9-000000114A805939":{"file_siz
e":226426880,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000128000-000000067F00004005016EA00C000012FE9A__000001880F984A29-0000018C496B6DB1":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500EB4A48000036FF11-000000067F0000400500EB4A4800003A7E20__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000658113F-000000067F00004005000060F3000659A203__000001715E483C79-000001751A7D7589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001D18000-000000067F00004005016EA00C0001D1C000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F30001A44000-000000067F00004005000060F30001AB1583__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10000138000-000000067F00004005000060F1000013C000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300009BC000-000000067F00004005000060F30000A50000__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000110E30C-000000067F00004005000060F80100000000__000000F91FE84F08":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F50100000000-000000067F00004005000060F60000014000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006F18000-000000067F00004005000060F30006FA900D__00000184624E5741-000001860C80A151":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001D88B43-000000067F00004005000060FB0100000000__0000008DBE2855F9-000000923719A971":{"file_size":249028608,"generation":2,"shard":"0008"},"000000067F00004005000060F3000122A1D5-000000067F00004005000060F30100000000__0000005413AB3641-00000057593D8169":{"file_size":48783360,"generation":2,"shard":"0008"},"000000067F00004005000060F30006277C61-000000067F00004005000060F30006320C60__00000
16E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000388000-000000067F000040050081DB43000038C000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000E67A6E-000000067F00004005016EA00C0000E77906__0000019E2C5DCEE1-000001A1DD8B4481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F300009B8000-000000067F00004005000060F300009BC000__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400501025D900000068000-000000067F00004005010450640000000570__0000010FB1BE19B9-00000113456156F1":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB00002D4B6A-030000000000000000000000000000000002__0000000D80565628":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30001E50FF3-000000067F00004005000060F30001E720A2__000000923719A971-00000096262826C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00005A4000-000000067F00004005016EA00C0000670000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060FB0000C18000-000000067F00004005000060FB0000C1C000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000BA4F5B-000000067F00004005000060F70000BBD532__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001AC115C-000000067F00004005000060F80100000000__0000015304A396B9-0000015670D6AFD9":{"file_size":237248512,"generation":2,"shard":"0008"},"000000067F00004005000060F30004D24000-000000067F00004005000060F30004DA8000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006CA4000-000000067F00004005000060F30006D10000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC00001433D0-030000000000000000
000000000000000002__000000FCCD5238B1-000000FF8B261599":{"file_size":146407424,"generation":2,"shard":"0008"},"000000067F00004005000060F3000165515A-000000067F00004005000060F30100000000__000000698AF6E809-0000006DDB29D589":{"file_size":112680960,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000118C000-000000067F00004005016EA00C00011D0000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB43000094A076-030000000000000000000000000000000002__000000A9EB8C4489-000000ACA44C8E99":{"file_size":176054272,"generation":2,"shard":"0008"},"000000067F00004005000060F70001528000-000000067F00004005000060F7000152C000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000C82B50-000000067F000040050081DB430000CC4BC2__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001EF15A-000000067F000040050081DB4300002791D8__0000008196C976A1-0000008625CF2891":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F10000125BF2-000000067F00004005000060F20100000000__000000114A805939-00000013FB921C81":{"file_size":78782464,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E40F86-000000067F00004005000060F30100000000__000000417D21ACF9-00000044B4679349":{"file_size":111108096,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000FF0000-000000067F00004005016EA00C0000FF4000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30000CB16B6-000000067F00004005000060F50100000000__0000003D2AB09B68":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70001990000-000000067F00004005000060F70001994000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000A54000-000000067F00004005000060F30000A5F9BB__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004
005000060F300061B8705-000000067F00004005000060F300061D9774__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000084C000-000000067F00004005000060F70000858000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000848000-000000067F00004005000060F7000084C000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001D18000-000000067F00004005000060F30001D79136__0000008DBE2855F9-000000923719A971":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001558000-000000067F00004005000060FB000155C000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300024440AE-000000067F00004005000060F3000244D189__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002CFC020-000000067F00004005000060F30100000000__000000C824C09619-000000CC13D2E549":{"file_size":150708224,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A4A074-000000067F000040050081DB430000A640EA__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C98000-000000067F00004005000060FB0000C9C000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001840000-000000067F00004005000060FB0001844000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000802123-000000067F00004005000060F30000853115__00000023FEF9F321-00000028C365FBE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000029ED0-000000067F00004005000060F80100000000__000000027AF9D7D0":{"file_size":24576,"generation":1,"shard":"0008"},"000000067F00004005016EA00C00003E4000-000000067F00004005016EA00C00003E8000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"
0008"},"000000067F00004005000060F30004CBC000-000000067F00004005000060F30004D20000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000122C000-000000067F00004005016EA00C0001240000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30004DF086C-000000067F00004005000060F50100000000__000001444EB7FC10":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300050B5199-000000067F00004005000060F30100000000__0000014784964B91-0000014B000D1821":{"file_size":126124032,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001A477A4-000000067F00004005016EA00C0001ADF63C__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F70001828000-000000067F00004005000060F7000182C000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100004F0000-000000067F00004005000060F10000518222__0000005413AB3641-00000057593D8169":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30005EFD576-000000067F00004005000060F30100000000__00000164DEE06671-0000016834A3FC91":{"file_size":193077248,"generation":2,"shard":"0008"},"000000067F0000400500F8E3A50100000000-000000067F0000400500FA2AD30000004000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000258E3A9-000000067F00004005000060F3000259F4A3__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000C90000-000000067F00004005000060F70000CB85B3__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB000114C000-000000067F00004005000060FB000118B12B__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003644000-000000067F00004005000060F30003648000__000000F91FE84F08":{"file_size":134422528,
"generation":2,"shard":"0008"},"000000067F00004005000060FB0001A50000-000000067F00004005000060FB0001A60B43__0000007E3A9BFD29-0000008196C976A1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003C257AD-000000067F00004005000060F50100000000__000001180B3FF408":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30002DE8000-000000067F00004005000060F30002E4104A__000000D31E48D7C9-000000D74E29AAD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C00000C8000-000000067F0000400500F3A25C00000EA069__000001048B25A8E9-0000010779A7F551":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002174000-000000067F00004005000060F30002210000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300014D5280-000000067F00004005000060F300014E6333__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000332B1B6-000000067F00004005000060F30003344134__000000E4C63CFA21-000000E7C2F1B249":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300065F42B4-000000067F00004005000060F3000660D31F__000001715E483C79-000001751A7D7589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010E264A-000000067F000040050081DB4300010F46BD__000000D31E48D7C9-000000D74E29AAD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300069D13FA-000000067F00004005000060F300069FA3F6__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300061D9774-000000067F00004005000060F30006222843__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100005C821A-000000067F00004005000060F20100000000__000000601F43CF09-000000636DE92159":{"file_size":265183232,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48
0000200000-000000067F0000400500EB4A480000204000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001690000-000000067F00004005000060F70100000000__000001334140FC21-00000137115BE4D9":{"file_size":273965056,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000A575C7-000000067F00004005016EA00C0000A9F465__00000196C9018F59-0000019A2EAFE7A9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001E6C000-000000067F00004005000060FB0001E98000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00014195A7-000000067F00004005000060FB000147A0EC__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000AE168A-030000000000000000000000000000000002__0000003203FB5749-0000003579F03331":{"file_size":223379456,"generation":2,"shard":"0008"},"000000067F00004005000060F30000CA0000-000000067F00004005000060F30000CA4000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300006E4000-000000067F00004005000060F30000738000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300006E0000-000000067F00004005000060F300006E4000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001124000-000000067F00004005000060FB0001148000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500D69D7900000A8000-000000067F0000400500D69D7900000AC000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500C782E40000130000-000000067F0000400500C782E40000137F10__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000020FBCF-000000067F00004005016EA00C0000257A6F__000001880F984A29-0000018C496B6DB1":{"file_size":268451840,"generation":11,"shard":"0008"}
,"000000067F00004005000060FB0001B28B44-000000067F00004005000060FB0100000000__0000008196C976A1-0000008625CF2891":{"file_size":249454592,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001120000-000000067F00004005000060FB0001124000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005474062-000000067F00004005000060F3000549D0A6__0000015304A396B9-0000015670D6AFD9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500C782E4000023FA62-030000000000000000000000000000000002__000000D01F399709-000000D31E48D7C9":{"file_size":245366784,"generation":2,"shard":"0008"},"000000067F000040050081DB430000160484-030000000000000000000000000000000002__00000079F2A2F311-0000007E3A9BFD29":{"file_size":226582528,"generation":2,"shard":"0008"},"000000067F00004005000060F300038A4FB4-000000067F00004005000060F300038B5F5B__0000010779A7F551-0000010A5E65DF39":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300017E8000-000000067F00004005000060F300017EC000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FB3D300100000000-000000067F0000400500FB3D31000000C000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700010105DB-000000067F00004005000060F80100000000__000000E4C63CFA21-000000E7C2F1B249":{"file_size":254935040,"generation":2,"shard":"0008"},"000000067F00004005000060F70000858570-000000067F00004005000060F80100000000__0000008196C976A1-0000008625CF2891":{"file_size":252985344,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001D4000-000000067F000040050081DB4300001E8000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00005E0000-000000067F00004005000060FB0000638B45__0000001B59EEB909-0000001FFBC01501":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050107B547000006C000-000000067F000040050107B54700000A0EB1__000001180B3
FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000430000-000000067F00004005000060FB0000434000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300014E6333-000000067F00004005000060F3000151F271__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FB3D300100000000-000000067F0000400500FB3D300300000000__00000117EDA82C11-0000011B632CC319":{"file_size":65536,"generation":2,"shard":"0008"},"000000067F00004005000060F30004BE7584-000000067F00004005000060F30100000000__0000013C9C0E3339-0000013FEFA7D709":{"file_size":58204160,"generation":2,"shard":"0008"},"000000067F00004005000060F70001068000-000000067F00004005000060F80100000000__000000E7C2F1B249-000000EBC9213D59":{"file_size":168730624,"generation":2,"shard":"0008"},"000000067F00004005000060F1000013C000-000000067F00004005000060F10000148000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000659A203-000000067F00004005000060F300065BB235__000001715E483C79-000001751A7D7589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000EC0000-000000067F00004005000060F70000EF85D6__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005010660F500000B4000-000000067F00004005010660F500000F44CB__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300067A4000-000000067F00004005000060F300067F0000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000F0000-000000067F0000400500DBCED500000F4000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000768000-000000067F000040050081DB43000076C000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00018E0000-000000067F00004005016EA00C00
018E4000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30000A50000-000000067F00004005000060F30000A54000__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001E68000-000000067F00004005000060FB0001E6C000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001960000-000000067F00004005000060F300019790A2__00000079F2A2F311-0000007E3A9BFD29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000B6A1D0-000000067F00004005000060FB0000BAAD15__0000003579F03331-0000003959DA2DE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002E4A157-000000067F00004005000060F30002E630CF__000000D31E48D7C9-000000D74E29AAD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006E70000-000000067F00004005000060F30006E74000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700004464DD-000000067F00004005000060F7000046EAB9__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000204000-000000067F0000400500EB4A480000218000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300042D51D6-000000067F00004005000060F3000430E1E9__0000012694E36301-0000012A3F140591":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000F30000-000000067F00004005000060FB0100000000__00000047E31D98D1-0000004C49155071":{"file_size":272302080,"generation":2,"shard":"0008"},"000000067F000040050081DB4300006F8000-030000000000000000000000000000000002__0000009DF02C1241-000000A173C00489":{"file_size":235110400,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001EC000-000000067F000040050081DB4300001F1DA6__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000
060F300038A3082-000000067F00004005000060F30100000000__000001048B25A8E9-0000010779A7F551":{"file_size":76644352,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000400000-000000067F00004005016EA00C0000404000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30003481DDB-000000067F00004005000060F30100000000__000000E7C2F1B249-000000EBC9213D59":{"file_size":107814912,"generation":2,"shard":"0008"},"000000067F00004005000060F3000489C000-000000067F00004005000060F300048A0000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000CD6C36-000000067F000040050081DB430000D18CA9__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30004888000-000000067F00004005000060F3000488C000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300008E0F49-000000067F00004005000060F30000921E8A__00000028C365FBE1-0000002D2A8E0B81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500C782E40000074000-000000067F0000400500C782E400000A0000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00011F2D11-000000067F00004005000060FB0001203856__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300046330B1-000000067F00004005000060F300046B41AA__0000012E71CF31F9-000001334140FC21":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003548000-000000067F00004005000060F30003580FD3__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001198B44-000000067F00004005000060FB00011C1688__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000049C000-000000067F00004005000060F300004A8000__000000174479FC18":{"file_size":134
422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000B44000-000000067F00004005016EA00C0000BB0000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F700014F0000-000000067F00004005000060F700014F85DF__0000012694E36301-0000012A3F140591":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000C5E15B-000000067F000040050081DB430000C801D1__000000B768469051-000000BAB1E56C91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A10000-000000067F00004005000060F30003A21037__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006EFC000-000000067F00004005000060F30006F18000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001D1F87B-000000067F00004005016EA00C0001D7F71A__000001BA93C39481-000001BCB572A4E1":{"file_size":268451840,"generation":17,"shard":"0008"},"000000067F00004005000060F30002A34000-000000067F00004005000060F30002A40000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000F0AA88-000000067F00004005000060F80100000000__000000DBD29DC248":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30006700000-000000067F00004005000060F30006704000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001CC4000-000000067F00004005000060F30001CD0000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000858000-000000067F00004005000060F80100000000__00000081AA3C40F0":{"file_size":48439296,"generation":2,"shard":"0008"},"000000067F000040050081DB4300000D6407-000000067F000040050081DB430000160484__00000079F2A2F311-0000007E3A9BFD29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300057DD292-000000067F00004005000060F30005816253__00000159A7EC8CB9-0000015DD1D
3C809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006222843-000000067F00004005000060F3000625B8F0__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000410000-000000067F00004005000060FB0000430B46__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100006A8000-000000067F00004005000060F100006B0000__0000006DDB29D589-000000722F474369":{"file_size":264110080,"generation":2,"shard":"0008"},"000000067F00004005000060F3000460202F-000000067F00004005000060F300046330B1__0000012E71CF31F9-000001334140FC21":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006E74000-000000067F00004005000060F30006EF8000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A3B020-000000067F00004005000060F30003A4C09C__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002535462-000000067F00004005000060F3000258E3A9__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000294000-000000067F0000400500EB4A480000355928__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016E85370000000000-030000000000000000000000000000000002__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":152190976,"generation":2,"shard":"0008"},"000000067F00004005000060F3000158C000-000000067F00004005000060F300015B0000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003386D10-000000067F00004005000060F300033D7D7C__000000E7C2F1B249-000000EBC9213D59":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E7C000-000000067F00004005000060F30000EF1FC3__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FA2AD3
0000030000-000000067F0000400500FA2AD30000034000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005588000-000000067F00004005000060F3000558C000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300039A0000-000000067F00004005000060F300039A4000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000008A13D-000000067F00004005000060F60100000000__000001444EB7FC10":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB00017120CE-000000067F00004005000060FB000172AC12__0000006DDB29D589-000000722F474369":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003200000-000000067F00004005000060F30003204000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300007C1007-000000067F00004005000060F30000802123__00000023FEF9F321-00000028C365FBE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C000006C000-000000067F0000400500F3A25C00000BB439__00000104BD37F348":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300015B4000-000000067F00004005000060F300015F8000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300060C220F-000000067F00004005000060F300060CB2C8__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F8E3A5000004A25C-000000067F0000400500F8E3A50100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30002C9AFB8-000000067F00004005000060F30002CFC020__000000C824C09619-000000CC13D2E549":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005010F2BD40100000000-000000067F00004005010F44EB000000C000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002AEED02-00000
0067F00004005000060F50100000000__000000C483D0D6B8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30002EB8000-000000067F00004005000060F30002F5105E__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500E3A2A1000016321A-030000000000000000000000000000000002__000000EFDE07FFD8":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F3000135C000-000000067F00004005000060F30001407F7A__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F67839000006AEF4-000000067F0000400500F7D2DD0100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30005DA03A8-000000067F00004005000060F30005DC93F1__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010E2072-000000067F000040050081DB430100000000__000000D01F399709-000000D31E48D7C9":{"file_size":15392768,"generation":2,"shard":"0008"},"000000067F00004005000060F300004A8000-000000067F00004005000060F300004AC000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00016E0A44-000000067F00004005000060FB0001701588__0000006DDB29D589-000000722F474369":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300024D8000-000000067F00004005000060F300024DC000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003BC8000-000000067F00004005000060F30003BCC000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F00100000000-000000067F00004005000060F10000004000__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F000040050081DB430100000000-000000067F0000400500C782E40000074000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003D14206-000000067F00004005000060F
30003D252C8__00000117EDA82C11-0000011B632CC319":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700006479E7-000000067F00004005000060F80100000000__000000603CA8F2F0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B9C988-000000067F00004005000060F70000BA4F5B__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D69D790000078000-000000067F0000400500D69D79000007C000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000CC8B74-000000067F00004005000060F80100000000__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":95657984,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000708000-000000067F00004005000060FB000070C000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000EA0000-000000067F000040050081DB430000EEA075__000000C462B3C2A9-000000C824C09619":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000001FD3E-000000067F00004005016EA00C0000097BDA__0000018624969469-000001880F984A29":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F3000689E295-000000067F00004005000060F3000690F2FD__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000CE0000-000000067F00004005000060F30000D31030__0000003D03FCCDB9-000000417D21ACF9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000EA0000-030000000000000000000000000000000002__000000C483D0D6B8":{"file_size":20307968,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000807A34-000000067F00004005016EA00C00008578D4__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F000040050081DB430001060000-000000067F000040050081DB430001064000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000
067F00004005000060F3000480F32C-000000067F00004005000060F3000486837F__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700009385D4-000000067F00004005000060F80100000000__0000008DBE2855F9-000000923719A971":{"file_size":252207104,"generation":2,"shard":"0008"},"000000067F00004005000060F30000090000-000000067F00004005000060F300000C1095__000000021DC73119-000000044854EBD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000480620C-000000067F00004005000060F3000480F32C__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005FA40AD-000000067F00004005000060F30005FC519A__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00014A42B8-030000000000000000000000000000000002__000000601F43CF09-000000636DE92159":{"file_size":137322496,"generation":2,"shard":"0008"},"000000067F00004005000060F30001CD0000-000000067F00004005000060F30001CD4000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000404000-000000067F00004005016EA00C0000428000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30002079FDE-000000067F00004005000060F300020830BE__0000009DF02C1241-000000A173C00489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000487C000-000000067F00004005000060F30004880000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005010A188401FFFFFFFF-000000067F00004005010A18840300000000__00000137115BE4D9-000001398B56A519":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000218000-000000067F00004005000060F7000021C000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005EF454F-000000067F00004005000060F30005EFD576__00000164DEE06671-00000
16834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005DC93F1-000000067F00004005000060F30005E0A466__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"}},"disk_consistent_lsn":"1BC/B5734CD8","metadata_bytes":{"disk_consistent_lsn":"1BC/B5734CD8","prev_record_lsn":"1BC/B5734CB0","ancestor_timeline":null,"ancestor_lsn":"0/0","latest_gc_cutoff_lsn":"1BC/B5732690","initdb_lsn":"0/14EE150","pg_version":16},"lineage":{}} diff --git a/patches/rum.patch b/patches/rum.patch new file mode 100644 index 0000000000..3041f8df81 --- /dev/null +++ b/patches/rum.patch @@ -0,0 +1,54 @@ +commit 68f3b3b0d594f08aacc4a082ee210749ed5677eb +Author: Anastasia Lubennikova +Date: Mon Jul 15 12:31:56 2024 +0100 + + Neon: fix unlogged index build patch + +diff --git a/src/ruminsert.c b/src/ruminsert.c +index e8b209d..e89bf2a 100644 +--- a/src/ruminsert.c ++++ b/src/ruminsert.c +@@ -628,6 +628,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo) + elog(ERROR, "index \"%s\" already contains data", + RelationGetRelationName(index)); + ++#ifdef NEON_SMGR ++ smgr_start_unlogged_build(index->rd_smgr); ++#endif ++ + initRumState(&buildstate.rumstate, index); + buildstate.rumstate.isBuild = true; + buildstate.indtuples = 0; +@@ -693,6 +697,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo) + buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index); + rumUpdateStats(index, &buildstate.buildStats, buildstate.rumstate.isBuild); + ++#ifdef NEON_SMGR ++ smgr_finish_unlogged_build_phase_1(index->rd_smgr); ++#endif ++ + /* + * Write index to xlog + */ +@@ -713,6 +721,21 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo) + UnlockReleaseBuffer(buffer); + } + ++#ifdef NEON_SMGR ++ { ++#if PG_VERSION_NUM >= 160000 ++ RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator; ++#else ++ RelFileNode rlocator = 
RelationGetSmgr(index)->smgr_rnode.node; ++#endif ++ ++ SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); ++ SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM); ++ ++ smgr_end_unlogged_build(index->rd_smgr); ++ } ++#endif ++ + /* + * Return statistics + */ diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index cd316dbb91..3b755bb042 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -6,6 +6,7 @@ OBJS = \ $(WIN32RES) \ extension_server.o \ file_cache.o \ + hll.o \ libpagestore.o \ neon.o \ neon_utils.o \ @@ -22,7 +23,7 @@ SHLIB_LINK_INTERNAL = $(libpq) SHLIB_LINK = -lcurl EXTENSION = neon -DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql +DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql neon--1.3--1.4.sql neon--1.4--1.3.sql PGFILEDESC = "neon - cloud storage for PostgreSQL" EXTRA_CLEAN = \ diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c index 93252e6b29..de023da5c4 100644 --- a/pgxn/neon/control_plane_connector.c +++ b/pgxn/neon/control_plane_connector.c @@ -45,6 +45,7 @@ static const char *jwt_token = NULL; /* GUCs */ static char *ConsoleURL = NULL; static bool ForwardDDL = true; +static bool RegressTestMode = false; /* * CURL docs say that this buffer must exist until we call curl_easy_cleanup @@ -802,6 +803,14 @@ NeonProcessUtility( case T_DropRoleStmt: HandleDropRole(castNode(DropRoleStmt, parseTree)); break; + case T_CreateTableSpaceStmt: + if (!RegressTestMode) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("CREATE TABLESPACE is not supported on Neon"))); + } + break; default: break; } @@ -864,6 +873,18 @@ InitControlPlaneConnector() NULL, NULL); + DefineCustomBoolVariable( + "neon.regress_test_mode", + "Controls whether we are running in 
the regression test mode", + NULL, + &RegressTestMode, + false, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + jwt_token = getenv("NEON_CONTROL_PLANE_TOKEN"); if (!jwt_token) { diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 25275ef31f..479209a537 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -26,7 +26,6 @@ #include "miscadmin.h" #include "pagestore_client.h" #include "common/hashfn.h" -#include "lib/hyperloglog.h" #include "pgstat.h" #include "postmaster/bgworker.h" #include RELFILEINFO_HDR @@ -40,6 +39,10 @@ #include "utils/dynahash.h" #include "utils/guc.h" +#include "hll.h" + +#define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0) + /* * Local file cache is used to temporary store relations pages in local file system. * All blocks of all relations are stored inside one file and addressed using shared hash map. @@ -50,20 +53,43 @@ * * Cache is always reconstructed at node startup, so we do not need to save mapping somewhere and worry about * its consistency. + + * + * ## Holes + * + * The LFC can be resized on the fly, up to a maximum size that's determined + * at server startup (neon.max_file_cache_size). After server startup, we + * expand the underlying file when needed, until it reaches the soft limit + * (neon.file_cache_size_limit). If the soft limit is later reduced, we shrink + * the LFC by punching holes in the underlying file with a + * fallocate(FALLOC_FL_PUNCH_HOLE) call. The nominal size of the file doesn't + * shrink, but the disk space it uses does. + * + * Each hole is tracked by a dummy FileCacheEntry, which are kept in the + * 'holes' linked list. They are entered into the chunk hash table, with a + * special key where the blockNumber is used to store the 'offset' of the + * hole, and all other fields are zero. 
Holes are never looked up in the hash + * table, we only enter them there to have a FileCacheEntry that we can keep + * in the linked list. If the soft limit is raised again, we reuse the holes + * before extending the nominal size of the file. */ /* Local file storage allocation chunk. - * Should be power of two and not less than 32. Using larger than page chunks can + * Should be power of two. Using larger than page chunks can * 1. Reduce hash-map memory footprint: 8TB database contains billion pages * and size of hash entry is 40 bytes, so we need 40Gb just for hash map. * 1Mb chunks can reduce hash map size to 320Mb. * 2. Improve access locality, subsequent pages will be allocated together improving seqscan speed */ #define BLOCKS_PER_CHUNK 128 /* 1Mb chunk */ +/* + * Smaller chunk seems to be better for OLTP workload + */ +// #define BLOCKS_PER_CHUNK 8 /* 64kb chunk */ #define MB ((uint64)1024*1024) -#define HYPER_LOG_LOG_BIT_WIDTH 10 #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK)) +#define CHUNK_BITMAP_SIZE ((BLOCKS_PER_CHUNK + 31) / 32) typedef struct FileCacheEntry { @@ -71,8 +97,8 @@ typedef struct FileCacheEntry uint32 hash; uint32 offset; uint32 access_count; - uint32 bitmap[BLOCKS_PER_CHUNK / 32]; - dlist_node lru_node; /* LRU list node */ + uint32 bitmap[CHUNK_BITMAP_SIZE]; + dlist_node list_node; /* LRU/holes list node */ } FileCacheEntry; typedef struct FileCacheControl @@ -87,8 +113,8 @@ typedef struct FileCacheControl uint64 writes; dlist_head lru; /* double linked list for LRU replacement * algorithm */ - hyperLogLogState wss_estimation; /* estimation of wroking set size */ - uint8_t hyperloglog_hashes[(1 << HYPER_LOG_LOG_BIT_WIDTH) + 1]; + dlist_head holes; /* double linked list of punched holes */ + HyperLogLogState wss_estimation; /* estimation of working set size */ } FileCacheControl; static HTAB *lfc_hash; @@ -136,6 +162,7 @@ lfc_disable(char const *op) lfc_ctl->used = 0; lfc_ctl->limit = 0; 
dlist_init(&lfc_ctl->lru); + dlist_init(&lfc_ctl->holes); if (lfc_desc > 0) { @@ -215,18 +242,18 @@ lfc_shmem_startup(void) if (!found) { int fd; - uint32 lfc_size = SIZE_MB_TO_CHUNKS(lfc_max_size); + uint32 n_chunks = SIZE_MB_TO_CHUNKS(lfc_max_size); lfc_lock = (LWLockId) GetNamedLWLockTranche("lfc_lock"); info.keysize = sizeof(BufferTag); info.entrysize = sizeof(FileCacheEntry); /* - * lfc_size+1 because we add new element to hash table before eviction + * n_chunks+1 because we add new element to hash table before eviction * of victim */ lfc_hash = ShmemInitHash("lfc_hash", - lfc_size + 1, lfc_size + 1, + n_chunks + 1, n_chunks + 1, &info, HASH_ELEM | HASH_BLOBS); lfc_ctl->generation = 0; @@ -236,14 +263,10 @@ lfc_shmem_startup(void) lfc_ctl->misses = 0; lfc_ctl->writes = 0; dlist_init(&lfc_ctl->lru); + dlist_init(&lfc_ctl->holes); /* Initialize hyper-log-log structure for estimating working set size */ - initHyperLogLog(&lfc_ctl->wss_estimation, HYPER_LOG_LOG_BIT_WIDTH); - - /* We need hashes in shared memory */ - pfree(lfc_ctl->wss_estimation.hashesArr); - memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes); - lfc_ctl->wss_estimation.hashesArr = lfc_ctl->hyperloglog_hashes; + initSHLL(&lfc_ctl->wss_estimation); /* Recreate file cache on restart */ fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC); @@ -316,14 +339,31 @@ lfc_change_limit_hook(int newval, void *extra) * Shrink cache by throwing away least recently accessed chunks and * returning their space to file system */ - FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru)); + FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru)); + FileCacheEntry *hole; + uint32 offset = victim->offset; + uint32 hash; + bool found; + BufferTag holetag; - Assert(victim->access_count == 0); + CriticalAssert(victim->access_count == 0); #ifdef FALLOC_FL_PUNCH_HOLE if (fallocate(lfc_desc, 
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * BLOCKS_PER_CHUNK * BLCKSZ, BLOCKS_PER_CHUNK * BLCKSZ) < 0) neon_log(LOG, "Failed to punch hole in file: %m"); #endif + /* We remove the old entry, and re-enter a hole to the hash table */ hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL); + + memset(&holetag, 0, sizeof(holetag)); + holetag.blockNum = offset; + hash = get_hash_value(lfc_hash, &holetag); + hole = hash_search_with_hash_value(lfc_hash, &holetag, hash, HASH_ENTER, &found); + hole->hash = hash; + hole->offset = offset; + hole->access_count = 0; + CriticalAssert(!found); + dlist_push_tail(&lfc_ctl->holes, &hole->list_node); + lfc_ctl->used -= 1; } lfc_ctl->limit = new_size; @@ -415,6 +455,8 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) CopyNRelFileInfoToBufTag(tag, rinfo); tag.forkNum = forkNum; tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); + + CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); hash = get_hash_value(lfc_hash, &tag); LWLockAcquire(lfc_lock, LW_SHARED); @@ -446,6 +488,7 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) tag.forkNum = forkNum; tag.blockNum = (blkno & ~(BLOCKS_PER_CHUNK - 1)); + CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); hash = get_hash_value(lfc_hash, &tag); LWLockAcquire(lfc_lock, LW_EXCLUSIVE); @@ -476,7 +519,7 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) { bool has_remaining_pages; - for (int i = 0; i < (BLOCKS_PER_CHUNK / 32); i++) + for (int i = 0; i < CHUNK_BITMAP_SIZE; i++) { if (entry->bitmap[i] != 0) { @@ -491,8 +534,8 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) */ if (!has_remaining_pages) { - dlist_delete(&entry->lru_node); - dlist_push_head(&lfc_ctl->lru, &entry->lru_node); + dlist_delete(&entry->list_node); + dlist_push_head(&lfc_ctl->lru, &entry->list_node); } } @@ -531,6 +574,8 @@ lfc_read(NRelFileInfo rinfo, 
ForkNumber forkNum, BlockNumber blkno, CopyNRelFileInfoToBufTag(tag, rinfo); tag.forkNum = forkNum; tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); + + CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); hash = get_hash_value(lfc_hash, &tag); LWLockAcquire(lfc_lock, LW_EXCLUSIVE); @@ -545,7 +590,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, /* Approximate working set */ tag.blockNum = blkno; - addHyperLogLog(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); + addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0) { @@ -557,7 +602,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, } /* Unlink entry from LRU list to pin it for the duration of IO operation */ if (entry->access_count++ == 0) - dlist_delete(&entry->lru_node); + dlist_delete(&entry->list_node); generation = lfc_ctl->generation; entry_offset = entry->offset; @@ -575,12 +620,12 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (lfc_ctl->generation == generation) { - Assert(LFC_ENABLED()); + CriticalAssert(LFC_ENABLED()); lfc_ctl->hits += 1; pgBufferUsage.file_cache.hits += 1; - Assert(entry->access_count > 0); + CriticalAssert(entry->access_count > 0); if (--entry->access_count == 0) - dlist_push_tail(&lfc_ctl->lru, &entry->lru_node); + dlist_push_tail(&lfc_ctl->lru, &entry->list_node); } else result = false; @@ -619,6 +664,8 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void tag.forkNum = forkNum; tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); CopyNRelFileInfoToBufTag(tag, rinfo); + + CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); hash = get_hash_value(lfc_hash, &tag); LWLockAcquire(lfc_lock, LW_EXCLUSIVE); @@ -638,7 +685,7 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void * operation */ if 
(entry->access_count++ == 0) - dlist_delete(&entry->lru_node); + dlist_delete(&entry->list_node); } else { @@ -661,13 +708,26 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru)) { /* Cache overflow: evict least recently used chunk */ - FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru)); + FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru)); - Assert(victim->access_count == 0); + CriticalAssert(victim->access_count == 0); entry->offset = victim->offset; /* grab victim's chunk */ hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL); neon_log(DEBUG2, "Swap file cache page"); } + else if (!dlist_is_empty(&lfc_ctl->holes)) + { + /* We can reuse a hole that was left behind when the LFC was shrunk previously */ + FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes)); + uint32 offset = hole->offset; + bool found; + + hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &found); + CriticalAssert(found); + + lfc_ctl->used += 1; + entry->offset = offset; /* reuse the hole */ + } else { lfc_ctl->used += 1; @@ -695,11 +755,11 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void if (lfc_ctl->generation == generation) { - Assert(LFC_ENABLED()); + CriticalAssert(LFC_ENABLED()); /* Place entry to the head of LRU list */ - Assert(entry->access_count > 0); + CriticalAssert(entry->access_count > 0); if (--entry->access_count == 0) - dlist_push_tail(&lfc_ctl->lru, &entry->lru_node); + dlist_push_tail(&lfc_ctl->lru, &entry->list_node); entry->bitmap[chunk_offs >> 5] |= (1 << (chunk_offs & 31)); } @@ -714,7 +774,6 @@ typedef struct } NeonGetStatsCtx; #define NUM_NEON_GET_STATS_COLS 2 -#define NUM_NEON_GET_STATS_ROWS 3 
PG_FUNCTION_INFO_V1(neon_get_lfc_stats); Datum @@ -750,7 +809,6 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS) INT8OID, -1, 0); fctx->tupdesc = BlessTupleDesc(tupledesc); - funcctx->max_calls = NUM_NEON_GET_STATS_ROWS; funcctx->user_fctx = fctx; /* Return to original context when allocating transient memory */ @@ -784,6 +842,11 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS) if (lfc_ctl) value = lfc_ctl->writes; break; + case 4: + key = "file_cache_size"; + if (lfc_ctl) + value = lfc_ctl->size; + break; default: SRF_RETURN_DONE(funcctx); } @@ -907,7 +970,7 @@ local_cache_pages(PG_FUNCTION_ARGS) hash_seq_init(&status, lfc_hash); while ((entry = hash_seq_search(&status)) != NULL) { - for (int i = 0; i < BLOCKS_PER_CHUNK / 32; i++) + for (int i = 0; i < CHUNK_BITMAP_SIZE; i++) n_pages += pg_popcount32(entry->bitmap[i]); } } @@ -986,20 +1049,38 @@ local_cache_pages(PG_FUNCTION_ARGS) SRF_RETURN_DONE(funcctx); } +PG_FUNCTION_INFO_V1(approximate_working_set_size_seconds); + +Datum +approximate_working_set_size_seconds(PG_FUNCTION_ARGS) +{ + if (lfc_size_limit != 0) + { + int32 dc; + time_t duration = PG_ARGISNULL(0) ? (time_t)-1 : PG_GETARG_INT32(0); + LWLockAcquire(lfc_lock, LW_SHARED); + dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration); + LWLockRelease(lfc_lock); + PG_RETURN_INT32(dc); + } + PG_RETURN_NULL(); +} + PG_FUNCTION_INFO_V1(approximate_working_set_size); Datum approximate_working_set_size(PG_FUNCTION_ARGS) { - int32 dc = -1; if (lfc_size_limit != 0) { + int32 dc; bool reset = PG_GETARG_BOOL(0); LWLockAcquire(lfc_lock, reset ? 
LW_EXCLUSIVE : LW_SHARED); - dc = (int32) estimateHyperLogLog(&lfc_ctl->wss_estimation); + dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, (time_t)-1); if (reset) - memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes); + memset(lfc_ctl->wss_estimation.regs, 0, sizeof lfc_ctl->wss_estimation.regs); LWLockRelease(lfc_lock); + PG_RETURN_INT32(dc); } - PG_RETURN_INT32(dc); + PG_RETURN_NULL(); } diff --git a/pgxn/neon/hll.c b/pgxn/neon/hll.c new file mode 100644 index 0000000000..f8496b3125 --- /dev/null +++ b/pgxn/neon/hll.c @@ -0,0 +1,193 @@ +/*------------------------------------------------------------------------- + * + * hll.c + * Sliding HyperLogLog cardinality estimator + * + * Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group + * + * Implements https://hal.science/hal-00465313/document + * + * Based on Hideaki Ohno's C++ implementation. This is probably not ideally + * suited to estimating the cardinality of very large sets; in particular, we + * have not attempted to further optimize the implementation as described in + * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic + * Engineering of a State of The Art Cardinality Estimation Algorithm". + * + * A sparse representation of HyperLogLog state is used, with fixed space + * overhead. + * + * The copyright terms of Ohno's original version (the MIT license) follow. 
+ * + * IDENTIFICATION + * src/backend/lib/hyperloglog.c + * + *------------------------------------------------------------------------- + */ + +/* + * Copyright (c) 2013 Hideaki Ohno + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the 'Software'), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include + +#include "postgres.h" +#include "funcapi.h" +#include "port/pg_bitutils.h" +#include "utils/timestamp.h" +#include "hll.h" + + +#define POW_2_32 (4294967296.0) +#define NEG_POW_2_32 (-4294967296.0) + +#define ALPHA_MM ((0.7213 / (1.0 + 1.079 / HLL_N_REGISTERS)) * HLL_N_REGISTERS * HLL_N_REGISTERS) + +/* + * Worker for addHyperLogLog(). + * + * Calculates the position of the first set bit in first b bits of x argument + * starting from the first, reading from most significant to least significant + * bits. 
+ * + * Example (when considering first 10 bits of x): + * + * rho(x = 0b1000000000) returns 1 + * rho(x = 0b0010000000) returns 3 + * rho(x = 0b0000000000) returns b + 1 + * + * "The binary address determined by the first b bits of x" + * + * Return value "j" used to index bit pattern to watch. + */ +static inline uint8 +rho(uint32 x, uint8 b) +{ + uint8 j = 1; + + if (x == 0) + return b + 1; + + j = 32 - pg_leftmost_one_pos32(x); + + if (j > b) + return b + 1; + + return j; +} + +/* + * Initialize HyperLogLog track state + */ +void +initSHLL(HyperLogLogState *cState) +{ + memset(cState->regs, 0, sizeof(cState->regs)); +} + +/* + * Adds element to the estimator, from caller-supplied hash. + * + * It is critical that the hash value passed be an actual hash value, typically + * generated using hash_any(). The algorithm relies on a specific bit-pattern + * observable in conjunction with stochastic averaging. There must be a + * uniform distribution of bits in hash values for each distinct original value + * observed. 
+ */ +void +addSHLL(HyperLogLogState *cState, uint32 hash) +{ + uint8 count; + uint32 index; + size_t i; + size_t j; + + TimestampTz now = GetCurrentTimestamp(); + /* Use the first "k" (registerWidth) bits as a zero based index */ + index = hash >> HLL_C_BITS; + + /* Compute the rank of the remaining 32 - "k" (registerWidth) bits */ + count = rho(hash << HLL_BIT_WIDTH, HLL_C_BITS); + + cState->regs[index][count] = now; +} + +static uint8 +getMaximum(const TimestampTz* reg, TimestampTz since) +{ + uint8 max = 0; + + for (size_t i = 0; i < HLL_C_BITS + 1; i++) + { + if (reg[i] >= since) + { + max = i; + } + } + + return max; +} + + +/* + * Estimates cardinality, based on elements added so far + */ +double +estimateSHLL(HyperLogLogState *cState, time_t duration) +{ + double result; + double sum = 0.0; + size_t i; + uint8 R[HLL_N_REGISTERS]; + /* 0 indicates uninitialized timestamp, so if we need to cover the whole range than starts with 1 */ + TimestampTz since = duration == (time_t)-1 ? 1 : GetCurrentTimestamp() - duration * USECS_PER_SEC; + + for (i = 0; i < HLL_N_REGISTERS; i++) + { + R[i] = getMaximum(cState->regs[i], since); + sum += 1.0 / pow(2.0, R[i]); + } + + /* result set to "raw" HyperLogLog estimate (E in the HyperLogLog paper) */ + result = ALPHA_MM / sum; + + if (result <= (5.0 / 2.0) * HLL_N_REGISTERS) + { + /* Small range correction */ + int zero_count = 0; + + for (i = 0; i < HLL_N_REGISTERS; i++) + { + zero_count += R[i] == 0; + } + + if (zero_count != 0) + result = HLL_N_REGISTERS * log((double) HLL_N_REGISTERS / + zero_count); + } + else if (result > (1.0 / 30.0) * POW_2_32) + { + /* Large range correction */ + result = NEG_POW_2_32 * log(1.0 - (result / POW_2_32)); + } + + return result; +} + diff --git a/pgxn/neon/hll.h b/pgxn/neon/hll.h new file mode 100644 index 0000000000..9256cb9afa --- /dev/null +++ b/pgxn/neon/hll.h @@ -0,0 +1,86 @@ +/*------------------------------------------------------------------------- + * + * hll.h + * Sliding 
HyperLogLog cardinality estimator + * + * Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group + * + * Implements https://hal.science/hal-00465313/document + * + * Based on Hideaki Ohno's C++ implementation. This is probably not ideally + * suited to estimating the cardinality of very large sets; in particular, we + * have not attempted to further optimize the implementation as described in + * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic + * Engineering of a State of The Art Cardinality Estimation Algorithm". + * + * A sparse representation of HyperLogLog state is used, with fixed space + * overhead. + * + * The copyright terms of Ohno's original version (the MIT license) follow. + * + * IDENTIFICATION + * src/backend/lib/hyperloglog.c + * + *------------------------------------------------------------------------- + */ + +/* + * Copyright (c) 2013 Hideaki Ohno + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the 'Software'), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef HLL_H +#define HLL_H + +#define HLL_BIT_WIDTH 10 +#define HLL_C_BITS (32 - HLL_BIT_WIDTH) +#define HLL_N_REGISTERS (1 << HLL_BIT_WIDTH) + +/* + * HyperLogLog is an approximate technique for computing the number of distinct + * entries in a set. Importantly, it does this by using a fixed amount of + * memory. See the 2007 paper "HyperLogLog: the analysis of a near-optimal + * cardinality estimation algorithm" for more. + * + * Instead of a single counter for every bits register, we have a timestamp + * for every valid number of bits we can encounter. Every time we encounter + * a certain number of bits, we update the timestamp in those registers to + * the current timestamp. + * + * We can query the sketch's stored cardinality for the range of some timestamp + * up to now: For each register, we return the highest bits bucket that has a + * modified timestamp >= the query timestamp. This value is the number of bits + * for this register in the normal HLL calculation. + * + * The memory usage is 2^B * (C + 1) * sizeof(TimestampTz), or 184kiB. + * Usage could be halved if we decide to reduce the required time dimension + * precision; as 32 bits in second precision should be enough for statistics. + * However, that is not yet implemented. 
+ */ +typedef struct HyperLogLogState +{ + TimestampTz regs[HLL_N_REGISTERS][HLL_C_BITS + 1]; +} HyperLogLogState; + +extern void initSHLL(HyperLogLogState *cState); +extern void addSHLL(HyperLogLogState *cState, uint32 hash); +extern double estimateSHLL(HyperLogLogState *cState, time_t dutration); + +#endif diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index a665cafafe..73a001b6ba 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -427,12 +427,17 @@ pageserver_connect(shardno_t shard_no, int elevel) values[n_pgsql_params] = NULL; shard->conn = PQconnectStartParams(keywords, values, 1); - if (!shard->conn) + if (PQstatus(shard->conn) == CONNECTION_BAD) { - neon_shard_log(shard_no, elevel, "Failed to connect to pageserver: out of memory"); + char *msg = pchomp(PQerrorMessage(shard->conn)); + CLEANUP_AND_DISCONNECT(shard); + ereport(elevel, + (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), + errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no), + errdetail_internal("%s", msg))); + pfree(msg); return false; } - shard->state = PS_Connecting_Startup; /* fallthrough */ } diff --git a/pgxn/neon/neon--1.3--1.4.sql b/pgxn/neon/neon--1.3--1.4.sql new file mode 100644 index 0000000000..042effe346 --- /dev/null +++ b/pgxn/neon/neon--1.3--1.4.sql @@ -0,0 +1,9 @@ +\echo Use "ALTER EXTENSION neon UPDATE TO '1.4'" to load this file. 
\quit + +CREATE FUNCTION approximate_working_set_size_seconds(duration integer default null) +RETURNS integer +AS 'MODULE_PATHNAME', 'approximate_working_set_size_seconds' +LANGUAGE C PARALLEL SAFE; + +GRANT EXECUTE ON FUNCTION approximate_working_set_size_seconds(integer) TO pg_monitor; + diff --git a/pgxn/neon/neon--1.4--1.3.sql b/pgxn/neon/neon--1.4--1.3.sql new file mode 100644 index 0000000000..bea72d1a6b --- /dev/null +++ b/pgxn/neon/neon--1.4--1.3.sql @@ -0,0 +1 @@ +DROP FUNCTION IF EXISTS approximate_working_set_size_seconds(integer) CASCADE; diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 276d1542fe..fe8e276d1c 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -12,6 +12,8 @@ #include "fmgr.h" #include "miscadmin.h" +#include "access/subtrans.h" +#include "access/twophase.h" #include "access/xact.h" #include "access/xlog.h" #include "storage/buf_internals.h" @@ -22,12 +24,15 @@ #include "replication/logical.h" #include "replication/slot.h" #include "replication/walsender.h" +#include "storage/proc.h" #include "storage/procsignal.h" #include "tcop/tcopprot.h" #include "funcapi.h" #include "access/htup_details.h" +#include "utils/builtins.h" #include "utils/pg_lsn.h" #include "utils/guc.h" +#include "utils/guc_tables.h" #include "utils/wait_event.h" #include "extension_server.h" @@ -41,7 +46,21 @@ PG_MODULE_MAGIC; void _PG_init(void); static int logical_replication_max_snap_files = 300; -bool primary_is_running = false; + +static int running_xacts_overflow_policy; + +enum RunningXactsOverflowPolicies { + OP_IGNORE, + OP_SKIP, + OP_WAIT +}; + +static const struct config_enum_entry running_xacts_overflow_policies[] = { + {"ignore", OP_IGNORE, false}, + {"skip", OP_SKIP, false}, + {"wait", OP_WAIT, false}, + {NULL, 0, false} +}; static void InitLogicalReplicationMonitor(void) @@ -50,10 +69,10 @@ InitLogicalReplicationMonitor(void) DefineCustomIntVariable( "neon.logical_replication_max_snap_files", - "Maximum allowed logical replication .snap files", 
+ "Maximum allowed logical replication .snap files. When exceeded, slots are dropped until the limit is met. -1 disables the limit.", NULL, &logical_replication_max_snap_files, - 300, 0, INT_MAX, + 300, -1, INT_MAX, PGC_SIGHUP, 0, NULL, NULL, NULL); @@ -173,6 +192,13 @@ LogicalSlotsMonitorMain(Datum main_arg) { XLogRecPtr cutoff_lsn; + /* In case of a SIGHUP, just reload the configuration. */ + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + } + /* * If there are too many .snap files, just drop all logical slots to * prevent aux files bloat. @@ -267,6 +293,339 @@ LogicalSlotsMonitorMain(Datum main_arg) } } +/* + * XXX: These private to procarray.c, but we need them here. + */ +#define PROCARRAY_MAXPROCS (MaxBackends + max_prepared_xacts) +#define TOTAL_MAX_CACHED_SUBXIDS \ + ((PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS) + +/* + * Restore running-xact information by scanning the CLOG at startup. + * + * In PostgreSQL, a standby always has to wait for a running-xacts WAL record + * to arrive before it can start accepting queries. Furthermore, if there are + * transactions with too many subxids (> 64) open to fit in the in-memory + * subxids cache, the running-xacts record will be marked as "suboverflowed", + * and the standby will need to also wait for the currently in-progress + * transactions to finish. + * + * That's not great in PostgreSQL, because a hot standby does not necessary + * open up for queries immediately as you might expect. But it's worse in + * Neon: A standby in Neon doesn't need to start WAL replay from a checkpoint + * record; it can start at any LSN. Postgres arranges things so that there is + * a running-xacts record soon after every checkpoint record, but when you + * start from an arbitrary LSN, that doesn't help. 
If the primary is idle, or + * not running at all, it might never write a new running-xacts record, + * leaving the replica in a limbo where it can never start accepting queries. + * + * To mitigate that, we have an additional mechanism to find the running-xacts + * information: we scan the CLOG, making note of any XIDs not marked as + * committed or aborted. They are added to the Postgres known-assigned XIDs + * array by calling ProcArrayApplyRecoveryInfo() in the caller of this + * function. + * + * There is one big limitation with that mechanism: The size of the + * known-assigned XIDs is limited, so if there are a lot of in-progress XIDs, + * we have to give up. Furthermore, we don't know how many of the in-progress + * XIDs are subtransactions, and if we use up all the space in the + * known-assigned XIDs array for subtransactions, we might run out of space in + * the array later during WAL replay, causing the replica to shut down with + * "ERROR: too many KnownAssignedXids". The safe # of XIDs that we can add to + * the known-assigned array without risking that error later is very low, + * merely PGPROC_MAX_CACHED_SUBXIDS == 64, so we take our chances and use up + * to half of the known-assigned XIDs array for the subtransactions, even + * though that risks getting the error later. + * + * Note: It's OK if the recovered list of XIDs includes some transactions that + * have crashed in the primary, and hence will never commit. They will be seen + * as in-progress, until we see a new running-xacts record with an + * oldestActiveXid that invalidates them. That's how the known-assigned XIDs + * array always works. + * + * If scraping the CLOG doesn't succeed for some reason, like the subxid + * overflow, Postgres will fall back to waiting for a running-xacts record + * like usual. + * + * Returns true if a complete list of in-progress XIDs was scraped. 
+ */ +static bool +RestoreRunningXactsFromClog(CheckPoint *checkpoint, TransactionId **xids, int *nxids) +{ + TransactionId from; + TransactionId till; + int max_xcnt; + TransactionId *prepared_xids = NULL; + int n_prepared_xids; + TransactionId *restored_xids = NULL; + int n_restored_xids; + int next_prepared_idx; + + Assert(*xids == NULL); + + /* + * If the checkpoint doesn't have a valid oldestActiveXid, bail out. We + * don't know where to start the scan. + * + * This shouldn't happen, because the pageserver always maintains a valid + * oldestActiveXid nowadays. Except when starting at an old point in time + * that was ingested before the pageserver was taught to do that. + */ + if (!TransactionIdIsValid(checkpoint->oldestActiveXid)) + { + elog(LOG, "cannot restore running-xacts from CLOG because oldestActiveXid is not set"); + goto fail; + } + + /* + * We will scan the CLOG starting from the oldest active XID. + * + * In some corner cases, the oldestActiveXid from the last checkpoint + * might already have been truncated from the CLOG. That is, + * oldestActiveXid might be older than oldestXid. That's possible because + * oldestActiveXid is only updated at checkpoints. After the last + * checkpoint, the oldest transaction might have committed, and the CLOG + * might also have been already truncated. So if oldestActiveXid is older + * than oldestXid, start at oldestXid instead. (Otherwise we'd try to + * access CLOG segments that have already been truncated away.) + */ + from = TransactionIdPrecedes(checkpoint->oldestXid, checkpoint->oldestActiveXid) + ? checkpoint->oldestActiveXid : checkpoint->oldestXid; + till = XidFromFullTransactionId(checkpoint->nextXid); + + /* + * To avoid "too many KnownAssignedXids" error later during replay, we + * limit number of collected transactions. This is a tradeoff: if we are + * willing to consume more of the KnownAssignedXids space for the XIDs + * now, that allows us to start up, but we might run out of space later. 
+ * + * The size of the KnownAssignedXids array is TOTAL_MAX_CACHED_SUBXIDS, + * which is ((PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS). In + * PostgreSQL, that's always enough because the primary will always write + * an XLOG_XACT_ASSIGNMENT record if a transaction has more than + * PGPROC_MAX_CACHED_SUBXIDS subtransactions. Seeing that record allows + * the standby to mark the XIDs in pg_subtrans and remove them from the + * KnownAssignedXids array. + * + * Here, we don't know which XIDs belong to subtransactions that have + * already been WAL-logged with an XLOG_XACT_ASSIGNMENT record. If we + * wanted to be totally safe and avoid the possibility of getting a "too + * many KnownAssignedXids" error later, we would have to limit ourselves + * to PGPROC_MAX_CACHED_SUBXIDS, which is not much. And that includes top + * transaction IDs too, because we cannot distinguish between top + * transaction IDs and subtransactions here. + * + * Somewhat arbitrarily, we use up to half of KnownAssignedXids. That + * strikes a sensible balance between being useful, and risking a "too + * many KnownAssignedXids" error later. + */ + max_xcnt = TOTAL_MAX_CACHED_SUBXIDS / 2; + + /* + * Collect XIDs of prepared transactions in an array. This includes only + * their top-level XIDs. We assume that StandbyRecoverPreparedTransactions + * has already been called, so we can find all the sub-transactions in + * pg_subtrans. + */ + PrescanPreparedTransactions(&prepared_xids, &n_prepared_xids); + qsort(prepared_xids, n_prepared_xids, sizeof(TransactionId), xidLogicalComparator); + + /* + * Scan the CLOG, collecting in-progress XIDs into 'restored_xids'. 
+ */ + elog(DEBUG1, "scanning CLOG between %u and %u for in-progress XIDs", from, till); + restored_xids = (TransactionId *) palloc(max_xcnt * sizeof(TransactionId)); + n_restored_xids = 0; + next_prepared_idx = 0; + + for (TransactionId xid = from; xid != till;) + { + XLogRecPtr xidlsn; + XidStatus xidstatus; + + xidstatus = TransactionIdGetStatus(xid, &xidlsn); + + /* + * "Merge" the prepared transactions into the restored_xids array as + * we go. The prepared transactions array is sorted. This is mostly + * a sanity check to ensure that all the prepared transactions are + * seen as in-progress. (There is a check after the loop that we didn't + * miss any.) + */ + if (next_prepared_idx < n_prepared_xids && xid == prepared_xids[next_prepared_idx]) + { + /* + * This is a top-level transaction ID of a prepared transaction. + * Include it in the array. + */ + + /* sanity check */ + if (xidstatus != TRANSACTION_STATUS_IN_PROGRESS) + { + elog(LOG, "prepared transaction %u has unexpected status %X, cannot restore running-xacts from CLOG", + xid, xidstatus); + Assert(false); + goto fail; + } + + elog(DEBUG1, "XID %u: was next prepared xact (%d / %d)", xid, next_prepared_idx, n_prepared_xids); + next_prepared_idx++; + } + else if (xidstatus == TRANSACTION_STATUS_COMMITTED) + { + elog(DEBUG1, "XID %u: was committed", xid); + goto skip; + } + else if (xidstatus == TRANSACTION_STATUS_ABORTED) + { + elog(DEBUG1, "XID %u: was aborted", xid); + goto skip; + } + else if (xidstatus == TRANSACTION_STATUS_IN_PROGRESS) + { + /* + * In-progress transactions are included in the array. + * + * Except subtransactions of the prepared transactions. They are + * already set in pg_subtrans, and hence don't need to be tracked + * in the known-assigned XIDs array. + */ + if (n_prepared_xids > 0) + { + TransactionId parent = SubTransGetParent(xid); + + if (TransactionIdIsValid(parent)) + { + /* + * This is a subtransaction belonging to a prepared + * transaction. 
+ * + * Sanity check that it is in the prepared XIDs array. It + * should be, because StandbyRecoverPreparedTransactions + * populated pg_subtrans, and no other XID should be set + * in it yet. (This also relies on the fact that + * StandbyRecoverPreparedTransactions sets the parent of + * each subxid to point directly to the top-level XID, + * rather than restoring the original subtransaction + * hierarchy.) + */ + if (bsearch(&parent, prepared_xids, next_prepared_idx, + sizeof(TransactionId), xidLogicalComparator) == NULL) + { + elog(LOG, "sub-XID %u has unexpected parent %u, cannot restore running-xacts from CLOG", + xid, parent); + Assert(false); + goto fail; + } + elog(DEBUG1, "XID %u: was a subtransaction of prepared xid %u", xid, parent); + goto skip; + } + } + + /* include it in the array */ + elog(DEBUG1, "XID %u: is in progress", xid); + } + else + { + /* + * SUB_COMMITTED is a transient state used at commit. We don't + * expect to see that here. + */ + elog(LOG, "XID %u has unexpected status %X in pg_xact, cannot restore running-xacts from CLOG", + xid, xidstatus); + Assert(false); + goto fail; + } + + if (n_restored_xids >= max_xcnt) + { + /* + * Overflowed. We won't be able to install the RunningTransactions + * snapshot. 
+ */ + elog(LOG, "too many running xacts to restore from the CLOG; oldestXid=%u oldestActiveXid=%u nextXid %u", + checkpoint->oldestXid, checkpoint->oldestActiveXid, + XidFromFullTransactionId(checkpoint->nextXid)); + + switch (running_xacts_overflow_policy) + { + case OP_WAIT: + goto fail; + case OP_IGNORE: + goto success; + case OP_SKIP: + n_restored_xids = 0; + goto success; + } + } + + restored_xids[n_restored_xids++] = xid; + + skip: + TransactionIdAdvance(xid); + } + + /* sanity check */ + if (next_prepared_idx != n_prepared_xids) + { + elog(LOG, "prepared transaction ID %u was not visited in the CLOG scan, cannot restore running-xacts from CLOG", + prepared_xids[next_prepared_idx]); + Assert(false); + goto fail; + } + success: + elog(LOG, "restored %d running xacts by scanning the CLOG; oldestXid=%u oldestActiveXid=%u nextXid %u", + n_restored_xids, checkpoint->oldestXid, checkpoint->oldestActiveXid, XidFromFullTransactionId(checkpoint->nextXid)); + *nxids = n_restored_xids; + *xids = restored_xids; + if (prepared_xids) + pfree(prepared_xids); + return true; + + fail: + *nxids = 0; + *xids = NULL; + if (restored_xids) + pfree(restored_xids); + if (prepared_xids) + pfree(prepared_xids); + return false; +} + + +/* + * pgbouncer is able to track GUCs reported by Postgres. + * But most parameters cannot be tracked this way. The only parameters that can be tracked are ones + * that Postgres reports to the client. Unfortunately `search_path` is not reported by Postgres: + * https://www.postgresql.org/message-id/flat/CAGECzQQ6xFcgrg%2Be0p9mCumtK362TiA6vTiiZKoYbS8OXggwuQ%40mail.gmail.com#be4bfd7a9cf1f0633bdb2d1790a0a1be + * This code sets GUC_REPORT flag for `search_path`making it possible to include it in + * pgbouncer's `track_extra_parameters` list. 
+ * + * This code is inspired by how the Citus extension does this, see + * https://github.com/citusdata/citus/blob/2a263fe69a707d16ef24378f7650742386b0968f/src/backend/distributed/shared_library_init.c#L2694 + */ +static void +ReportSearchPath(void) +{ +#if PG_VERSION_NUM >= 160000 + int nGucs = 0; + struct config_generic **gucs = get_guc_variables(&nGucs); +#else + struct config_generic **gucs = get_guc_variables(); + int nGucs = GetNumConfigOptions(); +#endif + + for (int i = 0; i < nGucs; i++) + { + struct config_generic *guc = (struct config_generic *) gucs[i]; + + if (strcmp(guc->name, "search_path") == 0) + { + guc->flags |= GUC_REPORT; + } + } +} + void _PG_init(void) { @@ -280,8 +639,9 @@ _PG_init(void) pg_init_libpagestore(); pg_init_walproposer(); - WalSender_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; + WalSender_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; LogicalFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; + SlotFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; InitLogicalReplicationMonitor(); @@ -289,21 +649,28 @@ _PG_init(void) pg_init_extension_server(); - DefineCustomBoolVariable( - "neon.primary_is_running", - "true if the primary was running at replica startup. false otherwise", - NULL, - &primary_is_running, - false, - PGC_POSTMASTER, - 0, - NULL, NULL, NULL); + restore_running_xacts_callback = RestoreRunningXactsFromClog; + + + DefineCustomEnumVariable( + "neon.running_xacts_overflow_policy", + "Action performed on snapshot overflow when restoring runnings xacts from CLOG", + NULL, + &running_xacts_overflow_policy, + OP_IGNORE, + running_xacts_overflow_policies, + PGC_POSTMASTER, + 0, + NULL, NULL, NULL); + /* * Important: This must happen after other parts of the extension are * loaded, otherwise any settings to GUCs that were set before the * extension was loaded will be removed. 
*/ EmitWarningsOnPlaceholders("neon"); + + ReportSearchPath(); } PG_FUNCTION_INFO_V1(pg_cluster_size); diff --git a/pgxn/neon/neon.control b/pgxn/neon/neon.control index cee2f336f2..03bdb9a0b4 100644 --- a/pgxn/neon/neon.control +++ b/pgxn/neon/neon.control @@ -1,6 +1,6 @@ # neon extension comment = 'cloud storage for PostgreSQL' -default_version = '1.3' +default_version = '1.4' module_pathname = '$libdir/neon' relocatable = true trusted = true diff --git a/pgxn/neon/neon_pgversioncompat.h b/pgxn/neon/neon_pgversioncompat.h index f19732cbbb..addb6ccce6 100644 --- a/pgxn/neon/neon_pgversioncompat.h +++ b/pgxn/neon/neon_pgversioncompat.h @@ -54,6 +54,10 @@ #define BufTagGetNRelFileInfo(tag) tag.rnode +#define BufTagGetRelNumber(tagp) ((tagp)->rnode.relNode) + +#define InvalidRelFileNumber InvalidOid + #define SMgrRelGetRelInfo(reln) \ (reln->smgr_rnode.node) diff --git a/pgxn/neon/neon_walreader.c b/pgxn/neon/neon_walreader.c index 60eb8e1fc9..b575712dbe 100644 --- a/pgxn/neon/neon_walreader.c +++ b/pgxn/neon/neon_walreader.c @@ -109,11 +109,12 @@ NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_ { NeonWALReader *reader; + /* + * Note: we allocate in TopMemoryContext, reusing the reader for all process + * reads. 
+ */ reader = (NeonWALReader *) - palloc_extended(sizeof(NeonWALReader), - MCXT_ALLOC_NO_OOM | MCXT_ALLOC_ZERO); - if (!reader) - return NULL; + MemoryContextAllocZero(TopMemoryContext, sizeof(NeonWALReader)); reader->available_lsn = available_lsn; reader->seg.ws_file = -1; @@ -219,7 +220,8 @@ NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size cou return NEON_WALREAD_ERROR; } /* we'll poll immediately */ - state->rem_state = RS_CONNECTING_READ; + state->rem_state = RS_CONNECTING_WRITE; + return NEON_WALREAD_WOULDBLOCK; } if (state->rem_state == RS_CONNECTING_READ || state->rem_state == RS_CONNECTING_WRITE) diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index dbc67a24f5..c53257923a 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -1447,7 +1447,7 @@ RecvAppendResponses(Safekeeper *sk) * core as this is kinda expected scenario. */ disable_core_dump(); - wp_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "", + wp_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us", sk->host, sk->port, sk->appendResponse.term, wp->propTerm); } diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index da1a6f76f0..f3ddc64061 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -63,6 +63,8 @@ char *wal_acceptors_list = ""; int wal_acceptor_reconnect_timeout = 1000; int wal_acceptor_connection_timeout = 10000; +/* Set to true in the walproposer bgw. 
*/ +static bool am_walproposer; static WalproposerShmemState *walprop_shared; static WalProposerConfig walprop_config; static XLogRecPtr sentPtr = InvalidXLogRecPtr; @@ -76,6 +78,7 @@ static HotStandbyFeedback agg_hs_feedback; static void nwp_shmem_startup_hook(void); static void nwp_register_gucs(void); +static void assign_neon_safekeepers(const char *newval, void *extra); static void nwp_prepare_shmem(void); static uint64 backpressure_lag_impl(void); static bool backpressure_throttling_impl(void); @@ -111,7 +114,8 @@ init_walprop_config(bool syncSafekeepers) { walprop_config.neon_tenant = neon_tenant; walprop_config.neon_timeline = neon_timeline; - walprop_config.safekeepers_list = wal_acceptors_list; + /* WalProposerCreate scribbles directly on it, so pstrdup */ + walprop_config.safekeepers_list = pstrdup(wal_acceptors_list); walprop_config.safekeeper_reconnect_timeout = wal_acceptor_reconnect_timeout; walprop_config.safekeeper_connection_timeout = wal_acceptor_connection_timeout; walprop_config.wal_segment_size = wal_segment_size; @@ -151,6 +155,7 @@ WalProposerMain(Datum main_arg) init_walprop_config(false); walprop_pg_init_bgworker(); + am_walproposer = true; walprop_pg_load_libpqwalreceiver(); wp = WalProposerCreate(&walprop_config, walprop_pg); @@ -189,10 +194,10 @@ nwp_register_gucs(void) NULL, /* long_desc */ &wal_acceptors_list, /* valueAddr */ "", /* bootValue */ - PGC_POSTMASTER, + PGC_SIGHUP, GUC_LIST_INPUT, /* extensions can't use* * GUC_LIST_QUOTE */ - NULL, NULL, NULL); + NULL, assign_neon_safekeepers, NULL); DefineCustomIntVariable( "neon.safekeeper_reconnect_timeout", @@ -215,6 +220,33 @@ nwp_register_gucs(void) NULL, NULL, NULL); } +/* + * GUC assign_hook for neon.safekeepers. Restarts walproposer through FATAL if + * the list changed. 
+ */ +static void +assign_neon_safekeepers(const char *newval, void *extra) +{ + if (!am_walproposer) + return; + + if (!newval) { + /* should never happen */ + wpg_log(FATAL, "neon.safekeepers is empty"); + } + + /* + * TODO: restarting through FATAL is stupid and introduces 1s delay before + * next bgw start. We should refactor walproposer to allow graceful exit and + * thus remove this delay. + */ + if (strcmp(wal_acceptors_list, newval) != 0) + { + wpg_log(FATAL, "restarting walproposer to change safekeeper list from %s to %s", + wal_acceptors_list, newval); + } +} + /* Check if we need to suspend inserts because of lagging replication. */ static uint64 backpressure_lag_impl(void) @@ -363,7 +395,7 @@ walprop_register_bgworker(void) snprintf(bgw.bgw_function_name, BGW_MAXLEN, "WalProposerMain"); snprintf(bgw.bgw_name, BGW_MAXLEN, "WAL proposer"); snprintf(bgw.bgw_type, BGW_MAXLEN, "WAL proposer"); - bgw.bgw_restart_time = 5; + bgw.bgw_restart_time = 1; bgw.bgw_notify_pid = 0; bgw.bgw_main_arg = (Datum) 0; @@ -480,7 +512,7 @@ replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRe } /* - * Start walsender streaming replication + * Start walproposer streaming replication */ static void walprop_pg_start_streaming(WalProposer *wp, XLogRecPtr startpos) @@ -1639,6 +1671,18 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32 late_cv_trigger = ConditionVariableCancelSleep(); #endif + /* + * Process config if requested. This restarts walproposer if safekeepers + * list changed. Don't do that for sync-safekeepers because quite probably + * it (re-reading config) won't work without some effort, and + * sync-safekeepers should be quick to finish anyway. + */ + if (!wp->config->syncSafekeepers && ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + } + /* * If wait is terminated by latch set (walsenders' latch is set on each * wal flush). 
(no need for pm death check due to WL_EXIT_ON_PM_DEATH) diff --git a/pgxn/neon/walsender_hooks.c b/pgxn/neon/walsender_hooks.c index 8f8d1dfc01..bd3856e9d9 100644 --- a/pgxn/neon/walsender_hooks.c +++ b/pgxn/neon/walsender_hooks.c @@ -20,6 +20,7 @@ #include "utils/guc.h" #include "postmaster/interrupt.h" +#include "neon.h" #include "neon_walreader.h" #include "walproposer.h" @@ -181,6 +182,13 @@ NeonWALReadSegmentClose(XLogReaderState *xlogreader) void NeonOnDemandXLogReaderRoutines(XLogReaderRoutine *xlr) { + /* + * If safekeepers are not configured, assume we don't need neon_walreader, + * i.e. running neon fork locally. + */ + if (wal_acceptors_list[0] == '\0') + return; + if (!wal_reader) { XLogRecPtr epochStartLsn = pg_atomic_read_u64(&GetWalpropShmemState()->propEpochStartLsn); diff --git a/pgxn/neon_rmgr/neon_rmgr.c b/pgxn/neon_rmgr/neon_rmgr.c index 496ca08c08..c3f726db84 100644 --- a/pgxn/neon_rmgr/neon_rmgr.c +++ b/pgxn/neon_rmgr/neon_rmgr.c @@ -186,7 +186,7 @@ static void fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2) { *infomask &= ~(HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | - HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK); + HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK | HEAP_COMBOCID); *infomask2 &= ~HEAP_KEYS_UPDATED; if (infobits & XLHL_XMAX_IS_MULTI) @@ -195,6 +195,8 @@ fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2) *infomask |= HEAP_XMAX_LOCK_ONLY; if (infobits & XLHL_XMAX_EXCL_LOCK) *infomask |= HEAP_XMAX_EXCL_LOCK; + if (infobits & XLHL_COMBOCID) + *infomask |= HEAP_COMBOCID; /* note HEAP_XMAX_SHR_LOCK isn't considered here */ if (infobits & XLHL_XMAX_KEYSHR_LOCK) *infomask |= HEAP_XMAX_KEYSHR_LOCK; @@ -284,7 +286,7 @@ redo_neon_heap_insert(XLogReaderState *record) htup->t_infomask = xlhdr.t_infomask; htup->t_hoff = xlhdr.t_hoff; HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); - HeapTupleHeaderSetCmin(htup, xlhdr.t_cid); + htup->t_choice.t_heap.t_field3.t_cid = xlhdr.t_cid; 
htup->t_ctid = target_tid; if (PageAddItem(page, (Item) htup, newlen, xlrec->offnum, @@ -373,7 +375,7 @@ redo_neon_heap_delete(XLogReaderState *record) HeapTupleHeaderSetXmax(htup, xlrec->xmax); else HeapTupleHeaderSetXmin(htup, InvalidTransactionId); - HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false); + htup->t_choice.t_heap.t_field3.t_cid = xlrec->t_cid; /* Mark the page as a candidate for pruning */ PageSetPrunable(page, XLogRecGetXid(record)); @@ -490,7 +492,7 @@ redo_neon_heap_update(XLogReaderState *record, bool hot_update) fix_infomask_from_infobits(xlrec->old_infobits_set, &htup->t_infomask, &htup->t_infomask2); HeapTupleHeaderSetXmax(htup, xlrec->old_xmax); - HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false); + htup->t_choice.t_heap.t_field3.t_cid = xlrec->t_cid; /* Set forward chain link in t_ctid */ htup->t_ctid = newtid; @@ -623,7 +625,7 @@ redo_neon_heap_update(XLogReaderState *record, bool hot_update) htup->t_hoff = xlhdr.t_hoff; HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); - HeapTupleHeaderSetCmin(htup, xlhdr.t_cid); + htup->t_choice.t_heap.t_field3.t_cid = xlhdr.t_cid; HeapTupleHeaderSetXmax(htup, xlrec->new_xmax); /* Make sure there is no forward chain link in t_ctid */ htup->t_ctid = newtid; @@ -728,7 +730,7 @@ redo_neon_heap_lock(XLogReaderState *record) offnum); } HeapTupleHeaderSetXmax(htup, xlrec->xmax); - HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false); + htup->t_choice.t_heap.t_field3.t_cid = xlrec->t_cid; PageSetLSN(page, lsn); MarkBufferDirty(buffer); } @@ -840,7 +842,7 @@ redo_neon_heap_multi_insert(XLogReaderState *record) htup->t_infomask = xlhdr->t_infomask; htup->t_hoff = xlhdr->t_hoff; HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); - HeapTupleHeaderSetCmin(htup, xlrec->t_cid); + htup->t_choice.t_heap.t_field3.t_cid = xlrec->t_cid; ItemPointerSetBlockNumber(&htup->t_ctid, blkno); ItemPointerSetOffsetNumber(&htup->t_ctid, offnum); diff --git a/pgxn/neon_test_utils/Makefile b/pgxn/neon_test_utils/Makefile index 
1ee87357e5..252810b5b0 100644 --- a/pgxn/neon_test_utils/Makefile +++ b/pgxn/neon_test_utils/Makefile @@ -7,7 +7,7 @@ OBJS = \ neontest.o EXTENSION = neon_test_utils -DATA = neon_test_utils--1.1.sql +DATA = neon_test_utils--1.3.sql PGFILEDESC = "neon_test_utils - helpers for neon testing and debugging" PG_CONFIG = pg_config diff --git a/pgxn/neon_test_utils/neon_test_utils--1.1.sql b/pgxn/neon_test_utils/neon_test_utils--1.3.sql similarity index 69% rename from pgxn/neon_test_utils/neon_test_utils--1.1.sql rename to pgxn/neon_test_utils/neon_test_utils--1.3.sql index 534784f319..9a9b41c3a3 100644 --- a/pgxn/neon_test_utils/neon_test_utils--1.1.sql +++ b/pgxn/neon_test_utils/neon_test_utils--1.3.sql @@ -7,6 +7,12 @@ AS 'MODULE_PATHNAME', 'test_consume_xids' LANGUAGE C STRICT PARALLEL UNSAFE; +CREATE FUNCTION test_consume_oids(oid int) +RETURNS VOID +AS 'MODULE_PATHNAME', 'test_consume_oids' +LANGUAGE C STRICT +PARALLEL UNSAFE; + CREATE FUNCTION test_consume_cpu(seconds int) RETURNS VOID AS 'MODULE_PATHNAME', 'test_consume_cpu' @@ -41,7 +47,25 @@ RETURNS bytea AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn_ex' LANGUAGE C PARALLEL UNSAFE; -CREATE FUNCTION neon_xlogflush(lsn pg_lsn) +CREATE FUNCTION neon_xlogflush(lsn pg_lsn DEFAULT NULL) RETURNS VOID AS 'MODULE_PATHNAME', 'neon_xlogflush' LANGUAGE C PARALLEL UNSAFE; + +CREATE FUNCTION trigger_panic() +RETURNS VOID +AS 'MODULE_PATHNAME', 'trigger_panic' +LANGUAGE C PARALLEL UNSAFE; + +CREATE FUNCTION trigger_segfault() +RETURNS VOID +AS 'MODULE_PATHNAME', 'trigger_segfault' +LANGUAGE C PARALLEL UNSAFE; + +-- Alias for `trigger_segfault`, just because `SELECT 💣()` looks fun +CREATE OR REPLACE FUNCTION 💣() RETURNS void +LANGUAGE plpgsql AS $$ +BEGIN + PERFORM trigger_segfault(); +END; +$$; diff --git a/pgxn/neon_test_utils/neon_test_utils.control b/pgxn/neon_test_utils/neon_test_utils.control index 5f6d640835..f22afd70c4 100644 --- a/pgxn/neon_test_utils/neon_test_utils.control +++ 
b/pgxn/neon_test_utils/neon_test_utils.control @@ -1,6 +1,6 @@ # neon_test_utils extension comment = 'helpers for neon testing and debugging' -default_version = '1.1' +default_version = '1.3' module_pathname = '$libdir/neon_test_utils' relocatable = true trusted = true diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index 47f245fbf1..0b5499ca53 100644 --- a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -15,6 +15,7 @@ #include "access/relation.h" #include "access/xact.h" #include "access/xlog.h" +#include "access/xlog_internal.h" #include "catalog/namespace.h" #include "fmgr.h" #include "funcapi.h" @@ -34,6 +35,7 @@ PG_MODULE_MAGIC; extern void _PG_init(void); PG_FUNCTION_INFO_V1(test_consume_xids); +PG_FUNCTION_INFO_V1(test_consume_oids); PG_FUNCTION_INFO_V1(test_consume_cpu); PG_FUNCTION_INFO_V1(test_consume_memory); PG_FUNCTION_INFO_V1(test_release_memory); @@ -41,6 +43,8 @@ PG_FUNCTION_INFO_V1(clear_buffer_cache); PG_FUNCTION_INFO_V1(get_raw_page_at_lsn); PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex); PG_FUNCTION_INFO_V1(neon_xlogflush); +PG_FUNCTION_INFO_V1(trigger_panic); +PG_FUNCTION_INFO_V1(trigger_segfault); /* * Linkage to functions in neon module. @@ -71,6 +75,21 @@ _PG_init(void) #define neon_read_at_lsn neon_read_at_lsn_ptr +/* + * test_consume_oids(int4), for rapidly consuming OIDs, to test wraparound. + * Unlike test_consume_xids which is passed number of xids to be consumed, + * this function is given the target Oid. + */ +Datum +test_consume_oids(PG_FUNCTION_ARGS) +{ + int32 oid = PG_GETARG_INT32(0); + + while (oid != GetNewObjectId()); + + PG_RETURN_VOID(); +} + /* * test_consume_xids(int4), for rapidly consuming XIDs, to test wraparound. */ @@ -444,12 +463,68 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) /* * Directly calls XLogFlush(lsn) to flush WAL buffers. + * + * If 'lsn' is not specified (is NULL), flush all generated WAL. 
*/ Datum neon_xlogflush(PG_FUNCTION_ARGS) { - XLogRecPtr lsn = PG_GETARG_LSN(0); + XLogRecPtr lsn; + + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("cannot flush WAL during recovery."))); + + if (!PG_ARGISNULL(0)) + lsn = PG_GETARG_LSN(0); + else + { + lsn = GetXLogInsertRecPtr(); + + /*--- + * The LSN returned by GetXLogInsertRecPtr() is the position where the + * next inserted record would begin. If the last record ended just at + * the page boundary, the next record will begin after the page header + * on the next page, but the next page's page header has not been + * written yet. If we tried to flush it, XLogFlush() would throw an + * error: + * + * ERROR : xlog flush request %X/%X is not satisfied --- flushed only to %X/%X + * + * To avoid that, if the insert position points to just after the page + * header, back off to page boundary. + */ + if (lsn % XLOG_BLCKSZ == SizeOfXLogShortPHD && + XLogSegmentOffset(lsn, wal_segment_size) > XLOG_BLCKSZ) + lsn -= SizeOfXLogShortPHD; + else if (lsn % XLOG_BLCKSZ == SizeOfXLogLongPHD && + XLogSegmentOffset(lsn, wal_segment_size) < XLOG_BLCKSZ) + lsn -= SizeOfXLogLongPHD; + } XLogFlush(lsn); PG_RETURN_VOID(); } + +/* + * Function to trigger panic. + */ +Datum +trigger_panic(PG_FUNCTION_ARGS) +{ + elog(PANIC, "neon_test_utils: panic"); + PG_RETURN_VOID(); +} + +/* + * Function to trigger a segfault. 
+ */ +Datum +trigger_segfault(PG_FUNCTION_ARGS) +{ + int *ptr = NULL; + *ptr = 42; + PG_RETURN_VOID(); +} diff --git a/pgxn/neon_walredo/walredoproc.c b/pgxn/neon_walredo/walredoproc.c index c4ab22636b..cc545393f5 100644 --- a/pgxn/neon_walredo/walredoproc.c +++ b/pgxn/neon_walredo/walredoproc.c @@ -168,16 +168,15 @@ close_range_syscall(unsigned int start_fd, unsigned int count, unsigned int flag static void enter_seccomp_mode(void) { - /* * The pageserver process relies on us to close all the file descriptors * it potentially leaked to us, _before_ we start processing potentially dangerous * wal records. See the comment in the Rust code that launches this process. */ - int err; - if (err = close_range_syscall(3, ~0U, 0)) { - ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), errmsg("seccomp: could not close files >= fd 3"))); - } + if (close_range_syscall(3, ~0U, 0) != 0) + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: could not close files >= fd 3"))); PgSeccompRule syscalls[] = { diff --git a/poetry.lock b/poetry.lock index 7740388fb8..7db91e51f7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,91 +1,103 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. 
+ +[[package]] +name = "aiohappyeyeballs" +version = "2.3.5" +description = "Happy Eyeballs for asyncio" +optional = false +python-versions = ">=3.8" +files = [ + {file = "aiohappyeyeballs-2.3.5-py3-none-any.whl", hash = "sha256:4d6dea59215537dbc746e93e779caea8178c866856a721c9c660d7a5a7b8be03"}, + {file = "aiohappyeyeballs-2.3.5.tar.gz", hash = "sha256:6fa48b9f1317254f122a07a131a86b71ca6946ca989ce6326fff54a99a920105"}, +] [[package]] name = "aiohttp" -version = "3.9.4" +version = "3.10.2" description = "Async http client/server framework (asyncio)" optional = false python-versions = ">=3.8" files = [ - {file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:76d32588ef7e4a3f3adff1956a0ba96faabbdee58f2407c122dd45aa6e34f372"}, - {file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:56181093c10dbc6ceb8a29dfeea1e815e1dfdc020169203d87fd8d37616f73f9"}, - {file = "aiohttp-3.9.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7a5b676d3c65e88b3aca41816bf72831898fcd73f0cbb2680e9d88e819d1e4d"}, - {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1df528a85fb404899d4207a8d9934cfd6be626e30e5d3a5544a83dbae6d8a7e"}, - {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f595db1bceabd71c82e92df212dd9525a8a2c6947d39e3c994c4f27d2fe15b11"}, - {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c0b09d76e5a4caac3d27752027fbd43dc987b95f3748fad2b924a03fe8632ad"}, - {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:689eb4356649ec9535b3686200b231876fb4cab4aca54e3bece71d37f50c1d13"}, - {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a3666cf4182efdb44d73602379a66f5fdfd5da0db5e4520f0ac0dcca644a3497"}, - {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = 
"sha256:b65b0f8747b013570eea2f75726046fa54fa8e0c5db60f3b98dd5d161052004a"}, - {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a1885d2470955f70dfdd33a02e1749613c5a9c5ab855f6db38e0b9389453dce7"}, - {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0593822dcdb9483d41f12041ff7c90d4d1033ec0e880bcfaf102919b715f47f1"}, - {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:47f6eb74e1ecb5e19a78f4a4228aa24df7fbab3b62d4a625d3f41194a08bd54f"}, - {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c8b04a3dbd54de6ccb7604242fe3ad67f2f3ca558f2d33fe19d4b08d90701a89"}, - {file = "aiohttp-3.9.4-cp310-cp310-win32.whl", hash = "sha256:8a78dfb198a328bfb38e4308ca8167028920fb747ddcf086ce706fbdd23b2926"}, - {file = "aiohttp-3.9.4-cp310-cp310-win_amd64.whl", hash = "sha256:e78da6b55275987cbc89141a1d8e75f5070e577c482dd48bd9123a76a96f0bbb"}, - {file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c111b3c69060d2bafc446917534150fd049e7aedd6cbf21ba526a5a97b4402a5"}, - {file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:efbdd51872cf170093998c87ccdf3cb5993add3559341a8e5708bcb311934c94"}, - {file = "aiohttp-3.9.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7bfdb41dc6e85d8535b00d73947548a748e9534e8e4fddd2638109ff3fb081df"}, - {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2bd9d334412961125e9f68d5b73c1d0ab9ea3f74a58a475e6b119f5293eee7ba"}, - {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:35d78076736f4a668d57ade00c65d30a8ce28719d8a42471b2a06ccd1a2e3063"}, - {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:824dff4f9f4d0f59d0fa3577932ee9a20e09edec8a2f813e1d6b9f89ced8293f"}, - {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:52b8b4e06fc15519019e128abedaeb56412b106ab88b3c452188ca47a25c4093"}, - {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eae569fb1e7559d4f3919965617bb39f9e753967fae55ce13454bec2d1c54f09"}, - {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:69b97aa5792428f321f72aeb2f118e56893371f27e0b7d05750bcad06fc42ca1"}, - {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4d79aad0ad4b980663316f26d9a492e8fab2af77c69c0f33780a56843ad2f89e"}, - {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:d6577140cd7db19e430661e4b2653680194ea8c22c994bc65b7a19d8ec834403"}, - {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:9860d455847cd98eb67897f5957b7cd69fbcb436dd3f06099230f16a66e66f79"}, - {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:69ff36d3f8f5652994e08bd22f093e11cfd0444cea310f92e01b45a4e46b624e"}, - {file = "aiohttp-3.9.4-cp311-cp311-win32.whl", hash = "sha256:e27d3b5ed2c2013bce66ad67ee57cbf614288bda8cdf426c8d8fe548316f1b5f"}, - {file = "aiohttp-3.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:d6a67e26daa686a6fbdb600a9af8619c80a332556245fa8e86c747d226ab1a1e"}, - {file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:c5ff8ff44825736a4065d8544b43b43ee4c6dd1530f3a08e6c0578a813b0aa35"}, - {file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d12a244627eba4e9dc52cbf924edef905ddd6cafc6513849b4876076a6f38b0e"}, - {file = "aiohttp-3.9.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dcad56c8d8348e7e468899d2fb3b309b9bc59d94e6db08710555f7436156097f"}, - {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f7e69a7fd4b5ce419238388e55abd220336bd32212c673ceabc57ccf3d05b55"}, - {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:c4870cb049f10d7680c239b55428916d84158798eb8f353e74fa2c98980dcc0b"}, - {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b2feaf1b7031ede1bc0880cec4b0776fd347259a723d625357bb4b82f62687b"}, - {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:939393e8c3f0a5bcd33ef7ace67680c318dc2ae406f15e381c0054dd658397de"}, - {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d2334e387b2adcc944680bebcf412743f2caf4eeebd550f67249c1c3696be04"}, - {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e0198ea897680e480845ec0ffc5a14e8b694e25b3f104f63676d55bf76a82f1a"}, - {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e40d2cd22914d67c84824045861a5bb0fb46586b15dfe4f046c7495bf08306b2"}, - {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:aba80e77c227f4234aa34a5ff2b6ff30c5d6a827a91d22ff6b999de9175d71bd"}, - {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:fb68dc73bc8ac322d2e392a59a9e396c4f35cb6fdbdd749e139d1d6c985f2527"}, - {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:f3460a92638dce7e47062cf088d6e7663adb135e936cb117be88d5e6c48c9d53"}, - {file = "aiohttp-3.9.4-cp312-cp312-win32.whl", hash = "sha256:32dc814ddbb254f6170bca198fe307920f6c1308a5492f049f7f63554b88ef36"}, - {file = "aiohttp-3.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:63f41a909d182d2b78fe3abef557fcc14da50c7852f70ae3be60e83ff64edba5"}, - {file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:c3770365675f6be220032f6609a8fbad994d6dcf3ef7dbcf295c7ee70884c9af"}, - {file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:305edae1dea368ce09bcb858cf5a63a064f3bff4767dec6fa60a0cc0e805a1d3"}, - {file = "aiohttp-3.9.4-cp38-cp38-macosx_11_0_arm64.whl", hash = 
"sha256:6f121900131d116e4a93b55ab0d12ad72573f967b100e49086e496a9b24523ea"}, - {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b71e614c1ae35c3d62a293b19eface83d5e4d194e3eb2fabb10059d33e6e8cbf"}, - {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:419f009fa4cfde4d16a7fc070d64f36d70a8d35a90d71aa27670bba2be4fd039"}, - {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7b39476ee69cfe64061fd77a73bf692c40021f8547cda617a3466530ef63f947"}, - {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b33f34c9c7decdb2ab99c74be6443942b730b56d9c5ee48fb7df2c86492f293c"}, - {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c78700130ce2dcebb1a8103202ae795be2fa8c9351d0dd22338fe3dac74847d9"}, - {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:268ba22d917655d1259af2d5659072b7dc11b4e1dc2cb9662fdd867d75afc6a4"}, - {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:17e7c051f53a0d2ebf33013a9cbf020bb4e098c4bc5bce6f7b0c962108d97eab"}, - {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:7be99f4abb008cb38e144f85f515598f4c2c8932bf11b65add0ff59c9c876d99"}, - {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:d58a54d6ff08d2547656356eea8572b224e6f9bbc0cf55fa9966bcaac4ddfb10"}, - {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7673a76772bda15d0d10d1aa881b7911d0580c980dbd16e59d7ba1422b2d83cd"}, - {file = "aiohttp-3.9.4-cp38-cp38-win32.whl", hash = "sha256:e4370dda04dc8951012f30e1ce7956a0a226ac0714a7b6c389fb2f43f22a250e"}, - {file = "aiohttp-3.9.4-cp38-cp38-win_amd64.whl", hash = "sha256:eb30c4510a691bb87081192a394fb661860e75ca3896c01c6d186febe7c88530"}, - {file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_universal2.whl", 
hash = "sha256:84e90494db7df3be5e056f91412f9fa9e611fbe8ce4aaef70647297f5943b276"}, - {file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7d4845f8501ab28ebfdbeab980a50a273b415cf69e96e4e674d43d86a464df9d"}, - {file = "aiohttp-3.9.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:69046cd9a2a17245c4ce3c1f1a4ff8c70c7701ef222fce3d1d8435f09042bba1"}, - {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b73a06bafc8dcc508420db43b4dd5850e41e69de99009d0351c4f3007960019"}, - {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:418bb0038dfafeac923823c2e63226179976c76f981a2aaad0ad5d51f2229bca"}, - {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:71a8f241456b6c2668374d5d28398f8e8cdae4cce568aaea54e0f39359cd928d"}, - {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:935c369bf8acc2dc26f6eeb5222768aa7c62917c3554f7215f2ead7386b33748"}, - {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74e4e48c8752d14ecfb36d2ebb3d76d614320570e14de0a3aa7a726ff150a03c"}, - {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:916b0417aeddf2c8c61291238ce25286f391a6acb6f28005dd9ce282bd6311b6"}, - {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9b6787b6d0b3518b2ee4cbeadd24a507756ee703adbac1ab6dc7c4434b8c572a"}, - {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:221204dbda5ef350e8db6287937621cf75e85778b296c9c52260b522231940ed"}, - {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:10afd99b8251022ddf81eaed1d90f5a988e349ee7d779eb429fb07b670751e8c"}, - {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2506d9f7a9b91033201be9ffe7d89c6a54150b0578803cce5cb84a943d075bc3"}, - {file = 
"aiohttp-3.9.4-cp39-cp39-win32.whl", hash = "sha256:e571fdd9efd65e86c6af2f332e0e95dad259bfe6beb5d15b3c3eca3a6eb5d87b"}, - {file = "aiohttp-3.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:7d29dd5319d20aa3b7749719ac9685fbd926f71ac8c77b2477272725f882072d"}, - {file = "aiohttp-3.9.4.tar.gz", hash = "sha256:6ff71ede6d9a5a58cfb7b6fffc83ab5d4a63138276c771ac91ceaaddf5459644"}, + {file = "aiohttp-3.10.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:95213b3d79c7e387144e9cb7b9d2809092d6ff2c044cb59033aedc612f38fb6d"}, + {file = "aiohttp-3.10.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1aa005f060aff7124cfadaa2493f00a4e28ed41b232add5869e129a2e395935a"}, + {file = "aiohttp-3.10.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:eabe6bf4c199687592f5de4ccd383945f485779c7ffb62a9b9f1f8a3f9756df8"}, + {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96e010736fc16d21125c7e2dc5c350cd43c528b85085c04bf73a77be328fe944"}, + {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:99f81f9c1529fd8e03be4a7bd7df32d14b4f856e90ef6e9cbad3415dbfa9166c"}, + {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d611d1a01c25277bcdea06879afbc11472e33ce842322496b211319aa95441bb"}, + {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e00191d38156e09e8c81ef3d75c0d70d4f209b8381e71622165f22ef7da6f101"}, + {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74c091a5ded6cb81785de2d7a8ab703731f26de910dbe0f3934eabef4ae417cc"}, + {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:18186a80ec5a701816adbf1d779926e1069392cf18504528d6e52e14b5920525"}, + {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:5a7ceb2a0d2280f23a02c64cd0afdc922079bb950400c3dd13a1ab2988428aac"}, + {file = 
"aiohttp-3.10.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:8bd7be6ff6c162a60cb8fce65ee879a684fbb63d5466aba3fa5b9288eb04aefa"}, + {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:fae962b62944eaebff4f4fddcf1a69de919e7b967136a318533d82d93c3c6bd1"}, + {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:a0fde16d284efcacbe15fb0c1013f0967b6c3e379649239d783868230bf1db42"}, + {file = "aiohttp-3.10.2-cp310-cp310-win32.whl", hash = "sha256:f81cd85a0e76ec7b8e2b6636fe02952d35befda4196b8c88f3cec5b4fb512839"}, + {file = "aiohttp-3.10.2-cp310-cp310-win_amd64.whl", hash = "sha256:54ba10eb5a3481c28282eb6afb5f709aedf53cf9c3a31875ffbdc9fc719ffd67"}, + {file = "aiohttp-3.10.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:87fab7f948e407444c2f57088286e00e2ed0003ceaf3d8f8cc0f60544ba61d91"}, + {file = "aiohttp-3.10.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ec6ad66ed660d46503243cbec7b2b3d8ddfa020f984209b3b8ef7d98ce69c3f2"}, + {file = "aiohttp-3.10.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a4be88807283bd96ae7b8e401abde4ca0bab597ba73b5e9a2d98f36d451e9aac"}, + {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:01c98041f90927c2cbd72c22a164bb816fa3010a047d264969cf82e1d4bcf8d1"}, + {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54e36c67e1a9273ecafab18d6693da0fb5ac48fd48417e4548ac24a918c20998"}, + {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7de3ddb6f424af54535424082a1b5d1ae8caf8256ebd445be68c31c662354720"}, + {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7dd9c7db94b4692b827ce51dcee597d61a0e4f4661162424faf65106775b40e7"}, + {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:e57e21e1167705f8482ca29cc5d02702208d8bf4aff58f766d94bcd6ead838cd"}, + {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:a1a50e59b720060c29e2951fd9f13c01e1ea9492e5a527b92cfe04dd64453c16"}, + {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:686c87782481fda5ee6ba572d912a5c26d9f98cc5c243ebd03f95222af3f1b0f"}, + {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:dafb4abb257c0ed56dc36f4e928a7341b34b1379bd87e5a15ce5d883c2c90574"}, + {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:494a6f77560e02bd7d1ab579fdf8192390567fc96a603f21370f6e63690b7f3d"}, + {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6fe8503b1b917508cc68bf44dae28823ac05e9f091021e0c41f806ebbb23f92f"}, + {file = "aiohttp-3.10.2-cp311-cp311-win32.whl", hash = "sha256:4ddb43d06ce786221c0dfd3c91b4892c318eaa36b903f7c4278e7e2fa0dd5102"}, + {file = "aiohttp-3.10.2-cp311-cp311-win_amd64.whl", hash = "sha256:ca2f5abcb0a9a47e56bac173c01e9f6c6e7f27534d91451c5f22e6a35a5a2093"}, + {file = "aiohttp-3.10.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:14eb6b17f6246959fb0b035d4f4ae52caa870c4edfb6170aad14c0de5bfbf478"}, + {file = "aiohttp-3.10.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:465e445ec348d4e4bd349edd8b22db75f025da9d7b6dc1369c48e7935b85581e"}, + {file = "aiohttp-3.10.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:341f8ece0276a828d95b70cd265d20e257f5132b46bf77d759d7f4e0443f2906"}, + {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c01fbb87b5426381cd9418b3ddcf4fc107e296fa2d3446c18ce6c76642f340a3"}, + {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2c474af073e1a6763e1c5522bbb2d85ff8318197e4c6c919b8d7886e16213345"}, + {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:d9076810a5621236e29b2204e67a68e1fe317c8727ee4c9abbfbb1083b442c38"}, + {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8f515d6859e673940e08de3922b9c4a2249653b0ac181169313bd6e4b1978ac"}, + {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:655e583afc639bef06f3b2446972c1726007a21003cd0ef57116a123e44601bc"}, + {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8da9449a575133828cc99985536552ea2dcd690e848f9d41b48d8853a149a959"}, + {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:19073d57d0feb1865d12361e2a1f5a49cb764bf81a4024a3b608ab521568093a"}, + {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c8e98e1845805f184d91fda6f9ab93d7c7b0dddf1c07e0255924bfdb151a8d05"}, + {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:377220a5efde6f9497c5b74649b8c261d3cce8a84cb661be2ed8099a2196400a"}, + {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:92f7f4a4dc9cdb5980973a74d43cdbb16286dacf8d1896b6c3023b8ba8436f8e"}, + {file = "aiohttp-3.10.2-cp312-cp312-win32.whl", hash = "sha256:9bb2834a6f11d65374ce97d366d6311a9155ef92c4f0cee543b2155d06dc921f"}, + {file = "aiohttp-3.10.2-cp312-cp312-win_amd64.whl", hash = "sha256:518dc3cb37365255708283d1c1c54485bbacccd84f0a0fb87ed8917ba45eda5b"}, + {file = "aiohttp-3.10.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:7f98e70bbbf693086efe4b86d381efad8edac040b8ad02821453083d15ec315f"}, + {file = "aiohttp-3.10.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9f6f0b252a009e98fe84028a4ec48396a948e7a65b8be06ccfc6ef68cf1f614d"}, + {file = "aiohttp-3.10.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9360e3ffc7b23565600e729e8c639c3c50d5520e05fdf94aa2bd859eef12c407"}, + {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:3988044d1635c7821dd44f0edfbe47e9875427464e59d548aece447f8c22800a"}, + {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:30a9d59da1543a6f1478c3436fd49ec59be3868bca561a33778b4391005e499d"}, + {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f9f49bdb94809ac56e09a310a62f33e5f22973d6fd351aac72a39cd551e98194"}, + {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddfd2dca3f11c365d6857a07e7d12985afc59798458a2fdb2ffa4a0332a3fd43"}, + {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:685c1508ec97b2cd3e120bfe309a4ff8e852e8a7460f1ef1de00c2c0ed01e33c"}, + {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:49904f38667c44c041a0b44c474b3ae36948d16a0398a8f8cd84e2bb3c42a069"}, + {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:352f3a4e5f11f3241a49b6a48bc5b935fabc35d1165fa0d87f3ca99c1fcca98b"}, + {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:fc61f39b534c5d5903490478a0dd349df397d2284a939aa3cbaa2fb7a19b8397"}, + {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:ad2274e707be37420d0b6c3d26a8115295fe9d8e6e530fa6a42487a8ca3ad052"}, + {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:c836bf3c7512100219fe1123743fd8dd9a2b50dd7cfb0c3bb10d041309acab4b"}, + {file = "aiohttp-3.10.2-cp38-cp38-win32.whl", hash = "sha256:53e8898adda402be03ff164b0878abe2d884e3ea03a4701e6ad55399d84b92dc"}, + {file = "aiohttp-3.10.2-cp38-cp38-win_amd64.whl", hash = "sha256:7cc8f65f5b22304693de05a245b6736b14cb5bc9c8a03da6e2ae9ef15f8b458f"}, + {file = "aiohttp-3.10.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:9dfc906d656e14004c5bc672399c1cccc10db38df2b62a13fb2b6e165a81c316"}, + {file = "aiohttp-3.10.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:91b10208b222ddf655c3a3d5b727879d7163db12b634492df41a9182a76edaae"}, + {file = "aiohttp-3.10.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9fd16b5e1a7bdd14668cd6bde60a2a29b49147a535c74f50d8177d11b38433a7"}, + {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2bfdda4971bd79201f59adbad24ec2728875237e1c83bba5221284dbbf57bda"}, + {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:69d73f869cf29e8a373127fc378014e2b17bcfbe8d89134bc6fb06a2f67f3cb3"}, + {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:df59f8486507c421c0620a2c3dce81fbf1d54018dc20ff4fecdb2c106d6e6abc"}, + {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0df930015db36b460aa9badbf35eccbc383f00d52d4b6f3de2ccb57d064a6ade"}, + {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:562b1153ab7f766ee6b8b357ec777a302770ad017cf18505d34f1c088fccc448"}, + {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:d984db6d855de58e0fde1ef908d48fe9a634cadb3cf715962722b4da1c40619d"}, + {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:14dc3fcb0d877911d775d511eb617a486a8c48afca0a887276e63db04d3ee920"}, + {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:b52a27a5c97275e254704e1049f4b96a81e67d6205f52fa37a4777d55b0e98ef"}, + {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:cd33d9de8cfd006a0d0fe85f49b4183c57e91d18ffb7e9004ce855e81928f704"}, + {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1238fc979160bc03a92fff9ad021375ff1c8799c6aacb0d8ea1b357ea40932bb"}, + {file = "aiohttp-3.10.2-cp39-cp39-win32.whl", hash = "sha256:e2f43d238eae4f0b04f58d4c0df4615697d4ca3e9f9b1963d49555a94f0f5a04"}, + {file = 
"aiohttp-3.10.2-cp39-cp39-win_amd64.whl", hash = "sha256:947847f07a8f81d7b39b2d0202fd73e61962ebe17ac2d8566f260679e467da7b"}, + {file = "aiohttp-3.10.2.tar.gz", hash = "sha256:4d1f694b5d6e459352e5e925a42e05bac66655bfde44d81c59992463d2897014"}, ] [package.dependencies] +aiohappyeyeballs = ">=2.3.0" aiosignal = ">=1.1.2" async-timeout = {version = ">=4.0,<5.0", markers = "python_version < \"3.11\""} attrs = ">=17.3.0" @@ -94,7 +106,7 @@ multidict = ">=4.5,<7.0" yarl = ">=1.0,<2.0" [package.extras] -speedups = ["Brotli", "aiodns", "brotlicffi"] +speedups = ["Brotli", "aiodns (>=3.2.0)", "brotlicffi"] [[package]] name = "aiopg" @@ -734,13 +746,13 @@ typing-extensions = ">=4.1.0" [[package]] name = "certifi" -version = "2023.7.22" +version = "2024.7.4" description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.6" files = [ - {file = "certifi-2023.7.22-py3-none-any.whl", hash = "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9"}, - {file = "certifi-2023.7.22.tar.gz", hash = "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082"}, + {file = "certifi-2024.7.4-py3-none-any.whl", hash = "sha256:c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90"}, + {file = "certifi-2024.7.4.tar.gz", hash = "sha256:5a1e7645bc0ec61a09e26c36f6106dd4cf40c6db3a1fb6352b0244e7fb057c7b"}, ] [[package]] @@ -870,6 +882,96 @@ files = [ [package.dependencies] colorama = {version = "*", markers = "platform_system == \"Windows\""} +[[package]] +name = "clickhouse-connect" +version = "0.7.17" +description = "ClickHouse Database Core Driver for Python, Pandas, and Superset" +optional = false +python-versions = "~=3.8" +files = [ + {file = "clickhouse-connect-0.7.17.tar.gz", hash = "sha256:854f1f9f3e024e7f89ae5d57cd3289d7a4c3dc91a9f24c4d233014f0ea19cb2d"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-macosx_10_9_x86_64.whl", hash = 
"sha256:aca36f5f28be1ada2981fce87724bbf451f267c918015baec59e527de3c9c882"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:66209e4634f457604c263bea176336079d26c284e251e68a8435b0b80c1a25ff"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e4d86c5a561a2a99321c8b4af22257461b8e67142f34cfea6e70f39b45b1f406"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d200c9afa2725a96f9f3718221f641276b80c11bf504d8a2fbaafb5a05b2f0d3"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:004d867b1005445a46e6742db1054bf2a717a451372663b46e09b5e9e90a31e3"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4ef94a4a8e008882259151833c3c47cfbb9c8f08de0f100aaf3b95c366dcfb24"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ee732c3df50c8b07d16b5836ff85e6b84569922455c03837c3add5cf1388fe1f"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d9dbe1235465bb946e24b90b0ca5b8800b5d645acb2d7d6ee819448c3e2fd959"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-win32.whl", hash = "sha256:e5db0d68dfb63db0297d44dc91406bcfd7d333708d7cd55086c8550fbf870b78"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-win_amd64.whl", hash = "sha256:800750f568c097ea312887785025006d6098bffd8ed2dd6a57048fb3ced6d778"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4eb390623b3d15dc9cda78f5c68f83ef9ad11743797e70af8fabc384b015a73c"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:35f172ca950f218f63072024c81d5b4ff6e5399620c255506c321ccc7b17c9a5"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:ae7918f060f7576fc931c692e0122b1b07576fabd81444af22e1f8582300d200"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff2881b93c7a1afb9c99fb59ad5fd666850421325d0931e2b77f3f4ba872303d"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a4d9b4f97271addf66aadbaf7f154f19a0ad6c22026d575a995c55ebd8576db"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e431469b1ff2d5c3e4c406d55c6afdf7102f5d2524c2ceb5481b94ac24412aa3"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:2b6f80115176559f181a6b3ecad11aa3d70ef6014c3d2905b90fcef3f27d25c2"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d8ac694f40dfafc8a3cc877116b4bc73e8877ebf66d4d96ee092484ee4c0b481"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-win32.whl", hash = "sha256:78b7a3f6b0fad4eaf8afb5f9a2e855bde53e82ea5804960e9cf779538f4606a1"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-win_amd64.whl", hash = "sha256:efd390cc045334ecc3f2a9c18cc07c041d0288b145967805fdcab65abeefa75f"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9228334a17dc0a7842222f54ba5b89fc563532424aad4f66be799df70ab37e9f"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e432a42bb788bda77e88eda2774392a60fbbb5ee2a79cb2881d182d26c45fe49"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c85152ed2879965ee1fa2bd5e31fb27d281fd5f50d6e86a401efd95cd85b29ef"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:29a126104aa5e11df570cbd89fca4988784084602ba77d17b2396b334c54fd75"}, + {file = 
"clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:882d8f9570549258e6eb6a97915fbf64ed29fe395d5e360866ea8d42c8283a35"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:06ebf99111171442f462fb8b357364c3e276da3e8f8557b2e8fee9eb55ab37d1"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e0cf6f99b2777b0d164bf8b65ec39104cdc0789a56bcb52d98289bbd6f5cc70e"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ee46c508fddfff3b7ac52326788e0c6dd8dfb416b6d7e02e5d30e8110749dac2"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-win32.whl", hash = "sha256:eb708b590a37d56b069a6088254ffa55d73b8cb65527339df81ef03fe67ffdec"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-win_amd64.whl", hash = "sha256:17f00dccddaeaf43733faa1fa21f7d24641454a73669fda862545ba7c88627f5"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ab5d4b37a6dcc39e94c63beac0f22d9dda914f5eb865d166c64cf04dfadb7d16"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:32aa90387f45f34cbc5a984789ed4c12760a3c0056c190ab0123ceafc36b1002"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21277b6bdd6c8ff14170bfcd52125c5c39f442ec4bafbb643ad7d0ca915f0029"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca68d8b7dee3fb4e7229e06152f5b0faaccafb4c87d9c2d48fa5bd117a3cc1c0"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:841c56282102b2fba1e0b332bb1c7a0c50992fbc321746af8d3e0e6ca2450e8b"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_aarch64.whl", hash = 
"sha256:8d7ffde5a4b95d8fe9ed38e08e504e497310e3d7a17691bd40bf65734648fdfc"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:055960086b6b92b6e44f5ba04c81c40c10b038588e4b3908b033c99f66125332"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:36491fec63ceb8503b6344c23477647030139f346b749dc5ee672c505939dbbe"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-win32.whl", hash = "sha256:8779a907e026db32e6bc0bc0c8d5de0e2e3afd166afc2d4adcc0603399af5539"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-win_amd64.whl", hash = "sha256:309854fa197885c6278438ddd032ab52e6fec56f162074e343c3635ca7266078"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e8009f94550178dc971aeb4f8787ba7a5b473c22647490428b7229f540a51d2b"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:70f8422f407b13a404b3670fd097855abd5adaf890c710d6678d2b46ab61ac48"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:082783eb1e8baf7b3465dd045132dc5cb5a91432c899dc4e19891c5f782d8d23"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1c30aad2a9c7584c4ee19e646a087b3bbd2d4daab3d88a2afeeae1a7f6febf9"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fc8e245a9f4f0dce39f155e626405f60f1d3cf4d1e52dd2c793ea6b603ca111b"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:802372cb8a69c9ffdf4260e9f01616c8601ba531825ed6f08834827e0b880cd1"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:193a60271a3b105cdbde96fb20b40eab8a50fca3bb1f397546f7a18b53d9aa9c"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_x86_64.whl", hash = 
"sha256:59d58932916792fdbd09cb961a245a0c2d87b07b8296f9138915b998f4522941"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-win32.whl", hash = "sha256:3cfd0edabb589f640636a97ffc38d1b3d760faef208d44e50829cc1ad3f0d3e5"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-win_amd64.whl", hash = "sha256:5661b4629aac228481219abf2e149119af1a71d897f191665e182d9d192d7033"}, + {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7429d309109e7e4a70fd867d69fcfea9ddcb1a1e910caa6b0e2c3776b71f4613"}, + {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5ae619151006da84a0b1585a9bcc81be32459d8061aeb2e116bad5bbaa7d108"}, + {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec0c84a0880621cb2389656a89886ef3133f0b3f8dc016eee6f25bbb49ff6f70"}, + {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:705464c23f821666b76f8f619cf2870225156276562756b3933aaa24708e0ff8"}, + {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:1822016f4b769e89264fe26cefe0bc5e50e4c3ca0747d89bb52d57dc4f1e5ffb"}, + {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:6c92b0c342c1fbfa666010e8175e05026dc570a7ef91d8fa81ce503180f318aa"}, + {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d2e106536540e906c3c866f8615fcf870a9a77c1bfab9ef4b042febfd2fdb953"}, + {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bac9a32e62384b4341ba51a451084eb3b00c6e59aaac1499145dd8b897cb585c"}, + {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:0feed93b9912b7862a8c41be1febcd44b68a824a5c1059b19d5c567afdaa6273"}, + {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:2e2dd6db52e799f065fd565143fde5a872cfe903de1bee7775bc3a349856a790"}, + {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ed13add5d579a5960155f3000420544368501c9703d2fb94f103b4a6126081f6"}, + {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c257a23ed3bf1858593fb03927d9d073fbbdfa24dc2afee537c3314bd66b4e24"}, + {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d47866f64cbdc2d5cc4f8a7a8c49e3ee90c9e487091b9eda7c3a3576418e1cbe"}, + {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9b850e2f17e0a0b5a37d996d3fb728050227489d64d271d678d166abea94f26e"}, + {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:349682288987dc84ac7695f7cd6b510be8d0ec0eee7c1b72dbf2146b4e9efdb8"}, +] + +[package.dependencies] +certifi = "*" +lz4 = "*" +pytz = "*" +urllib3 = ">=1.26" +zstandard = "*" + +[package.extras] +arrow = ["pyarrow"] +numpy = ["numpy"] +orjson = ["orjson"] +pandas = ["pandas"] +sqlalchemy = ["sqlalchemy (>1.3.21,<2.0)"] +tzlocal = ["tzlocal (>=4.0)"] + [[package]] name = "colorama" version = "0.4.5" @@ -1424,6 +1526,20 @@ files = [ [package.dependencies] six = "*" +[[package]] +name = "kafka-python" +version = "2.0.2" +description = "Pure Python client for Apache Kafka" +optional = false +python-versions = "*" +files = [ + {file = "kafka-python-2.0.2.tar.gz", hash = "sha256:04dfe7fea2b63726cd6f3e79a2d86e709d608d74406638c5da33a01d45a9d7e3"}, + {file = "kafka_python-2.0.2-py2.py3-none-any.whl", hash = "sha256:2d92418c7cb1c298fa6c7f0fb3519b520d0d7526ac6cb7ae2a4fc65a51a94b6e"}, +] + +[package.extras] +crc32c = ["crc32c"] + 
[[package]] name = "lazy-object-proxy" version = "1.10.0" @@ -1470,6 +1586,56 @@ files = [ {file = "lazy_object_proxy-1.10.0-pp310.pp311.pp312.pp38.pp39-none-any.whl", hash = "sha256:80fa48bd89c8f2f456fc0765c11c23bf5af827febacd2f523ca5bc1893fcc09d"}, ] +[[package]] +name = "lz4" +version = "4.3.3" +description = "LZ4 Bindings for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "lz4-4.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b891880c187e96339474af2a3b2bfb11a8e4732ff5034be919aa9029484cd201"}, + {file = "lz4-4.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:222a7e35137d7539c9c33bb53fcbb26510c5748779364014235afc62b0ec797f"}, + {file = "lz4-4.3.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f76176492ff082657ada0d0f10c794b6da5800249ef1692b35cf49b1e93e8ef7"}, + {file = "lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1d18718f9d78182c6b60f568c9a9cec8a7204d7cb6fad4e511a2ef279e4cb05"}, + {file = "lz4-4.3.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6cdc60e21ec70266947a48839b437d46025076eb4b12c76bd47f8e5eb8a75dcc"}, + {file = "lz4-4.3.3-cp310-cp310-win32.whl", hash = "sha256:c81703b12475da73a5d66618856d04b1307e43428a7e59d98cfe5a5d608a74c6"}, + {file = "lz4-4.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:43cf03059c0f941b772c8aeb42a0813d68d7081c009542301637e5782f8a33e2"}, + {file = "lz4-4.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:30e8c20b8857adef7be045c65f47ab1e2c4fabba86a9fa9a997d7674a31ea6b6"}, + {file = "lz4-4.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2f7b1839f795315e480fb87d9bc60b186a98e3e5d17203c6e757611ef7dcef61"}, + {file = "lz4-4.3.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edfd858985c23523f4e5a7526ca6ee65ff930207a7ec8a8f57a01eae506aaee7"}, + {file = "lz4-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash 
= "sha256:0e9c410b11a31dbdc94c05ac3c480cb4b222460faf9231f12538d0074e56c563"}, + {file = "lz4-4.3.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d2507ee9c99dbddd191c86f0e0c8b724c76d26b0602db9ea23232304382e1f21"}, + {file = "lz4-4.3.3-cp311-cp311-win32.whl", hash = "sha256:f180904f33bdd1e92967923a43c22899e303906d19b2cf8bb547db6653ea6e7d"}, + {file = "lz4-4.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:b14d948e6dce389f9a7afc666d60dd1e35fa2138a8ec5306d30cd2e30d36b40c"}, + {file = "lz4-4.3.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:e36cd7b9d4d920d3bfc2369840da506fa68258f7bb176b8743189793c055e43d"}, + {file = "lz4-4.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:31ea4be9d0059c00b2572d700bf2c1bc82f241f2c3282034a759c9a4d6ca4dc2"}, + {file = "lz4-4.3.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33c9a6fd20767ccaf70649982f8f3eeb0884035c150c0b818ea660152cf3c809"}, + {file = "lz4-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bca8fccc15e3add173da91be8f34121578dc777711ffd98d399be35487c934bf"}, + {file = "lz4-4.3.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e7d84b479ddf39fe3ea05387f10b779155fc0990125f4fb35d636114e1c63a2e"}, + {file = "lz4-4.3.3-cp312-cp312-win32.whl", hash = "sha256:337cb94488a1b060ef1685187d6ad4ba8bc61d26d631d7ba909ee984ea736be1"}, + {file = "lz4-4.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:5d35533bf2cee56f38ced91f766cd0038b6abf46f438a80d50c52750088be93f"}, + {file = "lz4-4.3.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:363ab65bf31338eb364062a15f302fc0fab0a49426051429866d71c793c23394"}, + {file = "lz4-4.3.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0a136e44a16fc98b1abc404fbabf7f1fada2bdab6a7e970974fb81cf55b636d0"}, + {file = "lz4-4.3.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:abc197e4aca8b63f5ae200af03eb95fb4b5055a8f990079b5bdf042f568469dd"}, + {file = "lz4-4.3.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56f4fe9c6327adb97406f27a66420b22ce02d71a5c365c48d6b656b4aaeb7775"}, + {file = "lz4-4.3.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f0e822cd7644995d9ba248cb4b67859701748a93e2ab7fc9bc18c599a52e4604"}, + {file = "lz4-4.3.3-cp38-cp38-win32.whl", hash = "sha256:24b3206de56b7a537eda3a8123c644a2b7bf111f0af53bc14bed90ce5562d1aa"}, + {file = "lz4-4.3.3-cp38-cp38-win_amd64.whl", hash = "sha256:b47839b53956e2737229d70714f1d75f33e8ac26e52c267f0197b3189ca6de24"}, + {file = "lz4-4.3.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6756212507405f270b66b3ff7f564618de0606395c0fe10a7ae2ffcbbe0b1fba"}, + {file = "lz4-4.3.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ee9ff50557a942d187ec85462bb0960207e7ec5b19b3b48949263993771c6205"}, + {file = "lz4-4.3.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b901c7784caac9a1ded4555258207d9e9697e746cc8532129f150ffe1f6ba0d"}, + {file = "lz4-4.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b6d9ec061b9eca86e4dcc003d93334b95d53909afd5a32c6e4f222157b50c071"}, + {file = "lz4-4.3.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f4c7bf687303ca47d69f9f0133274958fd672efaa33fb5bcde467862d6c621f0"}, + {file = "lz4-4.3.3-cp39-cp39-win32.whl", hash = "sha256:054b4631a355606e99a42396f5db4d22046a3397ffc3269a348ec41eaebd69d2"}, + {file = "lz4-4.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:eac9af361e0d98335a02ff12fb56caeb7ea1196cf1a49dbf6f17828a131da807"}, + {file = "lz4-4.3.3.tar.gz", hash = "sha256:01fe674ef2889dbb9899d8a67361e0c4a2c833af5aeb37dd505727cf5d2a131e"}, +] + +[package.extras] +docs = ["sphinx (>=1.6.0)", "sphinx-bootstrap-theme"] +flake8 = ["flake8"] +tests = ["psutil", "pytest (!=3.3.0)", 
"pytest-cov"] + [[package]] name = "markupsafe" version = "2.1.1" @@ -2361,6 +2527,17 @@ files = [ [package.dependencies] six = ">=1.5" +[[package]] +name = "pytz" +version = "2024.1" +description = "World timezone definitions, modern and historical" +optional = false +python-versions = "*" +files = [ + {file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"}, + {file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"}, +] + [[package]] name = "pywin32" version = "301" @@ -2641,19 +2818,18 @@ pbr = "*" [[package]] name = "setuptools" -version = "65.5.1" +version = "70.0.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "setuptools-65.5.1-py3-none-any.whl", hash = "sha256:d0b9a8433464d5800cbe05094acf5c6d52a91bfac9b52bcfc4d41382be5d5d31"}, - {file = "setuptools-65.5.1.tar.gz", hash = "sha256:e197a19aa8ec9722928f2206f8de752def0e4c9fc6953527360d1c36d94ddb2f"}, + {file = "setuptools-70.0.0-py3-none-any.whl", hash = "sha256:54faa7f2e8d2d11bcd2c07bed282eef1046b5c080d1c32add737d7b5817b1ad4"}, + {file = "setuptools-70.0.0.tar.gz", hash = "sha256:f211a66637b8fa059bb28183da127d4e86396c991a942b028c6650d4319c3fd0"}, ] [package.extras] -docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"] -testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", 
"pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] -testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] +docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] +testing = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] [[package]] name = "six" @@ -3133,18 +3309,18 @@ multidict = ">=4.0" [[package]] name = "zipp" -version = "3.8.1" +version = "3.19.1" description = "Backport of pathlib-compatible object wrapper for zip files" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "zipp-3.8.1-py3-none-any.whl", hash = "sha256:47c40d7fe183a6f21403a199b3e4192cca5774656965b0a4988ad2f8feb5f009"}, - {file = "zipp-3.8.1.tar.gz", hash = "sha256:05b45f1ee8f807d0cc928485ca40a07cb491cf092ff587c0df9cb1fd154848d2"}, + {file = "zipp-3.19.1-py3-none-any.whl", hash = "sha256:2828e64edb5386ea6a52e7ba7cdb17bb30a73a858f5eb6eb93d8d36f5ea26091"}, + {file = "zipp-3.19.1.tar.gz", hash = "sha256:35427f6d5594f4acf82d25541438348c26736fa9b3afa2754bcd63cdb99d8e8f"}, ] 
[package.extras] -docs = ["jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx"] -testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +test = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"] [[package]] name = "zstandard" @@ -3207,4 +3383,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "16ebd6a46768be7f67dbdb4ee5903b167d94edc9965f29252f038c67e9e907b0" +content-hash = "c09bcb333ab550958b33dbf4fec968c500d8e701fd4c96402cddbd9bb8048055" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 288f7769fe..21d92abb20 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -11,6 +11,7 @@ testing = [] [dependencies] ahash.workspace = true anyhow.workspace = true +arc-swap.workspace = true async-compression.workspace = true async-trait.workspace = true atomic-take.workspace = true @@ -73,7 +74,7 @@ rustls.workspace = true scopeguard.workspace = true serde.workspace = true serde_json.workspace = true -sha2 = { workspace = true, features = ["asm"] } +sha2 = { workspace = true, features = ["asm", "oid"] } smol_str.workspace = true smallvec.workspace = true socket2.workspace = true @@ -92,6 +93,8 @@ tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true tracing.workspace = true +try-lock.workspace = true +typed-json.workspace = true url.workspace = true urlencoding.workspace = true utils.workspace = true @@ -101,6 +104,14 @@ x509-parser.workspace = true postgres-protocol.workspace = true 
redis.workspace = true +# jwt stuff +jose-jwa = "0.1.2" +jose-jwk = { version = "0.1.2", features = ["p256", "p384", "rsa"] } +signature = "2" +ecdsa = "0.16" +p256 = "0.13" +rsa = "0.9" + workspace_hack.workspace = true [dev-dependencies] diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index 8c44823c98..3b3c571129 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -113,38 +113,36 @@ impl> From for AuthError { impl UserFacingError for AuthError { fn to_string_client(&self) -> String { - use AuthErrorImpl::*; match self.0.as_ref() { - Link(e) => e.to_string_client(), - GetAuthInfo(e) => e.to_string_client(), - Sasl(e) => e.to_string_client(), - AuthFailed(_) => self.to_string(), - BadAuthMethod(_) => self.to_string(), - MalformedPassword(_) => self.to_string(), - MissingEndpointName => self.to_string(), - Io(_) => "Internal error".to_string(), - IpAddressNotAllowed(_) => self.to_string(), - TooManyConnections => self.to_string(), - UserTimeout(_) => self.to_string(), + AuthErrorImpl::Link(e) => e.to_string_client(), + AuthErrorImpl::GetAuthInfo(e) => e.to_string_client(), + AuthErrorImpl::Sasl(e) => e.to_string_client(), + AuthErrorImpl::AuthFailed(_) => self.to_string(), + AuthErrorImpl::BadAuthMethod(_) => self.to_string(), + AuthErrorImpl::MalformedPassword(_) => self.to_string(), + AuthErrorImpl::MissingEndpointName => self.to_string(), + AuthErrorImpl::Io(_) => "Internal error".to_string(), + AuthErrorImpl::IpAddressNotAllowed(_) => self.to_string(), + AuthErrorImpl::TooManyConnections => self.to_string(), + AuthErrorImpl::UserTimeout(_) => self.to_string(), } } } impl ReportableError for AuthError { fn get_error_kind(&self) -> crate::error::ErrorKind { - use AuthErrorImpl::*; match self.0.as_ref() { - Link(e) => e.get_error_kind(), - GetAuthInfo(e) => e.get_error_kind(), - Sasl(e) => e.get_error_kind(), - AuthFailed(_) => crate::error::ErrorKind::User, - BadAuthMethod(_) => crate::error::ErrorKind::User, - MalformedPassword(_) => 
crate::error::ErrorKind::User, - MissingEndpointName => crate::error::ErrorKind::User, - Io(_) => crate::error::ErrorKind::ClientDisconnect, - IpAddressNotAllowed(_) => crate::error::ErrorKind::User, - TooManyConnections => crate::error::ErrorKind::RateLimit, - UserTimeout(_) => crate::error::ErrorKind::User, + AuthErrorImpl::Link(e) => e.get_error_kind(), + AuthErrorImpl::GetAuthInfo(e) => e.get_error_kind(), + AuthErrorImpl::Sasl(e) => e.get_error_kind(), + AuthErrorImpl::AuthFailed(_) => crate::error::ErrorKind::User, + AuthErrorImpl::BadAuthMethod(_) => crate::error::ErrorKind::User, + AuthErrorImpl::MalformedPassword(_) => crate::error::ErrorKind::User, + AuthErrorImpl::MissingEndpointName => crate::error::ErrorKind::User, + AuthErrorImpl::Io(_) => crate::error::ErrorKind::ClientDisconnect, + AuthErrorImpl::IpAddressNotAllowed(_) => crate::error::ErrorKind::User, + AuthErrorImpl::TooManyConnections => crate::error::ErrorKind::RateLimit, + AuthErrorImpl::UserTimeout(_) => crate::error::ErrorKind::User, } } } diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index f757a15fbb..7592d076ec 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -1,5 +1,6 @@ mod classic; mod hacks; +pub mod jwt; mod link; use std::net::IpAddr; @@ -79,9 +80,8 @@ pub trait TestBackend: Send + Sync + 'static { impl std::fmt::Display for BackendType<'_, (), ()> { fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - use BackendType::*; match self { - Console(api, _) => match &**api { + Self::Console(api, _) => match &**api { ConsoleBackend::Console(endpoint) => { fmt.debug_tuple("Console").field(&endpoint.url()).finish() } @@ -92,7 +92,7 @@ impl std::fmt::Display for BackendType<'_, (), ()> { #[cfg(test)] ConsoleBackend::Test(_) => fmt.debug_tuple("Test").finish(), }, - Link(url, _) => fmt.debug_tuple("Link").field(&url.as_str()).finish(), + Self::Link(url, _) => fmt.debug_tuple("Link").field(&url.as_str()).finish(), } } } @@ -101,10 
+101,9 @@ impl BackendType<'_, T, D> { /// Very similar to [`std::option::Option::as_ref`]. /// This helps us pass structured config to async tasks. pub fn as_ref(&self) -> BackendType<'_, &T, &D> { - use BackendType::*; match self { - Console(c, x) => Console(MaybeOwned::Borrowed(c), x), - Link(c, x) => Link(MaybeOwned::Borrowed(c), x), + Self::Console(c, x) => BackendType::Console(MaybeOwned::Borrowed(c), x), + Self::Link(c, x) => BackendType::Link(MaybeOwned::Borrowed(c), x), } } } @@ -114,10 +113,9 @@ impl<'a, T, D> BackendType<'a, T, D> { /// Maps [`BackendType`] to [`BackendType`] by applying /// a function to a contained value. pub fn map(self, f: impl FnOnce(T) -> R) -> BackendType<'a, R, D> { - use BackendType::*; match self { - Console(c, x) => Console(c, f(x)), - Link(c, x) => Link(c, x), + Self::Console(c, x) => BackendType::Console(c, f(x)), + Self::Link(c, x) => BackendType::Link(c, x), } } } @@ -125,10 +123,9 @@ impl<'a, T, D, E> BackendType<'a, Result, D> { /// Very similar to [`std::option::Option::transpose`]. /// This is most useful for error handling. pub fn transpose(self) -> Result, E> { - use BackendType::*; match self { - Console(c, x) => x.map(|x| Console(c, x)), - Link(c, x) => Ok(Link(c, x)), + Self::Console(c, x) => x.map(|x| BackendType::Console(c, x)), + Self::Link(c, x) => Ok(BackendType::Link(c, x)), } } } @@ -218,7 +215,7 @@ impl RateBucketInfo { impl AuthenticationConfig { pub fn check_rate_limit( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, config: &AuthenticationConfig, secret: AuthSecret, endpoint: &EndpointId, @@ -243,7 +240,7 @@ impl AuthenticationConfig { let limit_not_exceeded = self.rate_limiter.check( ( endpoint_int, - MaskedIp::new(ctx.peer_addr, config.rate_limit_ip_subnet), + MaskedIp::new(ctx.peer_addr(), config.rate_limit_ip_subnet), ), password_weight, ); @@ -274,7 +271,7 @@ impl AuthenticationConfig { /// /// All authentication flows will emit an AuthenticationOk message if successful. 
async fn auth_quirks( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, api: &impl console::Api, user_info: ComputeUserInfoMaybeEndpoint, client: &mut stream::PqStream>, @@ -292,7 +289,9 @@ async fn auth_quirks( ctx.set_endpoint_id(res.info.endpoint.clone()); let password = match res.keys { ComputeCredentialKeys::Password(p) => p, - _ => unreachable!("password hack should return a password"), + ComputeCredentialKeys::AuthKeys(_) => { + unreachable!("password hack should return a password") + } }; (res.info, Some(password)) } @@ -303,8 +302,8 @@ async fn auth_quirks( let (allowed_ips, maybe_secret) = api.get_allowed_ips_and_secret(ctx, &info).await?; // check allowed list - if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) { - return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr)); + if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) { + return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr())); } if !endpoint_rate_limiter.check(info.endpoint.clone().into(), 1) { @@ -356,7 +355,7 @@ async fn auth_quirks( } async fn authenticate_with_secret( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, secret: AuthSecret, info: ComputeUserInfo, client: &mut stream::PqStream>, @@ -399,21 +398,17 @@ async fn authenticate_with_secret( impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { /// Get compute endpoint name from the credentials. pub fn get_endpoint(&self) -> Option { - use BackendType::*; - match self { - Console(_, user_info) => user_info.endpoint_id.clone(), - Link(_, _) => Some("link".into()), + Self::Console(_, user_info) => user_info.endpoint_id.clone(), + Self::Link(_, _) => Some("link".into()), } } /// Get username from the credentials. 
pub fn get_user(&self) -> &str { - use BackendType::*; - match self { - Console(_, user_info) => &user_info.user, - Link(_, _) => "link", + Self::Console(_, user_info) => &user_info.user, + Self::Link(_, _) => "link", } } @@ -421,16 +416,14 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)] pub async fn authenticate( self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, endpoint_rate_limiter: Arc, ) -> auth::Result> { - use BackendType::*; - let res = match self { - Console(api, user_info) => { + Self::Console(api, user_info) => { info!( user = &*user_info.user, project = user_info.endpoint(), @@ -450,7 +443,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { BackendType::Console(api, credentials) } // NOTE: this auth backend doesn't use client credentials. - Link(url, _) => { + Self::Link(url, _) => { info!("performing link authentication"); let info = link::authenticate(ctx, &url, client).await?; @@ -467,23 +460,21 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { impl BackendType<'_, ComputeUserInfo, &()> { pub async fn get_role_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, ) -> Result { - use BackendType::*; match self { - Console(api, user_info) => api.get_role_secret(ctx, user_info).await, - Link(_, _) => Ok(Cached::new_uncached(None)), + Self::Console(api, user_info) => api.get_role_secret(ctx, user_info).await, + Self::Link(_, _) => Ok(Cached::new_uncached(None)), } } pub async fn get_allowed_ips_and_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { - use BackendType::*; match self { - Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await, - Link(_, _) => Ok((Cached::new_uncached(Arc::new(vec![])), 
None)), + Self::Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await, + Self::Link(_, _) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), } } } @@ -492,20 +483,18 @@ impl BackendType<'_, ComputeUserInfo, &()> { impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> { async fn wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, ) -> Result { - use BackendType::*; - match self { - Console(api, creds) => api.wake_compute(ctx, &creds.info).await, - Link(_, info) => Ok(Cached::new_uncached(info.clone())), + Self::Console(api, creds) => api.wake_compute(ctx, &creds.info).await, + Self::Link(_, info) => Ok(Cached::new_uncached(info.clone())), } } fn get_keys(&self) -> Option<&ComputeCredentialKeys> { match self { - BackendType::Console(_, creds) => Some(&creds.keys), - BackendType::Link(_, _) => None, + Self::Console(_, creds) => Some(&creds.keys), + Self::Link(_, _) => None, } } } @@ -514,20 +503,18 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> { impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> { async fn wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, ) -> Result { - use BackendType::*; - match self { - Console(api, creds) => api.wake_compute(ctx, &creds.info).await, - Link(_, _) => unreachable!("link auth flow doesn't support waking the compute"), + Self::Console(api, creds) => api.wake_compute(ctx, &creds.info).await, + Self::Link(_, _) => unreachable!("link auth flow doesn't support waking the compute"), } } fn get_keys(&self) -> Option<&ComputeCredentialKeys> { match self { - BackendType::Console(_, creds) => Some(&creds.keys), - BackendType::Link(_, _) => None, + Self::Console(_, creds) => Some(&creds.keys), + Self::Link(_, _) => None, } } } @@ -571,7 +558,7 @@ mod tests { impl console::Api for Auth { async fn get_role_secret( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, 
_user_info: &super::ComputeUserInfo, ) -> Result { Ok(CachedRoleSecret::new_uncached(Some(self.secret.clone()))) @@ -579,7 +566,7 @@ mod tests { async fn get_allowed_ips_and_secret( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, _user_info: &super::ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), console::errors::GetAuthInfoError> { @@ -591,7 +578,7 @@ mod tests { async fn wake_compute( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, _user_info: &super::ComputeUserInfo, ) -> Result { unimplemented!() @@ -665,7 +652,7 @@ mod tests { let (mut client, server) = tokio::io::duplex(1024); let mut stream = PqStream::new(Stream::from_raw(server)); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let api = Auth { ips: vec![], secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), @@ -717,11 +704,13 @@ mod tests { _ => panic!("wrong message"), } }); - let endpoint_rate_limiter = - Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET)); + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( + EndpointRateLimiter::DEFAULT, + 64, + )); let _creds = auth_quirks( - &mut ctx, + &ctx, &api, user_info, &mut stream, @@ -740,7 +729,7 @@ mod tests { let (mut client, server) = tokio::io::duplex(1024); let mut stream = PqStream::new(Stream::from_raw(server)); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let api = Auth { ips: vec![], secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), @@ -767,11 +756,13 @@ mod tests { frontend::password_message(b"my-secret-password", &mut write).unwrap(); client.write_all(&write).await.unwrap(); }); - let endpoint_rate_limiter = - Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET)); + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( + EndpointRateLimiter::DEFAULT, + 64, + )); let _creds = 
auth_quirks( - &mut ctx, + &ctx, &api, user_info, &mut stream, @@ -790,7 +781,7 @@ mod tests { let (mut client, server) = tokio::io::duplex(1024); let mut stream = PqStream::new(Stream::from_raw(server)); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let api = Auth { ips: vec![], secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), @@ -818,11 +809,13 @@ mod tests { client.write_all(&write).await.unwrap(); }); - let endpoint_rate_limiter = - Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET)); + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( + EndpointRateLimiter::DEFAULT, + 64, + )); let creds = auth_quirks( - &mut ctx, + &ctx, &api, user_info, &mut stream, diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index b98fa63120..285fa29428 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -12,7 +12,7 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; pub(super) async fn authenticate( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, creds: ComputeUserInfo, client: &mut PqStream>, config: &'static AuthenticationConfig, @@ -27,7 +27,7 @@ pub(super) async fn authenticate( } AuthSecret::Scram(secret) => { info!("auth endpoint chooses SCRAM"); - let scram = auth::Scram(&secret, &mut *ctx); + let scram = auth::Scram(&secret, ctx); let auth_outcome = tokio::time::timeout( config.scram_protocol_timeout, diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index 6b0f5e1726..56921dd949 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -18,7 +18,7 @@ use tracing::{info, warn}; /// These properties are benefical for serverless JS workers, so we /// use this mechanism for websocket connections. 
pub async fn authenticate_cleartext( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, info: ComputeUserInfo, client: &mut stream::PqStream>, secret: AuthSecret, @@ -28,7 +28,7 @@ pub async fn authenticate_cleartext( ctx.set_auth_method(crate::context::AuthMethod::Cleartext); // pause the timer while we communicate with the client - let paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client); + let paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client); let ep = EndpointIdInt::from(&info.endpoint); @@ -60,7 +60,7 @@ pub async fn authenticate_cleartext( /// Similar to [`authenticate_cleartext`], but there's a specific password format, /// and passwords are not yet validated (we don't know how to validate them!) pub async fn password_hack_no_authentication( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, info: ComputeUserInfoNoEndpoint, client: &mut stream::PqStream>, ) -> auth::Result { @@ -68,7 +68,7 @@ pub async fn password_hack_no_authentication( ctx.set_auth_method(crate::context::AuthMethod::Cleartext); // pause the timer while we communicate with the client - let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client); + let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client); let payload = AuthFlow::new(client) .begin(auth::PasswordHack) diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs new file mode 100644 index 0000000000..e021a7e23f --- /dev/null +++ b/proxy/src/auth/backend/jwt.rs @@ -0,0 +1,556 @@ +use std::{future::Future, sync::Arc, time::Duration}; + +use anyhow::{bail, ensure, Context}; +use arc_swap::ArcSwapOption; +use dashmap::DashMap; +use jose_jwk::crypto::KeyInfo; +use signature::Verifier; +use tokio::time::Instant; + +use crate::{http::parse_json_body_with_limit, intern::EndpointIdInt}; + +// TODO(conrad): make these configurable. 
+const MIN_RENEW: Duration = Duration::from_secs(30); +const AUTO_RENEW: Duration = Duration::from_secs(300); +const MAX_RENEW: Duration = Duration::from_secs(3600); +const MAX_JWK_BODY_SIZE: usize = 64 * 1024; + +/// How to get the JWT auth rules +pub trait FetchAuthRules: Clone + Send + Sync + 'static { + fn fetch_auth_rules(&self) -> impl Future> + Send; +} + +#[derive(Clone)] +struct FetchAuthRulesFromCplane { + #[allow(dead_code)] + endpoint: EndpointIdInt, +} + +impl FetchAuthRules for FetchAuthRulesFromCplane { + async fn fetch_auth_rules(&self) -> anyhow::Result { + Err(anyhow::anyhow!("not yet implemented")) + } +} + +pub struct AuthRules { + jwks_urls: Vec, +} + +#[derive(Default)] +pub struct JwkCache { + client: reqwest::Client, + + map: DashMap>, +} + +pub struct JwkCacheEntryLock { + cached: ArcSwapOption, + lookup: tokio::sync::Semaphore, +} + +impl Default for JwkCacheEntryLock { + fn default() -> Self { + JwkCacheEntryLock { + cached: ArcSwapOption::empty(), + lookup: tokio::sync::Semaphore::new(1), + } + } +} + +pub struct JwkCacheEntry { + /// Should refetch at least every hour to verify when old keys have been removed. + /// Should refetch when new key IDs are seen only every 5 minutes or so + last_retrieved: Instant, + + /// cplane will return multiple JWKs urls that we need to scrape. + key_sets: ahash::HashMap, +} + +impl JwkCacheEntryLock { + async fn acquire_permit<'a>(self: &'a Arc) -> JwkRenewalPermit<'a> { + JwkRenewalPermit::acquire_permit(self).await + } + + fn try_acquire_permit<'a>(self: &'a Arc) -> Option> { + JwkRenewalPermit::try_acquire_permit(self) + } + + async fn renew_jwks( + &self, + _permit: JwkRenewalPermit<'_>, + client: &reqwest::Client, + auth_rules: &F, + ) -> anyhow::Result> { + // double check that no one beat us to updating the cache. 
+ let now = Instant::now(); + let guard = self.cached.load_full(); + if let Some(cached) = guard { + let last_update = now.duration_since(cached.last_retrieved); + if last_update < Duration::from_secs(300) { + return Ok(cached); + } + } + + let rules = auth_rules.fetch_auth_rules().await?; + let mut key_sets = ahash::HashMap::with_capacity_and_hasher( + rules.jwks_urls.len(), + ahash::RandomState::new(), + ); + // TODO(conrad): run concurrently + // TODO(conrad): strip the JWKs urls (should be checked by cplane as well - cloud#16284) + for url in rules.jwks_urls { + let req = client.get(url.clone()); + // TODO(conrad): eventually switch to using reqwest_middleware/`new_client_with_timeout`. + match req.send().await.and_then(|r| r.error_for_status()) { + // todo: should we re-insert JWKs if we want to keep this JWKs URL? + // I expect these failures would be quite sparse. + Err(e) => tracing::warn!(?url, error=?e, "could not fetch JWKs"), + Ok(r) => { + let resp: http::Response = r.into(); + match parse_json_body_with_limit::( + resp.into_body(), + MAX_JWK_BODY_SIZE, + ) + .await + { + Err(e) => tracing::warn!(?url, error=?e, "could not decode JWKs"), + Ok(jwks) => { + key_sets.insert(url, jwks); + } + } + } + } + } + + let entry = Arc::new(JwkCacheEntry { + last_retrieved: now, + key_sets, + }); + self.cached.swap(Some(Arc::clone(&entry))); + + Ok(entry) + } + + async fn get_or_update_jwk_cache( + self: &Arc, + client: &reqwest::Client, + fetch: &F, + ) -> Result, anyhow::Error> { + let now = Instant::now(); + let guard = self.cached.load_full(); + + // if we have no cached JWKs, try and get some + let Some(cached) = guard else { + let permit = self.acquire_permit().await; + return self.renew_jwks(permit, client, fetch).await; + }; + + let last_update = now.duration_since(cached.last_retrieved); + + // check if the cached JWKs need updating. 
+ if last_update > MAX_RENEW { + let permit = self.acquire_permit().await; + + // it's been too long since we checked the keys. wait for them to update. + return self.renew_jwks(permit, client, fetch).await; + } + + // every 5 minutes we should spawn a job to eagerly update the token. + if last_update > AUTO_RENEW { + if let Some(permit) = self.try_acquire_permit() { + tracing::debug!("JWKs should be renewed. Renewal permit acquired"); + let permit = permit.into_owned(); + let entry = self.clone(); + let client = client.clone(); + let fetch = fetch.clone(); + tokio::spawn(async move { + if let Err(e) = entry.renew_jwks(permit, &client, &fetch).await { + tracing::warn!(error=?e, "could not fetch JWKs in background job"); + } + }); + } else { + tracing::debug!("JWKs should be renewed. Renewal permit already taken, skipping"); + } + } + + Ok(cached) + } + + async fn check_jwt( + self: &Arc, + jwt: String, + client: &reqwest::Client, + fetch: &F, + ) -> Result<(), anyhow::Error> { + // JWT compact form is defined to be + // || . || || . || + // where Signature = alg( || . || ); + + let (header_payload, signature) = jwt + .rsplit_once(".") + .context("Provided authentication token is not a valid JWT encoding")?; + let (header, _payload) = header_payload + .split_once(".") + .context("Provided authentication token is not a valid JWT encoding")?; + + let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD) + .context("Provided authentication token is not a valid JWT encoding")?; + let header = serde_json::from_slice::>(&header) + .context("Provided authentication token is not a valid JWT encoding")?; + + let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD) + .context("Provided authentication token is not a valid JWT encoding")?; + + ensure!(header.typ == "JWT"); + let kid = header.kid.context("missing key id")?; + + let mut guard = self.get_or_update_jwk_cache(client, fetch).await?; + + // get the key from the JWKs if possible. 
If not, wait for the keys to update. + let jwk = loop { + let jwk = guard + .key_sets + .values() + .flat_map(|jwks| &jwks.keys) + .find(|jwk| jwk.prm.kid.as_deref() == Some(kid)); + + match jwk { + Some(jwk) => break jwk, + None if guard.last_retrieved.elapsed() > MIN_RENEW => { + let permit = self.acquire_permit().await; + guard = self.renew_jwks(permit, client, fetch).await?; + } + _ => { + bail!("jwk not found"); + } + } + }; + + ensure!( + jwk.is_supported(&header.alg), + "signature algorithm not supported" + ); + + match &jwk.key { + jose_jwk::Key::Ec(key) => { + verify_ec_signature(header_payload.as_bytes(), &sig, key)?; + } + jose_jwk::Key::Rsa(key) => { + verify_rsa_signature(header_payload.as_bytes(), &sig, key, &jwk.prm.alg)?; + } + key => bail!("unsupported key type {key:?}"), + }; + + // TODO(conrad): verify iss, exp, nbf, etc... + + Ok(()) + } +} + +impl JwkCache { + pub async fn check_jwt( + &self, + endpoint: EndpointIdInt, + jwt: String, + ) -> Result<(), anyhow::Error> { + // try with just a read lock first + let entry = self.map.get(&endpoint).as_deref().map(Arc::clone); + let entry = match entry { + Some(entry) => entry, + None => { + // acquire a write lock after to insert. 
+ let entry = self.map.entry(endpoint).or_default(); + Arc::clone(&*entry) + } + }; + + let fetch = FetchAuthRulesFromCplane { endpoint }; + entry.check_jwt(jwt, &self.client, &fetch).await + } +} + +fn verify_ec_signature(data: &[u8], sig: &[u8], key: &jose_jwk::Ec) -> anyhow::Result<()> { + use ecdsa::Signature; + use signature::Verifier; + + match key.crv { + jose_jwk::EcCurves::P256 => { + let pk = + p256::PublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid P256 key"))?; + let key = p256::ecdsa::VerifyingKey::from(&pk); + let sig = Signature::from_slice(sig)?; + key.verify(data, &sig)?; + } + key => bail!("unsupported ec key type {key:?}"), + } + + Ok(()) +} + +fn verify_rsa_signature( + data: &[u8], + sig: &[u8], + key: &jose_jwk::Rsa, + alg: &Option, +) -> anyhow::Result<()> { + use jose_jwa::{Algorithm, Signing}; + use rsa::{ + pkcs1v15::{Signature, VerifyingKey}, + RsaPublicKey, + }; + + let key = RsaPublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid RSA key"))?; + + match alg { + Some(Algorithm::Signing(Signing::Rs256)) => { + let key = VerifyingKey::::new(key); + let sig = Signature::try_from(sig)?; + key.verify(data, &sig)?; + } + _ => bail!("invalid RSA signing algorithm"), + }; + + Ok(()) +} + +/// +#[derive(serde::Deserialize, serde::Serialize)] +struct JWTHeader<'a> { + /// must be "JWT" + typ: &'a str, + /// must be a supported alg + alg: jose_jwa::Algorithm, + /// key id, must be provided for our usecase + kid: Option<&'a str>, +} + +struct JwkRenewalPermit<'a> { + inner: Option>, +} + +enum JwkRenewalPermitInner<'a> { + Owned(Arc), + Borrowed(&'a Arc), +} + +impl JwkRenewalPermit<'_> { + fn into_owned(mut self) -> JwkRenewalPermit<'static> { + JwkRenewalPermit { + inner: self.inner.take().map(JwkRenewalPermitInner::into_owned), + } + } + + async fn acquire_permit(from: &Arc) -> JwkRenewalPermit<'_> { + match from.lookup.acquire().await { + Ok(permit) => { + permit.forget(); + JwkRenewalPermit { + inner: 
Some(JwkRenewalPermitInner::Borrowed(from)), + } + } + Err(_) => panic!("semaphore should not be closed"), + } + } + + fn try_acquire_permit(from: &Arc) -> Option> { + match from.lookup.try_acquire() { + Ok(permit) => { + permit.forget(); + Some(JwkRenewalPermit { + inner: Some(JwkRenewalPermitInner::Borrowed(from)), + }) + } + Err(tokio::sync::TryAcquireError::NoPermits) => None, + Err(tokio::sync::TryAcquireError::Closed) => panic!("semaphore should not be closed"), + } + } +} + +impl JwkRenewalPermitInner<'_> { + fn into_owned(self) -> JwkRenewalPermitInner<'static> { + match self { + JwkRenewalPermitInner::Owned(p) => JwkRenewalPermitInner::Owned(p), + JwkRenewalPermitInner::Borrowed(p) => JwkRenewalPermitInner::Owned(Arc::clone(p)), + } + } +} + +impl Drop for JwkRenewalPermit<'_> { + fn drop(&mut self) { + let entry = match &self.inner { + None => return, + Some(JwkRenewalPermitInner::Owned(p)) => p, + Some(JwkRenewalPermitInner::Borrowed(p)) => *p, + }; + entry.lookup.add_permits(1); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use std::{future::IntoFuture, net::SocketAddr, time::SystemTime}; + + use base64::URL_SAFE_NO_PAD; + use bytes::Bytes; + use http::Response; + use http_body_util::Full; + use hyper1::service::service_fn; + use hyper_util::rt::TokioIo; + use rand::rngs::OsRng; + use signature::Signer; + use tokio::net::TcpListener; + + fn new_ec_jwk(kid: String) -> (p256::SecretKey, jose_jwk::Jwk) { + let sk = p256::SecretKey::random(&mut OsRng); + let pk = sk.public_key().into(); + let jwk = jose_jwk::Jwk { + key: jose_jwk::Key::Ec(pk), + prm: jose_jwk::Parameters { + kid: Some(kid), + alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Es256)), + ..Default::default() + }, + }; + (sk, jwk) + } + + fn new_rsa_jwk(kid: String) -> (rsa::RsaPrivateKey, jose_jwk::Jwk) { + let sk = rsa::RsaPrivateKey::new(&mut OsRng, 2048).unwrap(); + let pk = sk.to_public_key().into(); + let jwk = jose_jwk::Jwk { + key: jose_jwk::Key::Rsa(pk), + prm: 
jose_jwk::Parameters { + kid: Some(kid), + alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Rs256)), + ..Default::default() + }, + }; + (sk, jwk) + } + + fn build_jwt_payload(kid: String, sig: jose_jwa::Signing) -> String { + let header = JWTHeader { + typ: "JWT", + alg: jose_jwa::Algorithm::Signing(sig), + kid: Some(&kid), + }; + let body = typed_json::json! {{ + "exp": SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs() + 3600, + }}; + + let header = + base64::encode_config(serde_json::to_string(&header).unwrap(), URL_SAFE_NO_PAD); + let body = base64::encode_config(body.to_string(), URL_SAFE_NO_PAD); + + format!("{header}.{body}") + } + + fn new_ec_jwt(kid: String, key: p256::SecretKey) -> String { + use p256::ecdsa::{Signature, SigningKey}; + + let payload = build_jwt_payload(kid, jose_jwa::Signing::Es256); + let sig: Signature = SigningKey::from(key).sign(payload.as_bytes()); + let sig = base64::encode_config(sig.to_bytes(), URL_SAFE_NO_PAD); + + format!("{payload}.{sig}") + } + + fn new_rsa_jwt(kid: String, key: rsa::RsaPrivateKey) -> String { + use rsa::pkcs1v15::SigningKey; + use rsa::signature::SignatureEncoding; + + let payload = build_jwt_payload(kid, jose_jwa::Signing::Rs256); + let sig = SigningKey::::new(key).sign(payload.as_bytes()); + let sig = base64::encode_config(sig.to_bytes(), URL_SAFE_NO_PAD); + + format!("{payload}.{sig}") + } + + #[tokio::test] + async fn renew() { + let (rs1, jwk1) = new_rsa_jwk("1".into()); + let (rs2, jwk2) = new_rsa_jwk("2".into()); + let (ec1, jwk3) = new_ec_jwk("3".into()); + let (ec2, jwk4) = new_ec_jwk("4".into()); + + let jwt1 = new_rsa_jwt("1".into(), rs1); + let jwt2 = new_rsa_jwt("2".into(), rs2); + let jwt3 = new_ec_jwt("3".into(), ec1); + let jwt4 = new_ec_jwt("4".into(), ec2); + + let foo_jwks = jose_jwk::JwkSet { + keys: vec![jwk1, jwk3], + }; + let bar_jwks = jose_jwk::JwkSet { + keys: vec![jwk2, jwk4], + }; + + let service = service_fn(move |req| { + let foo_jwks = 
foo_jwks.clone(); + let bar_jwks = bar_jwks.clone(); + async move { + let jwks = match req.uri().path() { + "/foo" => &foo_jwks, + "/bar" => &bar_jwks, + _ => { + return Response::builder() + .status(404) + .body(Full::new(Bytes::new())); + } + }; + let body = serde_json::to_vec(jwks).unwrap(); + Response::builder() + .status(200) + .body(Full::new(Bytes::from(body))) + } + }); + + let listener = TcpListener::bind("0.0.0.0:0").await.unwrap(); + let server = hyper1::server::conn::http1::Builder::new(); + let addr = listener.local_addr().unwrap(); + tokio::spawn(async move { + loop { + let (s, _) = listener.accept().await.unwrap(); + let serve = server.serve_connection(TokioIo::new(s), service.clone()); + tokio::spawn(serve.into_future()); + } + }); + + let client = reqwest::Client::new(); + + #[derive(Clone)] + struct Fetch(SocketAddr); + + impl FetchAuthRules for Fetch { + async fn fetch_auth_rules(&self) -> anyhow::Result { + Ok(AuthRules { + jwks_urls: vec![ + format!("http://{}/foo", self.0).parse().unwrap(), + format!("http://{}/bar", self.0).parse().unwrap(), + ], + }) + } + } + + let jwk_cache = Arc::new(JwkCacheEntryLock::default()); + + jwk_cache + .check_jwt(jwt1, &client, &Fetch(addr)) + .await + .unwrap(); + jwk_cache + .check_jwt(jwt2, &client, &Fetch(addr)) + .await + .unwrap(); + jwk_cache + .check_jwt(jwt3, &client, &Fetch(addr)) + .await + .unwrap(); + jwk_cache + .check_jwt(jwt4, &client, &Fetch(addr)) + .await + .unwrap(); + } +} diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index 5932e1337c..95f4614736 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -57,7 +57,7 @@ pub fn new_psql_session_id() -> String { } pub(super) async fn authenticate( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, link_uri: &reqwest::Url, client: &mut PqStream, ) -> auth::Result { diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index d06f5614f1..849e7d65e8 100644 --- 
a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -84,15 +84,17 @@ pub fn endpoint_sni( impl ComputeUserInfoMaybeEndpoint { pub fn parse( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, params: &StartupMessageParams, sni: Option<&str>, common_names: Option<&HashSet>, ) -> Result { - use ComputeUserInfoParseError::*; - // Some parameters are stored in the startup message. - let get_param = |key| params.get(key).ok_or(MissingKey(key)); + let get_param = |key| { + params + .get(key) + .ok_or(ComputeUserInfoParseError::MissingKey(key)) + }; let user: RoleName = get_param("user")?.into(); // Project name might be passed via PG's command-line options. @@ -122,11 +124,14 @@ impl ComputeUserInfoMaybeEndpoint { let endpoint = match (endpoint_option, endpoint_from_domain) { // Invariant: if we have both project name variants, they should match. (Some(option), Some(domain)) if option != domain => { - Some(Err(InconsistentProjectNames { domain, option })) + Some(Err(ComputeUserInfoParseError::InconsistentProjectNames { + domain, + option, + })) } // Invariant: project name may not contain certain characters. (a, b) => a.or(b).map(|name| match project_name_valid(name.as_ref()) { - false => Err(MalformedProjectName(name)), + false => Err(ComputeUserInfoParseError::MalformedProjectName(name)), true => Ok(name), }), } @@ -186,7 +191,7 @@ impl<'de> serde::de::Deserialize<'de> for IpPattern { impl<'de> serde::de::Visitor<'de> for StrVisitor { type Value = IpPattern; - fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(formatter, "comma separated list with ip address, ip address range, or ip address subnet mask") } @@ -249,8 +254,8 @@ mod tests { fn parse_bare_minimum() -> anyhow::Result<()> { // According to postgresql, only `user` should be required. 
let options = StartupMessageParams::new([("user", "john_doe")]); - let mut ctx = RequestMonitoring::test(); - let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + let ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id, None); @@ -264,8 +269,8 @@ mod tests { ("database", "world"), // should be ignored ("foo", "bar"), // should be ignored ]); - let mut ctx = RequestMonitoring::test(); - let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + let ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id, None); @@ -279,9 +284,9 @@ mod tests { let sni = Some("foo.localhost"); let common_names = Some(["localhost".into()].into()); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let user_info = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id.as_deref(), Some("foo")); assert_eq!(user_info.options.get_cache_key("foo"), "foo"); @@ -296,8 +301,8 @@ mod tests { ("options", "-ckey=1 project=bar -c geqo=off"), ]); - let mut ctx = RequestMonitoring::test(); - let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + let ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id.as_deref(), Some("bar")); @@ -311,8 +316,8 @@ mod tests { ("options", "-ckey=1 endpoint=bar -c geqo=off"), ]); - let mut ctx = RequestMonitoring::test(); - let user_info = 
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + let ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id.as_deref(), Some("bar")); @@ -329,8 +334,8 @@ mod tests { ), ]); - let mut ctx = RequestMonitoring::test(); - let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + let ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert!(user_info.endpoint_id.is_none()); @@ -344,8 +349,8 @@ mod tests { ("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"), ]); - let mut ctx = RequestMonitoring::test(); - let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + let ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert!(user_info.endpoint_id.is_none()); @@ -359,9 +364,9 @@ mod tests { let sni = Some("baz.localhost"); let common_names = Some(["localhost".into()].into()); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let user_info = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id.as_deref(), Some("baz")); @@ -374,16 +379,16 @@ mod tests { let common_names = Some(["a.com".into(), "b.com".into()].into()); let sni = Some("p1.a.com"); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let user_info = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; 
assert_eq!(user_info.endpoint_id.as_deref(), Some("p1")); let common_names = Some(["a.com".into(), "b.com".into()].into()); let sni = Some("p1.b.com"); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let user_info = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.endpoint_id.as_deref(), Some("p1")); Ok(()) @@ -397,10 +402,9 @@ mod tests { let sni = Some("second.localhost"); let common_names = Some(["localhost".into()].into()); - let mut ctx = RequestMonitoring::test(); - let err = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref()) - .expect_err("should fail"); + let ctx = RequestMonitoring::test(); + let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref()) + .expect_err("should fail"); match err { InconsistentProjectNames { domain, option } => { assert_eq!(option, "first"); @@ -417,10 +421,9 @@ mod tests { let sni = Some("project.localhost"); let common_names = Some(["example.com".into()].into()); - let mut ctx = RequestMonitoring::test(); - let err = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref()) - .expect_err("should fail"); + let ctx = RequestMonitoring::test(); + let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref()) + .expect_err("should fail"); match err { UnknownCommonName { cn } => { assert_eq!(cn, "localhost"); @@ -438,9 +441,9 @@ mod tests { let sni = Some("project.localhost"); let common_names = Some(["localhost".into()].into()); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let user_info = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.endpoint_id.as_deref(), 
Some("project")); assert_eq!( user_info.options.get_cache_key("project"), diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 59d1ac17f4..acf7b4f6b6 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -27,7 +27,7 @@ pub trait AuthMethod { pub struct Begin; /// Use [SCRAM](crate::scram)-based auth in [`AuthFlow`]. -pub struct Scram<'a>(pub &'a scram::ServerSecret, pub &'a mut RequestMonitoring); +pub struct Scram<'a>(pub &'a scram::ServerSecret, pub &'a RequestMonitoring); impl AuthMethod for Scram<'_> { #[inline(always)] @@ -155,7 +155,7 @@ impl AuthFlow<'_, S, Scram<'_>> { let Scram(secret, ctx) = self.state; // pause the timer while we communicate with the client - let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client); + let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client); // Initial client message contains the chosen auth method's name. let msg = self.stream.read_password_message().await?; @@ -168,10 +168,8 @@ impl AuthFlow<'_, S, Scram<'_>> { } match sasl.method { - SCRAM_SHA_256 => ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256), - SCRAM_SHA_256_PLUS => { - ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256Plus) - } + SCRAM_SHA_256 => ctx.set_auth_method(crate::context::AuthMethod::ScramSha256), + SCRAM_SHA_256_PLUS => ctx.set_auth_method(crate::context::AuthMethod::ScramSha256Plus), _ => {} } info!("client chooses {}", sasl.method); diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index e1674049a6..1038fa5116 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -10,7 +10,7 @@ use itertools::Itertools; use proxy::config::TlsServerEndPoint; use proxy::context::RequestMonitoring; use proxy::metrics::{Metrics, ThreadPoolMetrics}; -use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled}; +use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; use 
rustls::pki_types::PrivateKeyDer; use tokio::net::TcpListener; @@ -205,7 +205,7 @@ async fn task_main( const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; async fn ssl_handshake( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, raw_stream: S, tls_config: Arc, tls_server_end_point: TlsServerEndPoint, @@ -216,10 +216,11 @@ async fn ssl_handshake( use pq_proto::FeStartupPacket::*; match msg { - SslRequest => { + SslRequest { direct: false } => { stream .write_message(&pq_proto::BeMessage::EncryptionResponse(true)) .await?; + // Upgrade raw stream into a secure TLS-backed stream. // NOTE: We've consumed `tls`; this fact will be used later. @@ -255,13 +256,13 @@ async fn ssl_handshake( } async fn handle_client( - mut ctx: RequestMonitoring, + ctx: RequestMonitoring, dest_suffix: Arc, tls_config: Arc, tls_server_end_point: TlsServerEndPoint, stream: impl AsyncRead + AsyncWrite + Unpin, ) -> anyhow::Result<()> { - let mut tls_stream = ssl_handshake(&mut ctx, stream, tls_config, tls_server_end_point).await?; + let mut tls_stream = ssl_handshake(&ctx, stream, tls_config, tls_server_end_point).await?; // Cut off first part of the SNI domain // We receive required destination details in the format of @@ -286,7 +287,10 @@ async fn handle_client( // Starting from here we only proxy the client's traffic. 
info!("performing the proxy pass..."); - let _ = copy_bidirectional_client_compute(&mut tls_stream, &mut client).await?; - Ok(()) + match copy_bidirectional_client_compute(&mut tls_stream, &mut client).await { + Ok(_) => Ok(()), + Err(ErrorSource::Client(err)) => Err(err).context("client"), + Err(ErrorSource::Compute(err)) => Err(err).context("compute"), + } } diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index dffebf5580..d83a1f3bcf 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -5,6 +5,7 @@ use aws_config::meta::region::RegionProviderChain; use aws_config::profile::ProfileFileCredentialsProvider; use aws_config::provider_config::ProviderConfig; use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; +use aws_config::Region; use futures::future::Either; use proxy::auth; use proxy::auth::backend::AuthRateLimiter; @@ -22,7 +23,9 @@ use proxy::http; use proxy::http::health_server::AppMetrics; use proxy::metrics::Metrics; use proxy::rate_limiter::EndpointRateLimiter; +use proxy::rate_limiter::LeakyBucketConfig; use proxy::rate_limiter::RateBucketInfo; +use proxy::rate_limiter::WakeComputeRateLimiter; use proxy::redis::cancellation_publisher::RedisPublisherClient; use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; use proxy::redis::elasticache; @@ -35,6 +38,7 @@ use proxy::usage_metrics; use anyhow::bail; use proxy::config::{self, ProxyConfig}; use proxy::serverless; +use remote_storage::RemoteStorageConfig; use std::net::SocketAddr; use std::pin::pin; use std::sync::Arc; @@ -169,12 +173,12 @@ struct ProxyCliArgs { /// cache for `role_secret` (use `size=0` to disable) #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] role_secret_cache: String, - /// disable ip check for http requests. If it is too time consuming, it could be turned off. 
- #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] - disable_ip_check_for_http: bool, /// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections) #[clap(long)] redis_notifications: Option, + /// what from the available authentications type to use for the regional redis we have. Supported are "irsa" and "plain". + #[clap(long, default_value = "irsa")] + redis_auth_type: String, /// redis host for streaming connections (might be different from the notifications host) #[clap(long)] redis_host: Option, @@ -205,8 +209,8 @@ struct ProxyCliArgs { /// remote storage configuration for backup metric collection /// Encoded as toml (same format as pageservers), eg /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}` - #[clap(long, default_value = "{}")] - metric_backup_collection_remote_storage: String, + #[clap(long, value_parser = remote_storage_from_toml)] + metric_backup_collection_remote_storage: Option, /// chunk size for backup metric collection /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression. 
#[clap(long, default_value = "4194304")] @@ -284,9 +288,10 @@ async fn main() -> anyhow::Result<()> { let config = build_config(&args)?; info!("Authentication backend: {}", config.auth_backend); - info!("Using region: {}", config.aws_region); + info!("Using region: {}", args.aws_region); - let region_provider = RegionProviderChain::default_provider().or_else(&*config.aws_region); // Replace with your Redis region if needed + let region_provider = + RegionProviderChain::default_provider().or_else(Region::new(args.aws_region.clone())); let provider_conf = ProviderConfig::without_region().with_region(region_provider.region().await); let aws_credentials_provider = { @@ -312,30 +317,44 @@ async fn main() -> anyhow::Result<()> { }; let elasticache_credentials_provider = Arc::new(elasticache::CredentialsProvider::new( elasticache::AWSIRSAConfig::new( - config.aws_region.clone(), + args.aws_region.clone(), args.redis_cluster_name, args.redis_user_id, ), aws_credentials_provider, )); - let regional_redis_client = match (args.redis_host, args.redis_port) { - (Some(host), Some(port)) => Some( - ConnectionWithCredentialsProvider::new_with_credentials_provider( - host, - port, - elasticache_credentials_provider.clone(), + let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) { + ("plain", redis_url) => match redis_url { + None => { + bail!("plain auth requires redis_notifications to be set"); + } + Some(url) => Some( + ConnectionWithCredentialsProvider::new_with_static_credentials(url.to_string()), ), - ), - (None, None) => { - warn!("Redis events from console are disabled"); - None - } + }, + ("irsa", _) => match (&args.redis_host, args.redis_port) { + (Some(host), Some(port)) => Some( + ConnectionWithCredentialsProvider::new_with_credentials_provider( + host.to_string(), + port, + elasticache_credentials_provider.clone(), + ), + ), + (None, None) => { + warn!("irsa auth requires redis-host and redis-port to be set, continuing without 
regional_redis_client"); + None + } + _ => { + bail!("redis-host and redis-port must be specified together"); + } + }, _ => { - bail!("redis-host and redis-port must be specified together"); + bail!("unknown auth type given"); } }; + let redis_notifications_client = if let Some(url) = args.redis_notifications { - Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url)) + Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url.to_string())) } else { regional_redis_client.clone() }; @@ -356,11 +375,14 @@ async fn main() -> anyhow::Result<()> { let cancel_map = CancelMap::default(); + let redis_rps_limit = Vec::leak(args.redis_rps_limit.clone()); + RateBucketInfo::validate(redis_rps_limit)?; + let redis_publisher = match ®ional_redis_client { Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new( redis_publisher.clone(), args.region.clone(), - &config.redis_rps_limit, + redis_rps_limit, )?))), None => None, }; @@ -372,9 +394,24 @@ async fn main() -> anyhow::Result<()> { proxy::metrics::CancellationSource::FromClient, )); - let mut endpoint_rps_limit = args.endpoint_rps_limit.clone(); - RateBucketInfo::validate(&mut endpoint_rps_limit)?; - let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(endpoint_rps_limit)); + // bit of a hack - find the min rps and max rps supported and turn it into + // leaky bucket config instead + let max = args + .endpoint_rps_limit + .iter() + .map(|x| x.rps()) + .max_by(f64::total_cmp) + .unwrap_or(EndpointRateLimiter::DEFAULT.max); + let rps = args + .endpoint_rps_limit + .iter() + .map(|x| x.rps()) + .min_by(f64::total_cmp) + .unwrap_or(EndpointRateLimiter::DEFAULT.rps); + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( + LeakyBucketConfig { rps, max }, + 64, + )); // client facing tasks. 
these will exit on error or on cancellation // cancellation returns Ok(()) @@ -511,9 +548,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { } let backup_metric_collection_config = config::MetricBackupCollectionConfig { interval: args.metric_backup_collection_interval, - remote_storage_config: remote_storage_from_toml( - &args.metric_backup_collection_remote_storage, - )?, + remote_storage_config: args.metric_backup_collection_remote_storage.clone(), chunk_size: args.metric_backup_collection_chunk_size, }; @@ -578,7 +613,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); RateBucketInfo::validate(&mut wake_compute_rps_limit)?; let wake_compute_endpoint_rate_limiter = - Arc::new(EndpointRateLimiter::new(wake_compute_rps_limit)); + Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); let api = console::provider::neon::Api::new( endpoint, caches, @@ -623,7 +658,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { )?; let http_config = HttpConfig { - request_timeout: args.sql_over_http.sql_over_http_timeout, + accept_websockets: true, pool_options: GlobalConnPoolOptions { max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint, gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch, @@ -643,9 +678,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet, }; - let mut redis_rps_limit = args.redis_rps_limit.clone(); - RateBucketInfo::validate(&mut redis_rps_limit)?; - let config = Box::leak(Box::new(ProxyConfig { tls_config, auth_backend, @@ -654,11 +686,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { http_config, authentication_config, require_client_ip: args.require_client_ip, - disable_ip_check_for_http: args.disable_ip_check_for_http, - redis_rps_limit, 
handshake_timeout: args.handshake_timeout, region: args.region.clone(), - aws_region: args.aws_region.clone(), wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?, connect_compute_locks, connect_to_compute_retry_config: config::RetryConfig::parse( diff --git a/proxy/src/cache/common.rs b/proxy/src/cache/common.rs index bc1c37512b..82c78e3eb2 100644 --- a/proxy/src/cache/common.rs +++ b/proxy/src/cache/common.rs @@ -24,7 +24,7 @@ impl Cache for &C { type LookupInfo = C::LookupInfo; fn invalidate(&self, info: &Self::LookupInfo) { - C::invalidate(self, info) + C::invalidate(self, info); } } @@ -53,6 +53,13 @@ impl Cached { ) } + pub fn map(self, f: impl FnOnce(V) -> U) -> Cached { + Cached { + token: self.token, + value: f(self.value), + } + } + /// Drop this entry from a cache if it's still there. pub fn invalidate(self) -> V { if let Some((cache, info)) = &self.token { diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 4bc10a6020..8c851790c2 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -68,7 +68,7 @@ impl EndpointsCache { ready: AtomicBool::new(false), } } - pub async fn is_valid(&self, ctx: &mut RequestMonitoring, endpoint: &EndpointId) -> bool { + pub async fn is_valid(&self, ctx: &RequestMonitoring, endpoint: &EndpointId) -> bool { if !self.ready.load(Ordering::Acquire) { return true; } diff --git a/proxy/src/cache/timed_lru.rs b/proxy/src/cache/timed_lru.rs index 3b21381bb9..07fad56643 100644 --- a/proxy/src/cache/timed_lru.rs +++ b/proxy/src/cache/timed_lru.rs @@ -58,13 +58,15 @@ impl Cache for TimedLru { type LookupInfo = LookupInfo; fn invalidate(&self, info: &Self::LookupInfo) { - self.invalidate_raw(info) + self.invalidate_raw(info); } } struct Entry { created_at: Instant, expires_at: Instant, + ttl: Duration, + update_ttl_on_retrieval: bool, value: T, } @@ -122,7 +124,6 @@ impl TimedLru { Q: Hash + Eq + ?Sized, { let now = Instant::now(); - let deadline = 
now.checked_add(self.ttl).expect("time overflow"); // Do costly things before taking the lock. let mut cache = self.cache.lock(); @@ -142,7 +143,8 @@ impl TimedLru { let (created_at, expires_at) = (entry.created_at, entry.expires_at); // Update the deadline and the entry's position in the LRU list. - if self.update_ttl_on_retrieval { + let deadline = now.checked_add(raw_entry.get().ttl).expect("time overflow"); + if raw_entry.get().update_ttl_on_retrieval { raw_entry.get_mut().expires_at = deadline; } raw_entry.to_back(); @@ -162,12 +164,27 @@ impl TimedLru { /// existed, return the previous value and its creation timestamp. #[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)] fn insert_raw(&self, key: K, value: V) -> (Instant, Option) { + self.insert_raw_ttl(key, value, self.ttl, self.update_ttl_on_retrieval) + } + + /// Insert an entry to the cache. If an entry with the same key already + /// existed, return the previous value and its creation timestamp. + #[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)] + fn insert_raw_ttl( + &self, + key: K, + value: V, + ttl: Duration, + update: bool, + ) -> (Instant, Option) { let created_at = Instant::now(); - let expires_at = created_at.checked_add(self.ttl).expect("time overflow"); + let expires_at = created_at.checked_add(ttl).expect("time overflow"); let entry = Entry { created_at, expires_at, + ttl, + update_ttl_on_retrieval: update, value, }; @@ -190,6 +207,21 @@ impl TimedLru { } impl TimedLru { + pub fn insert_ttl(&self, key: K, value: V, ttl: Duration) { + self.insert_raw_ttl(key, value, ttl, false); + } + + pub fn insert_unit(&self, key: K, value: V) -> (Option, Cached<&Self, ()>) { + let (created_at, old) = self.insert_raw(key.clone(), value); + + let cached = Cached { + token: Some((self, LookupInfo { created_at, key })), + value: (), + }; + + (old, cached) + } + pub fn insert(&self, key: K, value: V) -> (Option, Cached<&Self>) { let (created_at, old) = 
self.insert_raw(key.clone(), value.clone()); diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index feb09d5638..c071a59d58 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -44,11 +44,10 @@ pub enum ConnectionError { impl UserFacingError for ConnectionError { fn to_string_client(&self) -> String { - use ConnectionError::*; match self { // This helps us drop irrelevant library-specific prefixes. // TODO: propagate severity level and other parameters. - Postgres(err) => match err.as_db_error() { + ConnectionError::Postgres(err) => match err.as_db_error() { Some(err) => { let msg = err.message(); @@ -62,8 +61,8 @@ impl UserFacingError for ConnectionError { } None => err.to_string(), }, - WakeComputeError(err) => err.to_string_client(), - TooManyConnectionAttempts(_) => { + ConnectionError::WakeComputeError(err) => err.to_string_client(), + ConnectionError::TooManyConnectionAttempts(_) => { "Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned() } _ => COULD_NOT_CONNECT.to_owned(), @@ -276,12 +275,12 @@ impl ConnCfg { /// Connect to a corresponding compute node. 
pub async fn connect( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, allow_self_signed_compute: bool, aux: MetricsAuxInfo, timeout: Duration, ) -> Result { - let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let (socket_addr, stream, host) = self.connect_raw(timeout).await?; drop(pause); @@ -304,14 +303,14 @@ impl ConnCfg { )?; // connect_raw() will not use TLS if sslmode is "disable" - let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let (client, connection) = self.0.connect_raw(stream, tls).await?; drop(pause); - tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id())); + tracing::Span::current().record("pid", tracing::field::display(client.get_process_id())); let stream = connection.stream.into_inner(); info!( - cold_start_info = ctx.cold_start_info.as_str(), + cold_start_info = ctx.cold_start_info().as_str(), "connected to compute node at {host} ({socket_addr}) sslmode={:?}", self.0.get_ssl_mode() ); @@ -330,7 +329,7 @@ impl ConnCfg { params, cancel_closure, aux, - _guage: Metrics::get().proxy.db_connections.guard(ctx.protocol), + _guage: Metrics::get().proxy.db_connections.guard(ctx.protocol()), }; Ok(connection) @@ -366,16 +365,16 @@ static TLS_ROOTS: OnceCell> = OnceCell::new(); struct AcceptEverythingVerifier; impl ServerCertVerifier for AcceptEverythingVerifier { fn supported_verify_schemes(&self) -> Vec { - use rustls::SignatureScheme::*; + use rustls::SignatureScheme; // The schemes for which `SignatureScheme::supported_in_tls13` returns true. 
vec![ - ECDSA_NISTP521_SHA512, - ECDSA_NISTP384_SHA384, - ECDSA_NISTP256_SHA256, - RSA_PSS_SHA512, - RSA_PSS_SHA384, - RSA_PSS_SHA256, - ED25519, + SignatureScheme::ECDSA_NISTP521_SHA512, + SignatureScheme::ECDSA_NISTP384_SHA384, + SignatureScheme::ECDSA_NISTP256_SHA256, + SignatureScheme::RSA_PSS_SHA512, + SignatureScheme::RSA_PSS_SHA384, + SignatureScheme::RSA_PSS_SHA256, + SignatureScheme::ED25519, ] } fn verify_server_cert( diff --git a/proxy/src/config.rs b/proxy/src/config.rs index f4707a33aa..a280aa88ce 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -31,11 +31,8 @@ pub struct ProxyConfig { pub http_config: HttpConfig, pub authentication_config: AuthenticationConfig, pub require_client_ip: bool, - pub disable_ip_check_for_http: bool, - pub redis_rps_limit: Vec, pub region: String, pub handshake_timeout: Duration, - pub aws_region: String, pub wake_compute_retry_config: RetryConfig, pub connect_compute_locks: ApiLocks, pub connect_to_compute_retry_config: RetryConfig, @@ -55,7 +52,7 @@ pub struct TlsConfig { } pub struct HttpConfig { - pub request_timeout: tokio::time::Duration, + pub accept_websockets: bool, pub pool_options: GlobalConnPoolOptions, pub cancel_set: CancelSet, pub client_conn_threshold: u64, @@ -75,6 +72,9 @@ impl TlsConfig { } } +/// +pub const PG_ALPN_PROTOCOL: &[u8] = b"postgresql"; + /// Configure TLS for the main endpoint. 
pub fn configure_tls( key_path: &str, @@ -111,16 +111,17 @@ pub fn configure_tls( let cert_resolver = Arc::new(cert_resolver); // allow TLS 1.2 to be compatible with older client libraries - let config = rustls::ServerConfig::builder_with_protocol_versions(&[ + let mut config = rustls::ServerConfig::builder_with_protocol_versions(&[ &rustls::version::TLS13, &rustls::version::TLS12, ]) .with_no_client_auth() - .with_cert_resolver(cert_resolver.clone()) - .into(); + .with_cert_resolver(cert_resolver.clone()); + + config.alpn_protocols = vec![PG_ALPN_PROTOCOL.to_vec()]; Ok(TlsConfig { - config, + config: Arc::new(config), common_names, cert_resolver, }) @@ -155,7 +156,7 @@ pub enum TlsServerEndPoint { } impl TlsServerEndPoint { - pub fn new(cert: &CertificateDer) -> anyhow::Result { + pub fn new(cert: &CertificateDer<'_>) -> anyhow::Result { let sha256_oids = [ // I'm explicitly not adding MD5 or SHA1 here... They're bad. oid_registry::OID_SIG_ECDSA_WITH_SHA256, @@ -278,7 +279,7 @@ impl CertResolver { impl rustls::server::ResolvesServerCert for CertResolver { fn resolve( &self, - client_hello: rustls::server::ClientHello, + client_hello: rustls::server::ClientHello<'_>, ) -> Option> { self.resolve(client_hello.server_name()).map(|x| x.0) } @@ -399,15 +400,11 @@ impl FromStr for EndpointCacheConfig { #[derive(Debug)] pub struct MetricBackupCollectionConfig { pub interval: Duration, - pub remote_storage_config: OptRemoteStorageConfig, + pub remote_storage_config: Option, pub chunk_size: usize, } -/// Hack to avoid clap being smarter. If you don't use this type alias, clap assumes more about the optional state and you get -/// runtime type errors from the value parser we use. -pub type OptRemoteStorageConfig = Option; - -pub fn remote_storage_from_toml(s: &str) -> anyhow::Result { +pub fn remote_storage_from_toml(s: &str) -> anyhow::Result { RemoteStorageConfig::from_toml(&s.parse()?) 
} @@ -563,7 +560,7 @@ impl RetryConfig { match key { "num_retries" => num_retries = Some(value.parse()?), "base_retry_wait_duration" => { - base_retry_wait_duration = Some(humantime::parse_duration(value)?) + base_retry_wait_duration = Some(humantime::parse_duration(value)?); } "retry_wait_exponent_base" => retry_wait_exponent_base = Some(value.parse()?), unknown => bail!("unknown key: {unknown}"), diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index 3b7d681a41..ac66e116d0 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -5,11 +5,11 @@ use std::fmt::{self, Display}; use crate::auth::IpPattern; use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}; -use crate::proxy::retry::ShouldRetry; +use crate::proxy::retry::CouldRetry; /// Generic error response with human-readable description. /// Note that we can't always present it to user as is. -#[derive(Debug, Deserialize)] +#[derive(Debug, Deserialize, Clone)] pub struct ConsoleError { pub error: Box, #[serde(skip)] @@ -22,16 +22,15 @@ impl ConsoleError { self.status .as_ref() .and_then(|s| s.details.error_info.as_ref()) - .map(|e| e.reason) - .unwrap_or(Reason::Unknown) + .map_or(Reason::Unknown, |e| e.reason) } + pub fn get_user_facing_message(&self) -> String { use super::provider::errors::REQUEST_FAILED; self.status .as_ref() .and_then(|s| s.details.user_facing_message.as_ref()) - .map(|m| m.message.clone().into()) - .unwrap_or_else(|| { + .map_or_else(|| { // Ask @neondatabase/control-plane for review before adding more. 
match self.http_status_code { http::StatusCode::NOT_FOUND => { @@ -48,80 +47,59 @@ impl ConsoleError { } _ => REQUEST_FAILED.to_owned(), } - }) + }, |m| m.message.clone().into()) } } impl Display for ConsoleError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let msg = self + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let msg: &str = self .status .as_ref() .and_then(|s| s.details.user_facing_message.as_ref()) - .map(|m| m.message.as_ref()) - .unwrap_or_else(|| &self.error); - write!(f, "{}", msg) + .map_or_else(|| self.error.as_ref(), |m| m.message.as_ref()); + write!(f, "{msg}") } } -impl ShouldRetry for ConsoleError { +impl CouldRetry for ConsoleError { fn could_retry(&self) -> bool { - if self.status.is_none() || self.status.as_ref().unwrap().details.retry_info.is_none() { - // retry some temporary failures because the compute was in a bad state - // (bad request can be returned when the endpoint was in transition) - return match &self { - ConsoleError { - http_status_code: http::StatusCode::BAD_REQUEST, - .. - } => true, - // don't retry when quotas are exceeded - ConsoleError { - http_status_code: http::StatusCode::UNPROCESSABLE_ENTITY, - ref error, - .. - } => !error.contains("compute time quota of non-primary branches is exceeded"), - // locked can be returned when the endpoint was in transition - // or when quotas are exceeded. don't retry when quotas are exceeded - ConsoleError { - http_status_code: http::StatusCode::LOCKED, - ref error, - .. - } => { - !error.contains("quota exceeded") - && !error.contains("the limit for current plan reached") - } - _ => false, - }; + // If the error message does not have a status, + // the error is unknown and probably should not retry automatically + let Some(status) = &self.status else { + return false; + }; + + // retry if the retry info is set. 
+ if status.details.retry_info.is_some() { + return true; } - // retry if the response has a retry delay - if let Some(retry_info) = self - .status - .as_ref() - .and_then(|s| s.details.retry_info.as_ref()) - { - retry_info.retry_delay_ms > 0 - } else { - false - } + // if no retry info set, attempt to use the error code to guess the retry state. + let reason = status + .details + .error_info + .map_or(Reason::Unknown, |e| e.reason); + + reason.can_retry() } } -#[derive(Debug, Deserialize)] +#[derive(Debug, Deserialize, Clone)] pub struct Status { pub code: Box, pub message: Box, pub details: Details, } -#[derive(Debug, Deserialize)] +#[derive(Debug, Deserialize, Clone)] pub struct Details { pub error_info: Option, pub retry_info: Option, pub user_facing_message: Option, } -#[derive(Debug, Deserialize)] +#[derive(Copy, Clone, Debug, Deserialize)] pub struct ErrorInfo { pub reason: Reason, // Schema could also have `metadata` field, but it's not structured. Skip it for now. @@ -129,30 +107,59 @@ pub struct ErrorInfo { #[derive(Clone, Copy, Debug, Deserialize, Default)] pub enum Reason { + /// RoleProtected indicates that the role is protected and the attempted operation is not permitted on protected roles. #[serde(rename = "ROLE_PROTECTED")] RoleProtected, + /// ResourceNotFound indicates that a resource (project, endpoint, branch, etc.) wasn't found, + /// usually due to the provided ID not being correct or because the subject doesn't have enough permissions to + /// access the requested resource. + /// Prefer a more specific reason if possible, e.g., ProjectNotFound, EndpointNotFound, etc. #[serde(rename = "RESOURCE_NOT_FOUND")] ResourceNotFound, + /// ProjectNotFound indicates that the project wasn't found, usually due to the provided ID not being correct, + /// or that the subject doesn't have enough permissions to access the requested project. 
#[serde(rename = "PROJECT_NOT_FOUND")] ProjectNotFound, + /// EndpointNotFound indicates that the endpoint wasn't found, usually due to the provided ID not being correct, + /// or that the subject doesn't have enough permissions to access the requested endpoint. #[serde(rename = "ENDPOINT_NOT_FOUND")] EndpointNotFound, + /// BranchNotFound indicates that the branch wasn't found, usually due to the provided ID not being correct, + /// or that the subject doesn't have enough permissions to access the requested branch. #[serde(rename = "BRANCH_NOT_FOUND")] BranchNotFound, + /// RateLimitExceeded indicates that the rate limit for the operation has been exceeded. #[serde(rename = "RATE_LIMIT_EXCEEDED")] RateLimitExceeded, + /// NonDefaultBranchComputeTimeExceeded indicates that the compute time quota of non-default branches has been + /// exceeded. #[serde(rename = "NON_PRIMARY_BRANCH_COMPUTE_TIME_EXCEEDED")] - NonPrimaryBranchComputeTimeExceeded, + NonDefaultBranchComputeTimeExceeded, + /// ActiveTimeQuotaExceeded indicates that the active time quota was exceeded. #[serde(rename = "ACTIVE_TIME_QUOTA_EXCEEDED")] ActiveTimeQuotaExceeded, + /// ComputeTimeQuotaExceeded indicates that the compute time quota was exceeded. #[serde(rename = "COMPUTE_TIME_QUOTA_EXCEEDED")] ComputeTimeQuotaExceeded, + /// WrittenDataQuotaExceeded indicates that the written data quota was exceeded. #[serde(rename = "WRITTEN_DATA_QUOTA_EXCEEDED")] WrittenDataQuotaExceeded, + /// DataTransferQuotaExceeded indicates that the data transfer quota was exceeded. #[serde(rename = "DATA_TRANSFER_QUOTA_EXCEEDED")] DataTransferQuotaExceeded, + /// LogicalSizeQuotaExceeded indicates that the logical size quota was exceeded. #[serde(rename = "LOGICAL_SIZE_QUOTA_EXCEEDED")] LogicalSizeQuotaExceeded, + /// RunningOperations indicates that the project already has some running operations + /// and scheduling of new ones is prohibited. 
+ #[serde(rename = "RUNNING_OPERATIONS")] + RunningOperations, + /// ConcurrencyLimitReached indicates that the concurrency limit for an action was reached. + #[serde(rename = "CONCURRENCY_LIMIT_REACHED")] + ConcurrencyLimitReached, + /// LockAlreadyTaken indicates that the we attempted to take a lock that was already taken. + #[serde(rename = "LOCK_ALREADY_TAKEN")] + LockAlreadyTaken, #[default] #[serde(other)] Unknown, @@ -168,14 +175,42 @@ impl Reason { | Reason::BranchNotFound ) } + + pub fn can_retry(&self) -> bool { + match self { + // do not retry role protected errors + // not a transitive error + Reason::RoleProtected => false, + // on retry, it will still not be found + Reason::ResourceNotFound + | Reason::ProjectNotFound + | Reason::EndpointNotFound + | Reason::BranchNotFound => false, + // we were asked to go away + Reason::RateLimitExceeded + | Reason::NonDefaultBranchComputeTimeExceeded + | Reason::ActiveTimeQuotaExceeded + | Reason::ComputeTimeQuotaExceeded + | Reason::WrittenDataQuotaExceeded + | Reason::DataTransferQuotaExceeded + | Reason::LogicalSizeQuotaExceeded => false, + // transitive error. control plane is currently busy + // but might be ready soon + Reason::RunningOperations + | Reason::ConcurrencyLimitReached + | Reason::LockAlreadyTaken => true, + // unknown error. better not retry it. + Reason::Unknown => false, + } + } } -#[derive(Debug, Deserialize)] +#[derive(Copy, Clone, Debug, Deserialize)] pub struct RetryInfo { pub retry_delay_ms: u64, } -#[derive(Debug, Deserialize)] +#[derive(Debug, Deserialize, Clone)] pub struct UserFacingMessage { pub message: Box, } @@ -249,7 +284,7 @@ pub struct DatabaseInfo { // Manually implement debug to omit sensitive info. 
impl fmt::Debug for DatabaseInfo { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("DatabaseInfo") .field("host", &self.host) .field("port", &self.port) @@ -336,7 +371,7 @@ mod tests { } } }); - let _: KickSession = serde_json::from_str(&json.to_string())?; + let _: KickSession<'_> = serde_json::from_str(&json.to_string())?; Ok(()) } diff --git a/proxy/src/console/mgmt.rs b/proxy/src/console/mgmt.rs index c7a2d467c0..82d5033aab 100644 --- a/proxy/src/console/mgmt.rs +++ b/proxy/src/console/mgmt.rs @@ -6,8 +6,9 @@ use anyhow::Context; use once_cell::sync::Lazy; use postgres_backend::{AuthType, PostgresBackend, PostgresBackendTCP, QueryError}; use pq_proto::{BeMessage, SINGLE_COL_ROWDESC}; -use std::{convert::Infallible, future}; +use std::convert::Infallible; use tokio::net::{TcpListener, TcpStream}; +use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span, Instrument}; static CPLANE_WAITERS: Lazy> = Lazy::new(Default::default); @@ -67,7 +68,9 @@ pub async fn task_main(listener: TcpListener) -> anyhow::Result { async fn handle_connection(socket: TcpStream) -> Result<(), QueryError> { let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None)?; - pgbackend.run(&mut MgmtHandler, future::pending::<()>).await + pgbackend + .run(&mut MgmtHandler, &CancellationToken::new()) + .await } /// A message received by `mgmt` when a compute node is ready. 
@@ -90,7 +93,8 @@ impl postgres_backend::Handler for MgmtHandler { } fn try_process_query(pgb: &mut PostgresBackendTCP, query: &str) -> Result<(), QueryError> { - let resp: KickSession = serde_json::from_str(query).context("Failed to parse query as json")?; + let resp: KickSession<'_> = + serde_json::from_str(query).context("Failed to parse query as json")?; let span = info_span!("event", session_id = resp.session_id); let _enter = span.enter(); diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 915c2ee7a6..cc2ee10062 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -2,7 +2,7 @@ pub mod mock; pub mod neon; -use super::messages::MetricsAuxInfo; +use super::messages::{ConsoleError, MetricsAuxInfo}; use crate::{ auth::{ backend::{ComputeCredentialKeys, ComputeUserInfo}, @@ -25,9 +25,9 @@ use tracing::info; pub mod errors { use crate::{ - console::messages::{self, ConsoleError}, - error::{io_error, ReportableError, UserFacingError}, - proxy::retry::ShouldRetry, + console::messages::{self, ConsoleError, Reason}, + error::{io_error, ErrorKind, ReportableError, UserFacingError}, + proxy::retry::CouldRetry, }; use thiserror::Error; @@ -51,21 +51,19 @@ pub mod errors { impl ApiError { /// Returns HTTP status code if it's the reason for failure. pub fn get_reason(&self) -> messages::Reason { - use ApiError::*; match self { - Console(e) => e.get_reason(), - _ => messages::Reason::Unknown, + ApiError::Console(e) => e.get_reason(), + ApiError::Transport(_) => messages::Reason::Unknown, } } } impl UserFacingError for ApiError { fn to_string_client(&self) -> String { - use ApiError::*; match self { // To minimize risks, only select errors are forwarded to users. 
- Console(c) => c.get_user_facing_message(), - _ => REQUEST_FAILED.to_owned(), + ApiError::Console(c) => c.get_user_facing_message(), + ApiError::Transport(_) => REQUEST_FAILED.to_owned(), } } } @@ -73,62 +71,59 @@ pub mod errors { impl ReportableError for ApiError { fn get_error_kind(&self) -> crate::error::ErrorKind { match self { - ApiError::Console(e) => { - use crate::error::ErrorKind::*; - match e.get_reason() { - crate::console::messages::Reason::RoleProtected => User, - crate::console::messages::Reason::ResourceNotFound => User, - crate::console::messages::Reason::ProjectNotFound => User, - crate::console::messages::Reason::EndpointNotFound => User, - crate::console::messages::Reason::BranchNotFound => User, - crate::console::messages::Reason::RateLimitExceeded => ServiceRateLimit, - crate::console::messages::Reason::NonPrimaryBranchComputeTimeExceeded => { - User + ApiError::Console(e) => match e.get_reason() { + Reason::RoleProtected => ErrorKind::User, + Reason::ResourceNotFound => ErrorKind::User, + Reason::ProjectNotFound => ErrorKind::User, + Reason::EndpointNotFound => ErrorKind::User, + Reason::BranchNotFound => ErrorKind::User, + Reason::RateLimitExceeded => ErrorKind::ServiceRateLimit, + Reason::NonDefaultBranchComputeTimeExceeded => ErrorKind::User, + Reason::ActiveTimeQuotaExceeded => ErrorKind::User, + Reason::ComputeTimeQuotaExceeded => ErrorKind::User, + Reason::WrittenDataQuotaExceeded => ErrorKind::User, + Reason::DataTransferQuotaExceeded => ErrorKind::User, + Reason::LogicalSizeQuotaExceeded => ErrorKind::User, + Reason::ConcurrencyLimitReached => ErrorKind::ControlPlane, + Reason::LockAlreadyTaken => ErrorKind::ControlPlane, + Reason::RunningOperations => ErrorKind::ControlPlane, + Reason::Unknown => match &e { + ConsoleError { + http_status_code: + http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE, + .. 
+ } => crate::error::ErrorKind::User, + ConsoleError { + http_status_code: http::StatusCode::UNPROCESSABLE_ENTITY, + error, + .. + } if error + .contains("compute time quota of non-primary branches is exceeded") => + { + crate::error::ErrorKind::User } - crate::console::messages::Reason::ActiveTimeQuotaExceeded => User, - crate::console::messages::Reason::ComputeTimeQuotaExceeded => User, - crate::console::messages::Reason::WrittenDataQuotaExceeded => User, - crate::console::messages::Reason::DataTransferQuotaExceeded => User, - crate::console::messages::Reason::LogicalSizeQuotaExceeded => User, - crate::console::messages::Reason::Unknown => match &e { - ConsoleError { - http_status_code: - http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE, - .. - } => crate::error::ErrorKind::User, - ConsoleError { - http_status_code: http::StatusCode::UNPROCESSABLE_ENTITY, - error, - .. - } if error.contains( - "compute time quota of non-primary branches is exceeded", - ) => - { - crate::error::ErrorKind::User - } - ConsoleError { - http_status_code: http::StatusCode::LOCKED, - error, - .. - } if error.contains("quota exceeded") - || error.contains("the limit for current plan reached") => - { - crate::error::ErrorKind::User - } - ConsoleError { - http_status_code: http::StatusCode::TOO_MANY_REQUESTS, - .. - } => crate::error::ErrorKind::ServiceRateLimit, - ConsoleError { .. } => crate::error::ErrorKind::ControlPlane, - }, - } - } + ConsoleError { + http_status_code: http::StatusCode::LOCKED, + error, + .. + } if error.contains("quota exceeded") + || error.contains("the limit for current plan reached") => + { + crate::error::ErrorKind::User + } + ConsoleError { + http_status_code: http::StatusCode::TOO_MANY_REQUESTS, + .. + } => crate::error::ErrorKind::ServiceRateLimit, + ConsoleError { .. 
} => crate::error::ErrorKind::ControlPlane, + }, + }, ApiError::Transport(_) => crate::error::ErrorKind::ControlPlane, } } } - impl ShouldRetry for ApiError { + impl CouldRetry for ApiError { fn could_retry(&self) -> bool { match self { // retry some transport errors @@ -169,12 +164,11 @@ pub mod errors { impl UserFacingError for GetAuthInfoError { fn to_string_client(&self) -> String { - use GetAuthInfoError::*; match self { // We absolutely should not leak any secrets! - BadSecret => REQUEST_FAILED.to_owned(), + Self::BadSecret => REQUEST_FAILED.to_owned(), // However, API might return a meaningful error. - ApiError(e) => e.to_string_client(), + Self::ApiError(e) => e.to_string_client(), } } } @@ -182,8 +176,8 @@ pub mod errors { impl ReportableError for GetAuthInfoError { fn get_error_kind(&self) -> crate::error::ErrorKind { match self { - GetAuthInfoError::BadSecret => crate::error::ErrorKind::ControlPlane, - GetAuthInfoError::ApiError(_) => crate::error::ErrorKind::ControlPlane, + Self::BadSecret => crate::error::ErrorKind::ControlPlane, + Self::ApiError(_) => crate::error::ErrorKind::ControlPlane, } } } @@ -212,17 +206,16 @@ pub mod errors { impl UserFacingError for WakeComputeError { fn to_string_client(&self) -> String { - use WakeComputeError::*; match self { // We shouldn't show user the address even if it's broken. // Besides, user is unlikely to care about this detail. - BadComputeAddress(_) => REQUEST_FAILED.to_owned(), + Self::BadComputeAddress(_) => REQUEST_FAILED.to_owned(), // However, API might return a meaningful error. - ApiError(e) => e.to_string_client(), + Self::ApiError(e) => e.to_string_client(), - TooManyConnections => self.to_string(), + Self::TooManyConnections => self.to_string(), - TooManyConnectionAttempts(_) => { + Self::TooManyConnectionAttempts(_) => { "Failed to acquire permit to connect to the database. 
Too many database connection attempts are currently ongoing.".to_owned() } } @@ -232,10 +225,21 @@ pub mod errors { impl ReportableError for WakeComputeError { fn get_error_kind(&self) -> crate::error::ErrorKind { match self { - WakeComputeError::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane, - WakeComputeError::ApiError(e) => e.get_error_kind(), - WakeComputeError::TooManyConnections => crate::error::ErrorKind::RateLimit, - WakeComputeError::TooManyConnectionAttempts(e) => e.get_error_kind(), + Self::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane, + Self::ApiError(e) => e.get_error_kind(), + Self::TooManyConnections => crate::error::ErrorKind::RateLimit, + Self::TooManyConnectionAttempts(e) => e.get_error_kind(), + } + } + } + + impl CouldRetry for WakeComputeError { + fn could_retry(&self) -> bool { + match self { + Self::BadComputeAddress(_) => false, + Self::ApiError(e) => e.could_retry(), + Self::TooManyConnections => false, + Self::TooManyConnectionAttempts(_) => false, } } } @@ -280,7 +284,7 @@ pub struct NodeInfo { impl NodeInfo { pub async fn connect( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, timeout: Duration, ) -> Result { self.config @@ -305,8 +309,8 @@ impl NodeInfo { } } -pub type NodeInfoCache = TimedLru; -pub type CachedNodeInfo = Cached<&'static NodeInfoCache>; +pub type NodeInfoCache = TimedLru>>; +pub type CachedNodeInfo = Cached<&'static NodeInfoCache, NodeInfo>; pub type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option>; pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc>>; @@ -318,20 +322,20 @@ pub(crate) trait Api { /// We still have to mock the scram to avoid leaking information that user doesn't exist. 
async fn get_role_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result; async fn get_allowed_ips_and_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError>; /// Wake up the compute node and return the corresponding connection info. async fn wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result; } @@ -351,47 +355,45 @@ pub enum ConsoleBackend { impl Api for ConsoleBackend { async fn get_role_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { - use ConsoleBackend::*; match self { - Console(api) => api.get_role_secret(ctx, user_info).await, + Self::Console(api) => api.get_role_secret(ctx, user_info).await, #[cfg(any(test, feature = "testing"))] - Postgres(api) => api.get_role_secret(ctx, user_info).await, + Self::Postgres(api) => api.get_role_secret(ctx, user_info).await, #[cfg(test)] - Test(_) => unreachable!("this function should never be called in the test backend"), + Self::Test(_) => { + unreachable!("this function should never be called in the test backend") + } } } async fn get_allowed_ips_and_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError> { - use ConsoleBackend::*; match self { - Console(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, + Self::Console(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, #[cfg(any(test, feature = "testing"))] - Postgres(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, + Self::Postgres(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, #[cfg(test)] - Test(api) => api.get_allowed_ips_and_secret(), + Self::Test(api) => api.get_allowed_ips_and_secret(), } } async fn 
wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { - use ConsoleBackend::*; - match self { - Console(api) => api.wake_compute(ctx, user_info).await, + Self::Console(api) => api.wake_compute(ctx, user_info).await, #[cfg(any(test, feature = "testing"))] - Postgres(api) => api.wake_compute(ctx, user_info).await, + Self::Postgres(api) => api.wake_compute(ctx, user_info).await, #[cfg(test)] - Test(api) => api.wake_compute(), + Self::Test(api) => api.wake_compute(), } } } @@ -537,7 +539,7 @@ impl WakeComputePermit { !self.permit.is_disabled() } pub fn release(self, outcome: Outcome) { - self.permit.release(outcome) + self.permit.release(outcome); } pub fn release_result(self, res: Result) -> Result { match res { diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index cfe491f2aa..2093da7562 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -158,7 +158,7 @@ impl super::Api for Api { #[tracing::instrument(skip_all)] async fn get_role_secret( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { Ok(CachedRoleSecret::new_uncached( @@ -168,7 +168,7 @@ impl super::Api for Api { async fn get_allowed_ips_and_secret( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { Ok(( @@ -182,7 +182,7 @@ impl super::Api for Api { #[tracing::instrument(skip_all)] async fn wake_compute( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, _user_info: &ComputeUserInfo, ) -> Result { self.do_wake_compute().map_ok(Cached::new_uncached).await diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 41bd2f4956..7eda238b66 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -9,24 +9,24 @@ use super::{ use 
crate::{ auth::backend::ComputeUserInfo, compute, - console::messages::ColdStartInfo, + console::messages::{ColdStartInfo, Reason}, http, metrics::{CacheOutcome, Metrics}, - rate_limiter::EndpointRateLimiter, + rate_limiter::WakeComputeRateLimiter, scram, EndpointCacheKey, }; use crate::{cache::Cached, context::RequestMonitoring}; use futures::TryFutureExt; -use std::sync::Arc; +use std::{sync::Arc, time::Duration}; use tokio::time::Instant; use tokio_postgres::config::SslMode; -use tracing::{error, info, info_span, warn, Instrument}; +use tracing::{debug, error, info, info_span, warn, Instrument}; pub struct Api { endpoint: http::Endpoint, pub caches: &'static ApiCaches, pub locks: &'static ApiLocks, - pub wake_compute_endpoint_rate_limiter: Arc, + pub wake_compute_endpoint_rate_limiter: Arc, jwt: String, } @@ -36,7 +36,7 @@ impl Api { endpoint: http::Endpoint, caches: &'static ApiCaches, locks: &'static ApiLocks, - wake_compute_endpoint_rate_limiter: Arc, + wake_compute_endpoint_rate_limiter: Arc, ) -> Self { let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") { Ok(v) => v, @@ -57,7 +57,7 @@ impl Api { async fn do_get_auth_info( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { if !self @@ -69,7 +69,7 @@ impl Api { info!("endpoint is not valid, skipping the request"); return Ok(AuthInfo::default()); } - let request_id = ctx.session_id.to_string(); + let request_id = ctx.session_id().to_string(); let application_name = ctx.console_application_name(); async { let request = self @@ -77,7 +77,7 @@ impl Api { .get("proxy_get_role_secret") .header("X-Request-ID", &request_id) .header("Authorization", format!("Bearer {}", &self.jwt)) - .query(&[("session_id", ctx.session_id)]) + .query(&[("session_id", ctx.session_id())]) .query(&[ ("application_name", application_name.as_str()), ("project", user_info.endpoint.as_str()), @@ -87,7 +87,7 @@ impl Api { info!(url = request.url().as_str(), 
"sending http request"); let start = Instant::now(); - let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Cplane); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); let response = self.endpoint.execute(request).await?; drop(pause); info!(duration = ?start.elapsed(), "received http response"); @@ -130,10 +130,10 @@ impl Api { async fn do_wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { - let request_id = ctx.session_id.to_string(); + let request_id = ctx.session_id().to_string(); let application_name = ctx.console_application_name(); async { let mut request_builder = self @@ -141,7 +141,7 @@ impl Api { .get("proxy_wake_compute") .header("X-Request-ID", &request_id) .header("Authorization", format!("Bearer {}", &self.jwt)) - .query(&[("session_id", ctx.session_id)]) + .query(&[("session_id", ctx.session_id())]) .query(&[ ("application_name", application_name.as_str()), ("project", user_info.endpoint.as_str()), @@ -156,7 +156,7 @@ impl Api { info!(url = request.url().as_str(), "sending http request"); let start = Instant::now(); - let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Cplane); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); let response = self.endpoint.execute(request).await?; drop(pause); info!(duration = ?start.elapsed(), "received http response"); @@ -192,7 +192,7 @@ impl super::Api for Api { #[tracing::instrument(skip_all)] async fn get_role_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { let normalized_ep = &user_info.endpoint.normalize(); @@ -226,7 +226,7 @@ impl super::Api for Api { async fn get_allowed_ips_and_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { let normalized_ep = &user_info.endpoint.normalize(); @@ -268,31 +268,39 @@ 
impl super::Api for Api { #[tracing::instrument(skip_all)] async fn wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { let key = user_info.endpoint_cache_key(); + macro_rules! check_cache { + () => { + if let Some(cached) = self.caches.node_info.get(&key) { + let (cached, info) = cached.take_value(); + let info = info.map_err(|c| { + info!(key = &*key, "found cached wake_compute error"); + WakeComputeError::ApiError(ApiError::Console(*c)) + })?; + + debug!(key = &*key, "found cached compute node info"); + ctx.set_project(info.aux.clone()); + return Ok(cached.map(|()| info)); + } + }; + } + // Every time we do a wakeup http request, the compute node will stay up // for some time (highly depends on the console's scale-to-zero policy); // The connection info remains the same during that period of time, // which means that we might cache it to reduce the load and latency. - if let Some(cached) = self.caches.node_info.get(&key) { - info!(key = &*key, "found cached compute node info"); - ctx.set_project(cached.aux.clone()); - return Ok(cached); - } + check_cache!(); let permit = self.locks.get_permit(&key).await?; // after getting back a permit - it's possible the cache was filled // double check if permit.should_check_cache() { - if let Some(cached) = self.caches.node_info.get(&key) { - info!(key = &*key, "found cached compute node info"); - ctx.set_project(cached.aux.clone()); - return Ok(cached); - } + check_cache!(); } // check rate limit @@ -300,23 +308,56 @@ impl super::Api for Api { .wake_compute_endpoint_rate_limiter .check(user_info.endpoint.normalize_intern(), 1) { - info!(key = &*key, "found cached compute node info"); return Err(WakeComputeError::TooManyConnections); } - let mut node = permit.release_result(self.do_wake_compute(ctx, user_info).await)?; - ctx.set_project(node.aux.clone()); - let cold_start_info = node.aux.cold_start_info; - info!("woken up a compute node"); + let node = 
permit.release_result(self.do_wake_compute(ctx, user_info).await); + match node { + Ok(node) => { + ctx.set_project(node.aux.clone()); + debug!(key = &*key, "created a cache entry for woken compute node"); - // store the cached node as 'warm' - node.aux.cold_start_info = ColdStartInfo::WarmCached; - let (_, mut cached) = self.caches.node_info.insert(key.clone(), node); - cached.aux.cold_start_info = cold_start_info; + let mut stored_node = node.clone(); + // store the cached node as 'warm_cached' + stored_node.aux.cold_start_info = ColdStartInfo::WarmCached; - info!(key = &*key, "created a cache entry for compute node info"); + let (_, cached) = self.caches.node_info.insert_unit(key, Ok(stored_node)); - Ok(cached) + Ok(cached.map(|()| node)) + } + Err(err) => match err { + WakeComputeError::ApiError(ApiError::Console(err)) => { + let Some(status) = &err.status else { + return Err(WakeComputeError::ApiError(ApiError::Console(err))); + }; + + let reason = status + .details + .error_info + .map_or(Reason::Unknown, |x| x.reason); + + // if we can retry this error, do not cache it. + if reason.can_retry() { + return Err(WakeComputeError::ApiError(ApiError::Console(err))); + } + + // at this point, we should only have quota errors. 
+ debug!( + key = &*key, + "created a cache entry for the wake compute error" + ); + + self.caches.node_info.insert_ttl( + key, + Err(Box::new(err.clone())), + Duration::from_secs(30), + ); + + Err(WakeComputeError::ApiError(ApiError::Console(err))) + } + err => return Err(err), + }, + } } } diff --git a/proxy/src/context.rs b/proxy/src/context.rs index ff79ba8275..cafbdedc15 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -7,13 +7,14 @@ use smol_str::SmolStr; use std::net::IpAddr; use tokio::sync::mpsc; use tracing::{field::display, info, info_span, Span}; +use try_lock::TryLock; use uuid::Uuid; use crate::{ console::messages::{ColdStartInfo, MetricsAuxInfo}, error::ErrorKind, intern::{BranchIdInt, ProjectIdInt}, - metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol}, + metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting}, DbName, EndpointId, RoleName, }; @@ -28,7 +29,15 @@ pub static LOG_CHAN_DISCONNECT: OnceCell> /// /// This data should **not** be used for connection logic, only for observability and limiting purposes. /// All connection logic should instead use strongly typed state machines, not a bunch of Options. -pub struct RequestMonitoring { +pub struct RequestMonitoring( + /// To allow easier use of the ctx object, we have interior mutability. + /// I would typically use a RefCell but that would break the `Send` requirements + /// so we need something with thread-safety. `TryLock` is a cheap alternative + /// that offers similar semantics to a `RefCell` but with synchronisation. 
+ TryLock, +); + +struct RequestMonitoringInner { pub peer_addr: IpAddr, pub session_id: Uuid, pub protocol: Protocol, @@ -85,7 +94,7 @@ impl RequestMonitoring { role = tracing::field::Empty, ); - Self { + let inner = RequestMonitoringInner { peer_addr, session_id, protocol, @@ -110,7 +119,9 @@ impl RequestMonitoring { disconnect_sender: LOG_CHAN_DISCONNECT.get().and_then(|tx| tx.upgrade()), latency_timer: LatencyTimer::new(protocol), disconnect_timestamp: None, - } + }; + + Self(TryLock::new(inner)) } #[cfg(test)] @@ -119,48 +130,177 @@ impl RequestMonitoring { } pub fn console_application_name(&self) -> String { + let this = self.0.try_lock().expect("should not deadlock"); format!( "{}/{}", - self.application.as_deref().unwrap_or_default(), - self.protocol + this.application.as_deref().unwrap_or_default(), + this.protocol ) } - pub fn set_rejected(&mut self, rejected: bool) { - self.rejected = Some(rejected); + pub fn set_rejected(&self, rejected: bool) { + let mut this = self.0.try_lock().expect("should not deadlock"); + this.rejected = Some(rejected); } - pub fn set_cold_start_info(&mut self, info: ColdStartInfo) { + pub fn set_cold_start_info(&self, info: ColdStartInfo) { + self.0 + .try_lock() + .expect("should not deadlock") + .set_cold_start_info(info); + } + + pub fn set_db_options(&self, options: StartupMessageParams) { + let mut this = self.0.try_lock().expect("should not deadlock"); + this.set_application(options.get("application_name").map(SmolStr::from)); + if let Some(user) = options.get("user") { + this.set_user(user.into()); + } + if let Some(dbname) = options.get("database") { + this.set_dbname(dbname.into()); + } + + this.pg_options = Some(options); + } + + pub fn set_project(&self, x: MetricsAuxInfo) { + let mut this = self.0.try_lock().expect("should not deadlock"); + if this.endpoint_id.is_none() { + this.set_endpoint_id(x.endpoint_id.as_str().into()); + } + this.branch = Some(x.branch_id); + this.project = Some(x.project_id); + 
this.set_cold_start_info(x.cold_start_info); + } + + pub fn set_project_id(&self, project_id: ProjectIdInt) { + let mut this = self.0.try_lock().expect("should not deadlock"); + this.project = Some(project_id); + } + + pub fn set_endpoint_id(&self, endpoint_id: EndpointId) { + self.0 + .try_lock() + .expect("should not deadlock") + .set_endpoint_id(endpoint_id); + } + + pub fn set_dbname(&self, dbname: DbName) { + self.0 + .try_lock() + .expect("should not deadlock") + .set_dbname(dbname); + } + + pub fn set_user(&self, user: RoleName) { + self.0 + .try_lock() + .expect("should not deadlock") + .set_user(user); + } + + pub fn set_auth_method(&self, auth_method: AuthMethod) { + let mut this = self.0.try_lock().expect("should not deadlock"); + this.auth_method = Some(auth_method); + } + + pub fn has_private_peer_addr(&self) -> bool { + self.0 + .try_lock() + .expect("should not deadlock") + .has_private_peer_addr() + } + + pub fn set_error_kind(&self, kind: ErrorKind) { + let mut this = self.0.try_lock().expect("should not deadlock"); + // Do not record errors from the private address to metrics. 
+ if !this.has_private_peer_addr() { + Metrics::get().proxy.errors_total.inc(kind); + } + if let Some(ep) = &this.endpoint_id { + let metric = &Metrics::get().proxy.endpoints_affected_by_errors; + let label = metric.with_labels(kind); + metric.get_metric(label).measure(ep); + } + this.error_kind = Some(kind); + } + + pub fn set_success(&self) { + let mut this = self.0.try_lock().expect("should not deadlock"); + this.success = true; + } + + pub fn log_connect(&self) { + self.0 + .try_lock() + .expect("should not deadlock") + .log_connect(); + } + + pub fn protocol(&self) -> Protocol { + self.0.try_lock().expect("should not deadlock").protocol + } + + pub fn span(&self) -> Span { + self.0.try_lock().expect("should not deadlock").span.clone() + } + + pub fn session_id(&self) -> Uuid { + self.0.try_lock().expect("should not deadlock").session_id + } + + pub fn peer_addr(&self) -> IpAddr { + self.0.try_lock().expect("should not deadlock").peer_addr + } + + pub fn cold_start_info(&self) -> ColdStartInfo { + self.0 + .try_lock() + .expect("should not deadlock") + .cold_start_info + } + + pub fn latency_timer_pause(&self, waiting_for: Waiting) -> LatencyTimerPause<'_> { + LatencyTimerPause { + ctx: self, + start: tokio::time::Instant::now(), + waiting_for, + } + } + + pub fn success(&self) { + self.0 + .try_lock() + .expect("should not deadlock") + .latency_timer + .success(); + } +} + +pub struct LatencyTimerPause<'a> { + ctx: &'a RequestMonitoring, + start: tokio::time::Instant, + waiting_for: Waiting, +} + +impl Drop for LatencyTimerPause<'_> { + fn drop(&mut self) { + self.ctx + .0 + .try_lock() + .expect("should not deadlock") + .latency_timer + .unpause(self.start, self.waiting_for); + } +} + +impl RequestMonitoringInner { + fn set_cold_start_info(&mut self, info: ColdStartInfo) { self.cold_start_info = info; self.latency_timer.cold_start_info(info); } - pub fn set_db_options(&mut self, options: StartupMessageParams) { - 
self.set_application(options.get("application_name").map(SmolStr::from)); - if let Some(user) = options.get("user") { - self.set_user(user.into()); - } - if let Some(dbname) = options.get("database") { - self.set_dbname(dbname.into()); - } - - self.pg_options = Some(options); - } - - pub fn set_project(&mut self, x: MetricsAuxInfo) { - if self.endpoint_id.is_none() { - self.set_endpoint_id(x.endpoint_id.as_str().into()) - } - self.branch = Some(x.branch_id); - self.project = Some(x.project_id); - self.set_cold_start_info(x.cold_start_info); - } - - pub fn set_project_id(&mut self, project_id: ProjectIdInt) { - self.project = Some(project_id); - } - - pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) { + fn set_endpoint_id(&mut self, endpoint_id: EndpointId) { if self.endpoint_id.is_none() { self.span.record("ep", display(&endpoint_id)); let metric = &Metrics::get().proxy.connecting_endpoints; @@ -176,44 +316,23 @@ impl RequestMonitoring { } } - pub fn set_dbname(&mut self, dbname: DbName) { + fn set_dbname(&mut self, dbname: DbName) { self.dbname = Some(dbname); } - pub fn set_user(&mut self, user: RoleName) { + fn set_user(&mut self, user: RoleName) { self.span.record("role", display(&user)); self.user = Some(user); } - pub fn set_auth_method(&mut self, auth_method: AuthMethod) { - self.auth_method = Some(auth_method); - } - - pub fn has_private_peer_addr(&self) -> bool { + fn has_private_peer_addr(&self) -> bool { match self.peer_addr { IpAddr::V4(ip) => ip.is_private(), - _ => false, + IpAddr::V6(_) => false, } } - pub fn set_error_kind(&mut self, kind: ErrorKind) { - // Do not record errors from the private address to metrics. 
- if !self.has_private_peer_addr() { - Metrics::get().proxy.errors_total.inc(kind); - } - if let Some(ep) = &self.endpoint_id { - let metric = &Metrics::get().proxy.endpoints_affected_by_errors; - let label = metric.with_labels(kind); - metric.get_metric(label).measure(ep); - } - self.error_kind = Some(kind); - } - - pub fn set_success(&mut self) { - self.success = true; - } - - pub fn log_connect(&mut self) { + fn log_connect(&mut self) { let outcome = if self.success { ConnectOutcome::Success } else { @@ -256,7 +375,7 @@ impl RequestMonitoring { } } -impl Drop for RequestMonitoring { +impl Drop for RequestMonitoringInner { fn drop(&mut self) { if self.sender.is_some() { self.log_connect(); diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 1355b7e1d8..e5962b35fa 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -14,30 +14,27 @@ use parquet::{ record::RecordWriter, }; use pq_proto::StartupMessageParams; -use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; +use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig, TimeoutOrCancel}; use serde::ser::SerializeMap; use tokio::{sync::mpsc, time}; use tokio_util::sync::CancellationToken; use tracing::{debug, info, Span}; use utils::backoff; -use crate::{ - config::{remote_storage_from_toml, OptRemoteStorageConfig}, - context::LOG_CHAN_DISCONNECT, -}; +use crate::{config::remote_storage_from_toml, context::LOG_CHAN_DISCONNECT}; -use super::{RequestMonitoring, LOG_CHAN}; +use super::{RequestMonitoringInner, LOG_CHAN}; #[derive(clap::Args, Clone, Debug)] pub struct ParquetUploadArgs { /// Storage location to upload the parquet files to. 
/// Encoded as toml (same format as pageservers), eg /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}` - #[clap(long, default_value = "{}", value_parser = remote_storage_from_toml)] - parquet_upload_remote_storage: OptRemoteStorageConfig, + #[clap(long, value_parser = remote_storage_from_toml)] + parquet_upload_remote_storage: Option, - #[clap(long, default_value = "{}", value_parser = remote_storage_from_toml)] - parquet_upload_disconnect_events_remote_storage: OptRemoteStorageConfig, + #[clap(long, value_parser = remote_storage_from_toml)] + parquet_upload_disconnect_events_remote_storage: Option, /// How many rows to include in a row group #[clap(long, default_value_t = 8192)] @@ -121,8 +118,8 @@ impl<'a> serde::Serialize for Options<'a> { } } -impl From<&RequestMonitoring> for RequestData { - fn from(value: &RequestMonitoring) -> Self { +impl From<&RequestMonitoringInner> for RequestData { + fn from(value: &RequestMonitoringInner) -> Self { Self { session_id: value.session_id, peer_addr: value.peer_addr.to_string(), @@ -184,8 +181,9 @@ pub async fn worker( let rx = futures::stream::poll_fn(move |cx| rx.poll_recv(cx)); let rx = rx.map(RequestData::from); - let storage = - GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?; + let storage = GenericRemoteStorage::from_config(&remote_storage_config) + .await + .context("remote storage init")?; let properties = WriterProperties::builder() .set_data_page_size_limit(config.parquet_upload_page_size) @@ -220,6 +218,7 @@ pub async fn worker( let storage_disconnect = GenericRemoteStorage::from_config(&disconnect_events_storage_config) + .await .context("remote storage for disconnect events init")?; let parquet_config_disconnect = parquet_config.clone(); tokio::try_join!( @@ -543,10 +542,14 @@ mod tests { rx: impl Stream, ) -> Vec<(u64, usize, i64)> { let remote_storage_config = RemoteStorageConfig { - storage: 
RemoteStorageKind::LocalFs(tmpdir.to_path_buf()), + storage: RemoteStorageKind::LocalFs { + local_path: tmpdir.to_path_buf(), + }, timeout: std::time::Duration::from_secs(120), }; - let storage = GenericRemoteStorage::from_config(&remote_storage_config).unwrap(); + let storage = GenericRemoteStorage::from_config(&remote_storage_config) + .await + .unwrap(); worker_inner(storage, rx, config).await.unwrap(); @@ -733,7 +736,7 @@ mod tests { while let Some(r) = s.next().await { tx.send(r).unwrap(); } - time::sleep(time::Duration::from_secs(70)).await + time::sleep(time::Duration::from_secs(70)).await; } }); diff --git a/proxy/src/http.rs b/proxy/src/http.rs index fc7400869f..1f1dd8c415 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -4,14 +4,17 @@ pub mod health_server; -use std::{str::FromStr, sync::Arc, time::Duration}; +use std::time::Duration; + +use anyhow::bail; +use bytes::Bytes; +use http_body_util::BodyExt; +use hyper1::body::Body; +use serde::de::DeserializeOwned; -use futures::FutureExt; pub use reqwest::{Request, Response, StatusCode}; pub use reqwest_middleware::{ClientWithMiddleware, Error}; pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; -use tokio::time::Instant; -use tracing::trace; use crate::{ metrics::{ConsoleRequest, Metrics}, @@ -24,8 +27,6 @@ use reqwest_middleware::RequestBuilder; /// We deliberately don't want to replace this with a public static. 
pub fn new_client() -> ClientWithMiddleware { let client = reqwest::ClientBuilder::new() - .dns_resolver(Arc::new(GaiResolver::default())) - .connection_verbose(true) .build() .expect("Failed to create http client"); @@ -36,8 +37,6 @@ pub fn new_client() -> ClientWithMiddleware { pub fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware { let timeout_client = reqwest::ClientBuilder::new() - .dns_resolver(Arc::new(GaiResolver::default())) - .connection_verbose(true) .timeout(default_timout) .build() .expect("Failed to create http client with timeout"); @@ -103,36 +102,31 @@ impl Endpoint { } } -use hyper_util::client::legacy::connect::dns::{ - GaiResolver as HyperGaiResolver, Name as HyperName, -}; -use reqwest::dns::{Addrs, Name, Resolve, Resolving}; -/// https://docs.rs/reqwest/0.11.18/src/reqwest/dns/gai.rs.html -use tower_service::Service; -#[derive(Debug)] -pub struct GaiResolver(HyperGaiResolver); +pub async fn parse_json_body_with_limit( + mut b: impl Body + Unpin, + limit: usize, +) -> anyhow::Result { + // We could use `b.limited().collect().await.to_bytes()` here + // but this ends up being slightly more efficient as far as I can tell. -impl Default for GaiResolver { - fn default() -> Self { - Self(HyperGaiResolver::new()) - } -} + // check the lower bound of the size hint. + // in reqwest, this value is influenced by the Content-Length header. 
+ let lower_bound = match usize::try_from(b.size_hint().lower()) { + Ok(bound) if bound <= limit => bound, + _ => bail!("Content length exceeds limit of {limit} bytes"), + }; + let mut bytes = Vec::with_capacity(lower_bound); -impl Resolve for GaiResolver { - fn resolve(&self, name: Name) -> Resolving { - let this = &mut self.0.clone(); - let hyper_name = HyperName::from_str(name.as_str()).expect("name should be valid"); - let start = Instant::now(); - Box::pin( - Service::::call(this, hyper_name).map(move |result| { - let resolve_duration = start.elapsed(); - trace!(duration = ?resolve_duration, addr = %name.as_str(), "resolve host complete"); - result - .map(|addrs| -> Addrs { Box::new(addrs) }) - .map_err(|err| -> Box { Box::new(err) }) - }), - ) + while let Some(frame) = b.frame().await.transpose()? { + if let Ok(data) = frame.into_data() { + if bytes.len() + data.len() > limit { + bail!("Content length exceeds limit of {limit} bytes") + } + bytes.extend_from_slice(&data); + } } + + Ok(serde_json::from_slice::(&bytes)?) 
} #[cfg(test)] diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index e38135dd22..d418caa511 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -56,7 +56,7 @@ impl<'de, Id: InternId> serde::de::Deserialize<'de> for InternedString { impl<'de, Id: InternId> serde::de::Visitor<'de> for Visitor { type Value = InternedString; - fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { formatter.write_str("a string") } diff --git a/proxy/src/jemalloc.rs b/proxy/src/jemalloc.rs index 3243e6a140..d307d80f4a 100644 --- a/proxy/src/jemalloc.rs +++ b/proxy/src/jemalloc.rs @@ -3,8 +3,8 @@ use std::marker::PhantomData; use measured::{ label::NoLabels, metric::{ - gauge::GaugeState, group::Encoding, group::MetricValue, name::MetricNameEncoder, - MetricEncoding, MetricFamilyEncoding, MetricType, + gauge::GaugeState, group::Encoding, name::MetricNameEncoder, MetricEncoding, + MetricFamilyEncoding, MetricType, }, text::TextEncoder, LabelGroup, MetricGroup, @@ -100,7 +100,7 @@ macro_rules! jemalloc_gauge { enc: &mut TextEncoder, ) -> Result<(), std::io::Error> { if let Ok(v) = mib.read() { - enc.write_metric_value(name, labels, MetricValue::Int(v as i64))?; + GaugeState::new(v as i64).collect_into(&(), labels, name, enc)?; } Ok(()) } diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index ea92eaaa55..b7d497ebcc 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -1,4 +1,92 @@ -#![deny(clippy::undocumented_unsafe_blocks)] +// rustc lints/lint groups +// https://doc.rust-lang.org/rustc/lints/groups.html +#![deny( + deprecated, + future_incompatible, + // TODO: consider let_underscore + nonstandard_style, + rust_2024_compatibility +)] +#![warn(clippy::all, clippy::pedantic, clippy::cargo)] +// List of denied lints from the clippy::restriction group. 
+// https://rust-lang.github.io/rust-clippy/master/index.html#?groups=restriction +#![warn( + clippy::undocumented_unsafe_blocks, + clippy::dbg_macro, + clippy::empty_enum_variants_with_brackets, + clippy::exit, + clippy::float_cmp_const, + clippy::lossy_float_literal, + clippy::macro_use_imports, + clippy::manual_ok_or, + // TODO: consider clippy::map_err_ignore + // TODO: consider clippy::mem_forget + clippy::rc_mutex, + clippy::rest_pat_in_fully_bound_structs, + clippy::string_add, + clippy::string_to_string, + clippy::todo, + // TODO: consider clippy::unimplemented + // TODO: consider clippy::unwrap_used +)] +// List of permanently allowed lints. +#![allow( + // It's ok to cast u8 to bool, etc. + clippy::cast_lossless, +)] +// List of temporarily allowed lints. +// TODO: Switch to except() once stable with 1.81. +// TODO: fix code and reduce list or move to permanent list above. +#![allow( + clippy::cargo_common_metadata, + clippy::cast_possible_truncation, + clippy::cast_possible_wrap, + clippy::cast_precision_loss, + clippy::cast_sign_loss, + clippy::default_trait_access, + clippy::doc_markdown, + clippy::explicit_iter_loop, + clippy::float_cmp, + clippy::if_not_else, + clippy::ignored_unit_patterns, + clippy::implicit_hasher, + clippy::inconsistent_struct_constructor, + clippy::inline_always, + clippy::items_after_statements, + clippy::manual_assert, + clippy::manual_let_else, + clippy::manual_string_new, + clippy::match_bool, + clippy::match_same_arms, + clippy::match_wild_err_arm, + clippy::missing_errors_doc, + clippy::missing_panics_doc, + clippy::module_name_repetitions, + clippy::multiple_crate_versions, + clippy::must_use_candidate, + clippy::needless_for_each, + clippy::needless_pass_by_value, + clippy::needless_raw_string_hashes, + clippy::option_as_ref_cloned, + clippy::redundant_closure_for_method_calls, + clippy::redundant_else, + clippy::return_self_not_must_use, + clippy::similar_names, + clippy::single_char_pattern, + 
clippy::single_match_else, + clippy::struct_excessive_bools, + clippy::struct_field_names, + clippy::too_many_lines, + clippy::uninlined_format_args, + clippy::unnested_or_patterns, + clippy::unreadable_literal, + clippy::unused_async, + clippy::unused_self, + clippy::used_underscore_binding, + clippy::wildcard_imports +)] +// List of temporarily allowed lints to unblock beta/nightly. +#![allow(unknown_lints, clippy::manual_inspect)] use std::convert::Infallible; diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index 3405b8cbc6..3b30ad8b46 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -15,7 +15,8 @@ use tracing_subscriber::{ pub async fn init() -> anyhow::Result { let env_filter = EnvFilter::builder() .with_default_directive(LevelFilter::INFO.into()) - .from_env_lossy(); + .from_env_lossy() + .add_directive("azure_core::policies::transport=off".parse().unwrap()); let fmt_layer = tracing_subscriber::fmt::layer() .with_ansi(false) diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index e2a75a8720..ccef88231b 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -2,7 +2,7 @@ use std::sync::{Arc, OnceLock}; use lasso::ThreadedRodeo; use measured::{ - label::{FixedCardinalitySet, LabelName, LabelSet, LabelValue, StaticLabelSet}, + label::{FixedCardinalitySet, LabelGroupSet, LabelName, LabelSet, LabelValue, StaticLabelSet}, metric::{histogram::Thresholds, name::MetricName}, Counter, CounterVec, FixedCardinalityLabel, Gauge, GaugeVec, Histogram, HistogramVec, LabelGroup, MetricGroup, @@ -252,7 +252,7 @@ impl Drop for HttpEndpointPoolsGuard<'_> { } impl HttpEndpointPools { - pub fn guard(&self) -> HttpEndpointPoolsGuard { + pub fn guard(&self) -> HttpEndpointPoolsGuard<'_> { self.http_pool_endpoints_registered_total.inc(); HttpEndpointPoolsGuard { dec: &self.http_pool_endpoints_unregistered_total, @@ -370,6 +370,7 @@ pub struct CancellationRequest { pub kind: CancellationOutcome, } +#[derive(Clone, Copy)] pub enum Waiting { 
Cplane, Client, @@ -398,12 +399,6 @@ pub struct LatencyTimer { outcome: ConnectOutcome, } -pub struct LatencyTimerPause<'a> { - timer: &'a mut LatencyTimer, - start: time::Instant, - waiting_for: Waiting, -} - impl LatencyTimer { pub fn new(protocol: Protocol) -> Self { Self { @@ -417,11 +412,13 @@ impl LatencyTimer { } } - pub fn pause(&mut self, waiting_for: Waiting) -> LatencyTimerPause<'_> { - LatencyTimerPause { - timer: self, - start: Instant::now(), - waiting_for, + pub fn unpause(&mut self, start: Instant, waiting_for: Waiting) { + let dur = start.elapsed(); + match waiting_for { + Waiting::Cplane => self.accumulated.cplane += dur, + Waiting::Client => self.accumulated.client += dur, + Waiting::Compute => self.accumulated.compute += dur, + Waiting::RetryTimeout => self.accumulated.retry += dur, } } @@ -438,18 +435,6 @@ impl LatencyTimer { } } -impl Drop for LatencyTimerPause<'_> { - fn drop(&mut self) { - let dur = self.start.elapsed(); - match self.waiting_for { - Waiting::Cplane => self.timer.accumulated.cplane += dur, - Waiting::Client => self.timer.accumulated.client += dur, - Waiting::Compute => self.timer.accumulated.compute += dur, - Waiting::RetryTimeout => self.timer.accumulated.retry += dur, - } - } -} - #[derive(FixedCardinalityLabel, Clone, Copy, Debug)] pub enum ConnectOutcome { Success, @@ -577,6 +562,32 @@ impl LabelGroup for ThreadPoolWorkerId { } } +impl LabelGroupSet for ThreadPoolWorkers { + type Group<'a> = ThreadPoolWorkerId; + + fn cardinality(&self) -> Option { + Some(self.0) + } + + fn encode_dense(&self, value: Self::Unique) -> Option { + Some(value) + } + + fn decode_dense(&self, value: usize) -> Self::Group<'_> { + ThreadPoolWorkerId(value) + } + + type Unique = usize; + + fn encode(&self, value: Self::Group<'_>) -> Option { + Some(value.0) + } + + fn decode(&self, value: &Self::Unique) -> Self::Group<'_> { + ThreadPoolWorkerId(*value) + } +} + impl LabelSet for ThreadPoolWorkers { type Value<'a> = ThreadPoolWorkerId; diff --git 
a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 072f51958f..2182f38fe7 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -8,6 +8,7 @@ pub mod passthrough; pub mod retry; pub mod wake_compute; pub use copy_bidirectional::copy_bidirectional_client_compute; +pub use copy_bidirectional::ErrorSource; use crate::{ auth, @@ -112,18 +113,18 @@ pub async fn task_main( } }; - let mut ctx = RequestMonitoring::new( + let ctx = RequestMonitoring::new( session_id, peer_addr, crate::metrics::Protocol::Tcp, &config.region, ); - let span = ctx.span.clone(); + let span = ctx.span(); let startup = Box::pin( handle_client( config, - &mut ctx, + &ctx, cancellation_handler, socket, ClientMode::Tcp, @@ -148,8 +149,11 @@ pub async fn task_main( ctx.log_connect(); match p.proxy_pass().instrument(span.clone()).await { Ok(()) => {} - Err(e) => { - error!(parent: &span, "per-client task finished with an error: {e:#}"); + Err(ErrorSource::Client(e)) => { + error!(parent: &span, "per-client task finished with an IO error from the client: {e:#}"); + } + Err(ErrorSource::Compute(e)) => { + error!(parent: &span, "per-client task finished with an IO error from the compute: {e:#}"); } } } @@ -236,7 +240,7 @@ impl ReportableError for ClientRequestError { pub async fn handle_client( config: &'static ProxyConfig, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, cancellation_handler: Arc, stream: S, mode: ClientMode, @@ -244,25 +248,25 @@ pub async fn handle_client( conn_gauge: NumClientConnectionsGuard<'static>, ) -> Result>, ClientRequestError> { info!( - protocol = %ctx.protocol, + protocol = %ctx.protocol(), "handling interactive connection from client" ); let metrics = &Metrics::get().proxy; - let proto = ctx.protocol; + let proto = ctx.protocol(); let _request_gauge = metrics.connection_requests.guard(proto); let tls = config.tls_config.as_ref(); let record_handshake_error = !ctx.has_private_peer_addr(); - let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Client); - 
let do_handshake = handshake(stream, mode.handshake_tls(tls), record_handshake_error); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client); + let do_handshake = handshake(ctx, stream, mode.handshake_tls(tls), record_handshake_error); let (mut stream, params) = match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? { HandshakeData::Startup(stream, params) => (stream, params), HandshakeData::Cancel(cancel_key_data) => { return Ok(cancellation_handler - .cancel_session(cancel_key_data, ctx.session_id) + .cancel_session(cancel_key_data, ctx.session_id()) .await .map(|()| None)?) } diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 409d45b39a..f38e43ba5a 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -7,7 +7,7 @@ use crate::{ error::ReportableError, metrics::{ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType}, proxy::{ - retry::{retry_after, ShouldRetry}, + retry::{retry_after, should_retry, CouldRetry}, wake_compute::wake_compute, }, Host, @@ -17,6 +17,8 @@ use pq_proto::StartupMessageParams; use tokio::time; use tracing::{error, info, warn}; +use super::retry::ShouldRetryWakeCompute; + const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2); /// If we couldn't connect, a cached connection info might be to blame @@ -44,7 +46,7 @@ pub trait ConnectMechanism { type Error: From; async fn connect_once( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, node_info: &console::CachedNodeInfo, timeout: time::Duration, ) -> Result; @@ -56,7 +58,7 @@ pub trait ConnectMechanism { pub trait ComputeConnectBackend { async fn wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, ) -> Result; fn get_keys(&self) -> Option<&ComputeCredentialKeys>; @@ -79,7 +81,7 @@ impl ConnectMechanism for TcpMechanism<'_> { #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] async fn 
connect_once( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, node_info: &console::CachedNodeInfo, timeout: time::Duration, ) -> Result { @@ -96,7 +98,7 @@ impl ConnectMechanism for TcpMechanism<'_> { /// Try to connect to the compute node, retrying if necessary. #[tracing::instrument(skip_all)] pub async fn connect_to_compute( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, mechanism: &M, user_info: &B, allow_self_signed_compute: bool, @@ -104,7 +106,7 @@ pub async fn connect_to_compute( connect_to_compute_retry_config: RetryConfig, ) -> Result where - M::ConnectError: ShouldRetry + std::fmt::Debug, + M::ConnectError: CouldRetry + ShouldRetryWakeCompute + std::fmt::Debug, M::Error: From, { let mut num_retries = 0; @@ -124,7 +126,7 @@ where .await { Ok(res) => { - ctx.latency_timer.success(); + ctx.success(); Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { outcome: ConnectOutcome::Success, @@ -139,10 +141,10 @@ where error!(error = ?err, "could not connect to compute node"); - let node_info = if !node_info.cached() || !err.should_retry_database_address() { + let node_info = if !node_info.cached() || !err.should_retry_wake_compute() { // If we just recieved this from cplane and dodn't get it from cache, we shouldn't retry. // Do not need to retrieve a new node_info, just return the old one. 
- if !err.should_retry(num_retries, connect_to_compute_retry_config) { + if should_retry(&err, num_retries, connect_to_compute_retry_config) { Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { outcome: ConnectOutcome::Failed, @@ -176,7 +178,7 @@ where .await { Ok(res) => { - ctx.latency_timer.success(); + ctx.success(); Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { outcome: ConnectOutcome::Success, @@ -188,9 +190,8 @@ where return Ok(res); } Err(e) => { - let retriable = e.should_retry(num_retries, connect_to_compute_retry_config); - if !retriable { - error!(error = ?e, num_retries, retriable, "couldn't connect to compute node"); + if !should_retry(&e, num_retries, connect_to_compute_retry_config) { + error!(error = ?e, num_retries, retriable = false, "couldn't connect to compute node"); Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { outcome: ConnectOutcome::Failed, @@ -200,16 +201,15 @@ where ); return Err(e.into()); } - warn!(error = ?e, num_retries, retriable, "couldn't connect to compute node"); + + warn!(error = ?e, num_retries, retriable = true, "couldn't connect to compute node"); } - } + }; let wait_duration = retry_after(num_retries, connect_to_compute_retry_config); num_retries += 1; - let pause = ctx - .latency_timer - .pause(crate::metrics::Waiting::RetryTimeout); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::RetryTimeout); time::sleep(wait_duration).await; drop(pause); } diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs index aaf3688f21..048523f69c 100644 --- a/proxy/src/proxy/copy_bidirectional.rs +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -13,12 +13,39 @@ enum TransferState { Done(u64), } +#[derive(Debug)] +pub enum ErrorDirection { + Read(io::Error), + Write(io::Error), +} + +impl ErrorSource { + fn from_client(err: ErrorDirection) -> ErrorSource { + match err { + ErrorDirection::Read(client) => Self::Client(client), + 
ErrorDirection::Write(compute) => Self::Compute(compute), + } + } + fn from_compute(err: ErrorDirection) -> ErrorSource { + match err { + ErrorDirection::Write(client) => Self::Client(client), + ErrorDirection::Read(compute) => Self::Compute(compute), + } + } +} + +#[derive(Debug)] +pub enum ErrorSource { + Client(io::Error), + Compute(io::Error), +} + fn transfer_one_direction( cx: &mut Context<'_>, state: &mut TransferState, r: &mut A, w: &mut B, -) -> Poll> +) -> Poll> where A: AsyncRead + AsyncWrite + Unpin + ?Sized, B: AsyncRead + AsyncWrite + Unpin + ?Sized, @@ -32,7 +59,7 @@ where *state = TransferState::ShuttingDown(count); } TransferState::ShuttingDown(count) => { - ready!(w.as_mut().poll_shutdown(cx))?; + ready!(w.as_mut().poll_shutdown(cx)).map_err(ErrorDirection::Write)?; *state = TransferState::Done(*count); } TransferState::Done(count) => return Poll::Ready(Ok(*count)), @@ -44,7 +71,7 @@ where pub async fn copy_bidirectional_client_compute( client: &mut Client, compute: &mut Compute, -) -> Result<(u64, u64), std::io::Error> +) -> Result<(u64, u64), ErrorSource> where Client: AsyncRead + AsyncWrite + Unpin + ?Sized, Compute: AsyncRead + AsyncWrite + Unpin + ?Sized, @@ -54,9 +81,11 @@ where poll_fn(|cx| { let mut client_to_compute_result = - transfer_one_direction(cx, &mut client_to_compute, client, compute)?; + transfer_one_direction(cx, &mut client_to_compute, client, compute) + .map_err(ErrorSource::from_client)?; let mut compute_to_client_result = - transfer_one_direction(cx, &mut compute_to_client, compute, client)?; + transfer_one_direction(cx, &mut compute_to_client, compute, client) + .map_err(ErrorSource::from_compute)?; // Early termination checks from compute to client. 
if let TransferState::Done(_) = compute_to_client { @@ -65,18 +94,20 @@ where // Initiate shutdown client_to_compute = TransferState::ShuttingDown(buf.amt); client_to_compute_result = - transfer_one_direction(cx, &mut client_to_compute, client, compute)?; + transfer_one_direction(cx, &mut client_to_compute, client, compute) + .map_err(ErrorSource::from_client)?; } } - // Early termination checks from compute to client. + // Early termination checks from client to compute. if let TransferState::Done(_) = client_to_compute { if let TransferState::Running(buf) = &compute_to_client { info!("Client is done, terminate compute"); // Initiate shutdown compute_to_client = TransferState::ShuttingDown(buf.amt); compute_to_client_result = - transfer_one_direction(cx, &mut compute_to_client, client, compute)?; + transfer_one_direction(cx, &mut compute_to_client, compute, client) + .map_err(ErrorSource::from_compute)?; } } @@ -138,7 +169,7 @@ impl CopyBuffer { cx: &mut Context<'_>, mut reader: Pin<&mut R>, mut writer: Pin<&mut W>, - ) -> Poll> + ) -> Poll> where R: AsyncRead + ?Sized, W: AsyncWrite + ?Sized, @@ -149,11 +180,11 @@ impl CopyBuffer { // Top up the buffer towards full if we can read a bit more // data - this should improve the chances of a large write if !me.read_done && me.cap < me.buf.len() { - ready!(me.poll_fill_buf(cx, reader.as_mut()))?; + ready!(me.poll_fill_buf(cx, reader.as_mut())).map_err(ErrorDirection::Read)?; } Poll::Pending } - res => res, + res @ Poll::Ready(_) => res.map_err(ErrorDirection::Write), } } @@ -162,7 +193,7 @@ impl CopyBuffer { cx: &mut Context<'_>, mut reader: Pin<&mut R>, mut writer: Pin<&mut W>, - ) -> Poll> + ) -> Poll> where R: AsyncRead + ?Sized, W: AsyncWrite + ?Sized, @@ -176,12 +207,13 @@ impl CopyBuffer { match self.poll_fill_buf(cx, reader.as_mut()) { Poll::Ready(Ok(())) => (), - Poll::Ready(Err(err)) => return Poll::Ready(Err(err)), + Poll::Ready(Err(err)) => return Poll::Ready(Err(ErrorDirection::Read(err))), Poll::Pending => 
{ // Try flushing when the reader has no progress to avoid deadlock // when the reader depends on buffered writer. if self.need_flush { - ready!(writer.as_mut().poll_flush(cx))?; + ready!(writer.as_mut().poll_flush(cx)) + .map_err(ErrorDirection::Write)?; self.need_flush = false; } @@ -194,10 +226,10 @@ impl CopyBuffer { while self.pos < self.cap { let i = ready!(self.poll_write_buf(cx, reader.as_mut(), writer.as_mut()))?; if i == 0 { - return Poll::Ready(Err(io::Error::new( + return Poll::Ready(Err(ErrorDirection::Write(io::Error::new( io::ErrorKind::WriteZero, "write zero byte into writer", - ))); + )))); } else { self.pos += i; self.amt += i as u64; @@ -216,7 +248,7 @@ impl CopyBuffer { // If we've written all the data and we've seen EOF, flush out the // data and finish the transfer. if self.pos == self.cap && self.read_done { - ready!(writer.as_mut().poll_flush(cx))?; + ready!(writer.as_mut().poll_flush(cx)).map_err(ErrorDirection::Write)?; return Poll::Ready(Ok(self.amt)); } } diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index dd935cc245..27a72f8072 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -1,11 +1,18 @@ -use pq_proto::{BeMessage as Be, CancelKeyData, FeStartupPacket, StartupMessageParams}; +use bytes::Buf; +use pq_proto::{ + framed::Framed, BeMessage as Be, CancelKeyData, FeStartupPacket, ProtocolVersion, + StartupMessageParams, +}; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::info; +use tracing::{info, warn}; use crate::{ - config::TlsConfig, + auth::endpoint_sni, + config::{TlsConfig, PG_ALPN_PROTOCOL}, + context::RequestMonitoring, error::ReportableError, + metrics::Metrics, proxy::ERR_INSECURE_CONNECTION, stream::{PqStream, Stream, StreamUpgradeError}, }; @@ -61,6 +68,7 @@ pub enum HandshakeData { /// we also take an extra care of propagating only the select handshake errors to client. 
#[tracing::instrument(skip_all)] pub async fn handshake( + ctx: &RequestMonitoring, stream: S, mut tls: Option<&TlsConfig>, record_handshake_error: bool, @@ -68,52 +76,112 @@ pub async fn handshake( // Client may try upgrading to each protocol only once let (mut tried_ssl, mut tried_gss) = (false, false); + const PG_PROTOCOL_EARLIEST: ProtocolVersion = ProtocolVersion::new(3, 0); + const PG_PROTOCOL_LATEST: ProtocolVersion = ProtocolVersion::new(3, 0); + let mut stream = PqStream::new(Stream::from_raw(stream)); loop { let msg = stream.read_startup_packet().await?; - info!("received {msg:?}"); - - use FeStartupPacket::*; match msg { - SslRequest => match stream.get_ref() { + FeStartupPacket::SslRequest { direct } => match stream.get_ref() { Stream::Raw { .. } if !tried_ssl => { tried_ssl = true; // We can't perform TLS handshake without a config - let enc = tls.is_some(); - stream.write_message(&Be::EncryptionResponse(enc)).await?; + let have_tls = tls.is_some(); + if !direct { + stream + .write_message(&Be::EncryptionResponse(have_tls)) + .await?; + } else if !have_tls { + return Err(HandshakeError::ProtocolViolation); + } + if let Some(tls) = tls.take() { // Upgrade raw stream into a secure TLS-backed stream. // NOTE: We've consumed `tls`; this fact will be used later. - let (raw, read_buf) = stream.into_inner(); - // TODO: Normally, client doesn't send any data before - // server says TLS handshake is ok and read_buf is empy. - // However, you could imagine pipelining of postgres - // SSLRequest + TLS ClientHello in one hunk similar to - // pipelining in our node js driver. We should probably - // support that by chaining read_buf with the stream. 
+ let Framed { + stream: raw, + read_buf, + write_buf, + } = stream.framed; + + let Stream::Raw { raw } = raw else { + return Err(HandshakeError::StreamUpgradeError( + StreamUpgradeError::AlreadyTls, + )); + }; + + let mut read_buf = read_buf.reader(); + let mut res = Ok(()); + let accept = tokio_rustls::TlsAcceptor::from(tls.to_server_config()) + .accept_with(raw, |session| { + // push the early data to the tls session + while !read_buf.get_ref().is_empty() { + match session.read_tls(&mut read_buf) { + Ok(_) => {} + Err(e) => { + res = Err(e); + break; + } + } + } + }); + + res?; + + let read_buf = read_buf.into_inner(); if !read_buf.is_empty() { return Err(HandshakeError::EarlyData); } - let tls_stream = raw - .upgrade(tls.to_server_config(), record_handshake_error) - .await?; + + let tls_stream = accept.await.inspect_err(|_| { + if record_handshake_error { + Metrics::get().proxy.tls_handshake_failures.inc(); + } + })?; + + let conn_info = tls_stream.get_ref().1; + + // try parse endpoint + let ep = conn_info + .server_name() + .and_then(|sni| endpoint_sni(sni, &tls.common_names).ok().flatten()); + if let Some(ep) = ep { + ctx.set_endpoint_id(ep); + } + + // check the ALPN, if exists, as required. 
+ match conn_info.alpn_protocol() { + None | Some(PG_ALPN_PROTOCOL) => {} + Some(other) => { + let alpn = String::from_utf8_lossy(other); + warn!(%alpn, "unexpected ALPN"); + return Err(HandshakeError::ProtocolViolation); + } + } let (_, tls_server_end_point) = tls .cert_resolver - .resolve(tls_stream.get_ref().1.server_name()) + .resolve(conn_info.server_name()) .ok_or(HandshakeError::MissingCertificate)?; - stream = PqStream::new(Stream::Tls { - tls: Box::new(tls_stream), - tls_server_end_point, - }); + stream = PqStream { + framed: Framed { + stream: Stream::Tls { + tls: Box::new(tls_stream), + tls_server_end_point, + }, + read_buf, + write_buf, + }, + }; } } _ => return Err(HandshakeError::ProtocolViolation), }, - GssEncRequest => match stream.get_ref() { + FeStartupPacket::GssEncRequest => match stream.get_ref() { Stream::Raw { .. } if !tried_gss => { tried_gss = true; @@ -122,7 +190,9 @@ pub async fn handshake( } _ => return Err(HandshakeError::ProtocolViolation), }, - StartupMessage { params, .. } => { + FeStartupPacket::StartupMessage { params, version } + if PG_PROTOCOL_EARLIEST <= version && version <= PG_PROTOCOL_LATEST => + { // Check that the config has been consumed during upgrade // OR we didn't provide it at all (for dev purposes). if tls.is_some() { @@ -131,10 +201,54 @@ pub async fn handshake( .await?; } - info!(session_type = "normal", "successful handshake"); + info!( + ?version, + ?params, + session_type = "normal", + "successful handshake" + ); break Ok(HandshakeData::Startup(stream, params)); } - CancelRequest(cancel_key_data) => { + // downgrade protocol version + FeStartupPacket::StartupMessage { params, version } + if version.major() == 3 && version > PG_PROTOCOL_LATEST => + { + warn!(?version, "unsupported minor version"); + + // no protocol extensions are supported. 
+ // + let mut unsupported = vec![]; + for (k, _) in params.iter() { + if k.starts_with("_pq_.") { + unsupported.push(k); + } + } + + // TODO: remove unsupported options so we don't send them to compute. + + stream + .write_message(&Be::NegotiateProtocolVersion { + version: PG_PROTOCOL_LATEST, + options: &unsupported, + }) + .await?; + + info!( + ?version, + session_type = "normal", + "successful handshake; unsupported minor version requested" + ); + break Ok(HandshakeData::Startup(stream, params)); + } + FeStartupPacket::StartupMessage { version, .. } => { + warn!( + ?version, + session_type = "normal", + "unsuccessful handshake; unsupported version" + ); + return Err(HandshakeError::ProtocolViolation); + } + FeStartupPacket::CancelRequest(cancel_key_data) => { info!(session_type = "cancellation", "successful handshake"); break Ok(HandshakeData::Cancel(cancel_key_data)); } diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index 62de79946f..9942fac383 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -10,13 +10,15 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; use utils::measured_stream::MeasuredStream; +use super::copy_bidirectional::ErrorSource; + /// Forward bytes in both directions (client <-> compute). 
#[tracing::instrument(skip_all)] pub async fn proxy_pass( client: impl AsyncRead + AsyncWrite + Unpin, compute: impl AsyncRead + AsyncWrite + Unpin, aux: MetricsAuxInfo, -) -> anyhow::Result<()> { +) -> Result<(), ErrorSource> { let usage = USAGE_METRICS.register(Ids { endpoint_id: aux.endpoint_id, branch_id: aux.branch_id, @@ -66,9 +68,11 @@ pub struct ProxyPassthrough { } impl ProxyPassthrough { - pub async fn proxy_pass(self) -> anyhow::Result<()> { + pub async fn proxy_pass(self) -> Result<(), ErrorSource> { let res = proxy_pass(self.client, self.compute.stream, self.aux).await; - self.compute.cancel_closure.try_cancel_query().await?; + if let Err(err) = self.compute.cancel_closure.try_cancel_query().await { + tracing::error!(?err, "could not cancel the query in the database"); + } res } } diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs index 8dec1f1137..644b183a91 100644 --- a/proxy/src/proxy/retry.rs +++ b/proxy/src/proxy/retry.rs @@ -2,20 +2,22 @@ use crate::{compute, config::RetryConfig}; use std::{error::Error, io}; use tokio::time; -pub trait ShouldRetry { +pub trait CouldRetry { + /// Returns true if the error could be retried fn could_retry(&self) -> bool; - fn should_retry(&self, num_retries: u32, config: RetryConfig) -> bool { - match self { - _ if num_retries >= config.max_retries => false, - err => err.could_retry(), - } - } - fn should_retry_database_address(&self) -> bool { - true - } } -impl ShouldRetry for io::Error { +pub trait ShouldRetryWakeCompute { + /// Returns true if we need to invalidate the cache for this node. + /// If false, we can continue retrying with the current node cache. 
+ fn should_retry_wake_compute(&self) -> bool; +} + +pub fn should_retry(err: &impl CouldRetry, num_retries: u32, config: RetryConfig) -> bool { + num_retries < config.max_retries && err.could_retry() +} + +impl CouldRetry for io::Error { fn could_retry(&self) -> bool { use std::io::ErrorKind; matches!( @@ -25,7 +27,7 @@ impl ShouldRetry for io::Error { } } -impl ShouldRetry for tokio_postgres::error::DbError { +impl CouldRetry for tokio_postgres::error::DbError { fn could_retry(&self) -> bool { use tokio_postgres::error::SqlState; matches!( @@ -36,7 +38,9 @@ impl ShouldRetry for tokio_postgres::error::DbError { | &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION, ) } - fn should_retry_database_address(&self) -> bool { +} +impl ShouldRetryWakeCompute for tokio_postgres::error::DbError { + fn should_retry_wake_compute(&self) -> bool { use tokio_postgres::error::SqlState; // Here are errors that happens after the user successfully authenticated to the database. // TODO: there are pgbouncer errors that should be retried, but they are not listed here. 
@@ -53,7 +57,7 @@ impl ShouldRetry for tokio_postgres::error::DbError { } } -impl ShouldRetry for tokio_postgres::Error { +impl CouldRetry for tokio_postgres::Error { fn could_retry(&self) -> bool { if let Some(io_err) = self.source().and_then(|x| x.downcast_ref()) { io::Error::could_retry(io_err) @@ -63,29 +67,33 @@ impl ShouldRetry for tokio_postgres::Error { false } } - fn should_retry_database_address(&self) -> bool { - if let Some(io_err) = self.source().and_then(|x| x.downcast_ref()) { - io::Error::should_retry_database_address(io_err) - } else if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) { - tokio_postgres::error::DbError::should_retry_database_address(db_err) +} +impl ShouldRetryWakeCompute for tokio_postgres::Error { + fn should_retry_wake_compute(&self) -> bool { + if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) { + tokio_postgres::error::DbError::should_retry_wake_compute(db_err) } else { + // likely an IO error. Possible the compute has shutdown and the + // cache is stale. 
true } } } -impl ShouldRetry for compute::ConnectionError { +impl CouldRetry for compute::ConnectionError { fn could_retry(&self) -> bool { match self { compute::ConnectionError::Postgres(err) => err.could_retry(), compute::ConnectionError::CouldNotConnect(err) => err.could_retry(), + compute::ConnectionError::WakeComputeError(err) => err.could_retry(), _ => false, } } - fn should_retry_database_address(&self) -> bool { +} +impl ShouldRetryWakeCompute for compute::ConnectionError { + fn should_retry_wake_compute(&self) -> bool { match self { - compute::ConnectionError::Postgres(err) => err.should_retry_database_address(), - compute::ConnectionError::CouldNotConnect(err) => err.should_retry_database_address(), + compute::ConnectionError::Postgres(err) => err.should_retry_wake_compute(), // the cache entry was not checked for validity compute::ConnectionError::TooManyConnectionAttempts(_) => false, _ => true, diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 96683511fe..d8308c4f2a 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -5,21 +5,21 @@ mod mitm; use std::time::Duration; use super::connect_compute::ConnectMechanism; -use super::retry::ShouldRetry; +use super::retry::CouldRetry; use super::*; use crate::auth::backend::{ ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, MaybeOwned, TestBackend, }; use crate::config::{CertResolver, RetryConfig}; use crate::console::caches::NodeInfoCache; -use crate::console::messages::{ConsoleError, MetricsAuxInfo}; +use crate::console::messages::{ConsoleError, Details, MetricsAuxInfo, Status}; use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend}; use crate::console::{self, CachedNodeInfo, NodeInfo}; use crate::error::ErrorKind; -use crate::proxy::retry::retry_after; use crate::{http, sasl, scram, BranchId, EndpointId, ProjectId}; use anyhow::{bail, Context}; use async_trait::async_trait; +use retry::{retry_after, ShouldRetryWakeCompute}; use 
rstest::rstest; use rustls::pki_types; use tokio_postgres::config::SslMode; @@ -155,7 +155,7 @@ impl TestAuth for Scram { stream: &mut PqStream>, ) -> anyhow::Result<()> { let outcome = auth::AuthFlow::new(stream) - .begin(auth::Scram(&self.0, &mut RequestMonitoring::test())) + .begin(auth::Scram(&self.0, &RequestMonitoring::test())) .await? .authenticate() .await?; @@ -175,10 +175,11 @@ async fn dummy_proxy( auth: impl TestAuth + Send, ) -> anyhow::Result<()> { let (client, _) = read_proxy_protocol(client).await?; - let mut stream = match handshake(client, tls.as_ref(), false).await? { - HandshakeData::Startup(stream, _) => stream, - HandshakeData::Cancel(_) => bail!("cancellation not supported"), - }; + let mut stream = + match handshake(&RequestMonitoring::test(), client, tls.as_ref(), false).await? { + HandshakeData::Startup(stream, _) => stream, + HandshakeData::Cancel(_) => bail!("cancellation not supported"), + }; auth.authenticate(&mut stream).await?; @@ -438,11 +439,16 @@ impl std::fmt::Display for TestConnectError { impl std::error::Error for TestConnectError {} -impl ShouldRetry for TestConnectError { +impl CouldRetry for TestConnectError { fn could_retry(&self) -> bool { self.retryable } } +impl ShouldRetryWakeCompute for TestConnectError { + fn should_retry_wake_compute(&self) -> bool { + true + } +} #[async_trait] impl ConnectMechanism for TestConnectMechanism { @@ -452,7 +458,7 @@ impl ConnectMechanism for TestConnectMechanism { async fn connect_once( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, _node_info: &console::CachedNodeInfo, _timeout: std::time::Duration, ) -> Result { @@ -485,7 +491,7 @@ impl TestBackend for TestConnectMechanism { ConnectAction::Wake => Ok(helper_create_cached_node_info(self.cache)), ConnectAction::WakeFail => { let err = console::errors::ApiError::Console(ConsoleError { - http_status_code: http::StatusCode::FORBIDDEN, + http_status_code: http::StatusCode::BAD_REQUEST, error: "TEST".into(), status: 
None, }); @@ -496,7 +502,15 @@ impl TestBackend for TestConnectMechanism { let err = console::errors::ApiError::Console(ConsoleError { http_status_code: http::StatusCode::BAD_REQUEST, error: "TEST".into(), - status: None, + status: Some(Status { + code: "error".into(), + message: "error".into(), + details: Details { + error_info: None, + retry_info: Some(console::messages::RetryInfo { retry_delay_ms: 1 }), + user_facing_message: None, + }, + }), }); assert!(err.could_retry()); Err(console::errors::WakeComputeError::ApiError(err)) @@ -527,8 +541,8 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn }, allow_self_signed_compute: false, }; - let (_, node) = cache.insert("key".into(), node); - node + let (_, node2) = cache.insert_unit("key".into(), Ok(node.clone())); + node2.map(|()| node) } fn helper_create_connect_info( @@ -552,7 +566,7 @@ fn helper_create_connect_info( async fn connect_to_compute_success() { let _ = env_logger::try_init(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -560,7 +574,7 @@ async fn connect_to_compute_success() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -570,7 +584,7 @@ async fn connect_to_compute_success() { async fn connect_to_compute_retry() { let _ = env_logger::try_init(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -578,7 +592,7 @@ async fn connect_to_compute_retry() { max_retries: 5, 
backoff_factor: 2.0, }; - connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -589,7 +603,7 @@ async fn connect_to_compute_retry() { async fn connect_to_compute_non_retry_1() { let _ = env_logger::try_init(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Fail]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -597,7 +611,7 @@ async fn connect_to_compute_non_retry_1() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) .await .unwrap_err(); mechanism.verify(); @@ -608,7 +622,7 @@ async fn connect_to_compute_non_retry_1() { async fn connect_to_compute_non_retry_2() { let _ = env_logger::try_init(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Fail, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -616,7 +630,7 @@ async fn connect_to_compute_non_retry_2() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -628,7 +642,7 @@ async fn connect_to_compute_non_retry_3() { let _ = env_logger::try_init(); tokio::time::pause(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Retry, Retry, Retry, Retry, Retry]); let user_info = helper_create_connect_info(&mechanism); @@ -643,7 
+657,7 @@ async fn connect_to_compute_non_retry_3() { backoff_factor: 2.0, }; connect_to_compute( - &mut ctx, + &ctx, &mechanism, &user_info, false, @@ -660,7 +674,7 @@ async fn connect_to_compute_non_retry_3() { async fn wake_retry() { let _ = env_logger::try_init(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![WakeRetry, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -668,7 +682,7 @@ async fn wake_retry() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -679,7 +693,7 @@ async fn wake_retry() { async fn wake_non_retry() { let _ = env_logger::try_init(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![WakeRetry, WakeFail]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -687,7 +701,7 @@ async fn wake_non_retry() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) .await .unwrap_err(); mechanism.verify(); diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index cbfc9f1358..2d752b9183 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -1,7 +1,7 @@ //! Man-in-the-middle tests //! //! Channel binding should prevent a proxy server -//! - that has access to create valid certificates - +//! *that has access to create valid certificates* //! from controlling the TLS connection. 
use std::fmt::Debug; @@ -34,9 +34,14 @@ async fn proxy_mitm( tokio::spawn(async move { // begin handshake with end_server let end_server = connect_tls(server2, client_config2.make_tls_connect().unwrap()).await; - let (end_client, startup) = match handshake(client1, Some(&server_config1), false) - .await - .unwrap() + let (end_client, startup) = match handshake( + &RequestMonitoring::test(), + client1, + Some(&server_config1), + false, + ) + .await + .unwrap() { HandshakeData::Startup(stream, params) => (stream, params), HandshakeData::Cancel(_) => panic!("cancellation not supported"), @@ -63,7 +68,7 @@ async fn proxy_mitm( end_client.send(Bytes::from_static(b"R\0\0\0\x17\0\0\0\x0aSCRAM-SHA-256\0\0")).await.unwrap(); continue; } - end_client.send(message).await.unwrap() + end_client.send(message).await.unwrap(); } _ => break, } @@ -83,7 +88,7 @@ async fn proxy_mitm( end_server.send(buf.freeze()).await.unwrap(); continue; } - end_server.send(message).await.unwrap() + end_server.send(message).await.unwrap(); } _ => break, } diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index c166cf4389..5b06e8f054 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -1,30 +1,27 @@ use crate::config::RetryConfig; -use crate::console::messages::ConsoleError; +use crate::console::messages::{ConsoleError, Reason}; use crate::console::{errors::WakeComputeError, provider::CachedNodeInfo}; use crate::context::RequestMonitoring; use crate::metrics::{ ConnectOutcome, ConnectionFailuresBreakdownGroup, Metrics, RetriesMetricGroup, RetryType, WakeupFailureKind, }; -use crate::proxy::retry::retry_after; +use crate::proxy::retry::{retry_after, should_retry}; use hyper1::StatusCode; -use std::ops::ControlFlow; use tracing::{error, info, warn}; use super::connect_compute::ComputeConnectBackend; -use super::retry::ShouldRetry; pub async fn wake_compute( num_retries: &mut u32, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, 
api: &B, config: RetryConfig, ) -> Result { let retry_type = RetryType::WakeCompute; loop { - let wake_res = api.wake_compute(ctx).await; - match handle_try_wake(wake_res, *num_retries, config) { - Err(e) => { + match api.wake_compute(ctx).await { + Err(e) if !should_retry(&e, *num_retries, config) => { error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node"); report_error(&e, false); Metrics::get().proxy.retries_metric.observe( @@ -36,11 +33,11 @@ pub async fn wake_compute( ); return Err(e); } - Ok(ControlFlow::Continue(e)) => { + Err(e) => { warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node"); report_error(&e, true); } - Ok(ControlFlow::Break(n)) => { + Ok(n) => { Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { outcome: ConnectOutcome::Success, @@ -55,78 +52,34 @@ pub async fn wake_compute( let wait_duration = retry_after(*num_retries, config); *num_retries += 1; - let pause = ctx - .latency_timer - .pause(crate::metrics::Waiting::RetryTimeout); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::RetryTimeout); tokio::time::sleep(wait_duration).await; drop(pause); } } -/// Attempts to wake up the compute node. -/// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable -/// * Returns Ok(Break(node)) if the wakeup succeeded -/// * Returns Err(e) if there was an error -pub fn handle_try_wake( - result: Result, - num_retries: u32, - config: RetryConfig, -) -> Result, WakeComputeError> { - match result { - Err(err) => match &err { - WakeComputeError::ApiError(api) if api.should_retry(num_retries, config) => { - Ok(ControlFlow::Continue(err)) - } - _ => Err(err), - }, - // Ready to try again. 
- Ok(new) => Ok(ControlFlow::Break(new)), - } -} - fn report_error(e: &WakeComputeError, retry: bool) { use crate::console::errors::ApiError; let kind = match e { WakeComputeError::BadComputeAddress(_) => WakeupFailureKind::BadComputeAddress, WakeComputeError::ApiError(ApiError::Transport(_)) => WakeupFailureKind::ApiTransportError, WakeComputeError::ApiError(ApiError::Console(e)) => match e.get_reason() { - crate::console::messages::Reason::RoleProtected => { - WakeupFailureKind::ApiConsoleBadRequest - } - crate::console::messages::Reason::ResourceNotFound => { - WakeupFailureKind::ApiConsoleBadRequest - } - crate::console::messages::Reason::ProjectNotFound => { - WakeupFailureKind::ApiConsoleBadRequest - } - crate::console::messages::Reason::EndpointNotFound => { - WakeupFailureKind::ApiConsoleBadRequest - } - crate::console::messages::Reason::BranchNotFound => { - WakeupFailureKind::ApiConsoleBadRequest - } - crate::console::messages::Reason::RateLimitExceeded => { - WakeupFailureKind::ApiConsoleLocked - } - crate::console::messages::Reason::NonPrimaryBranchComputeTimeExceeded => { - WakeupFailureKind::QuotaExceeded - } - crate::console::messages::Reason::ActiveTimeQuotaExceeded => { - WakeupFailureKind::QuotaExceeded - } - crate::console::messages::Reason::ComputeTimeQuotaExceeded => { - WakeupFailureKind::QuotaExceeded - } - crate::console::messages::Reason::WrittenDataQuotaExceeded => { - WakeupFailureKind::QuotaExceeded - } - crate::console::messages::Reason::DataTransferQuotaExceeded => { - WakeupFailureKind::QuotaExceeded - } - crate::console::messages::Reason::LogicalSizeQuotaExceeded => { - WakeupFailureKind::QuotaExceeded - } - crate::console::messages::Reason::Unknown => match e { + Reason::RoleProtected => WakeupFailureKind::ApiConsoleBadRequest, + Reason::ResourceNotFound => WakeupFailureKind::ApiConsoleBadRequest, + Reason::ProjectNotFound => WakeupFailureKind::ApiConsoleBadRequest, + Reason::EndpointNotFound => 
WakeupFailureKind::ApiConsoleBadRequest, + Reason::BranchNotFound => WakeupFailureKind::ApiConsoleBadRequest, + Reason::RateLimitExceeded => WakeupFailureKind::ApiConsoleLocked, + Reason::NonDefaultBranchComputeTimeExceeded => WakeupFailureKind::QuotaExceeded, + Reason::ActiveTimeQuotaExceeded => WakeupFailureKind::QuotaExceeded, + Reason::ComputeTimeQuotaExceeded => WakeupFailureKind::QuotaExceeded, + Reason::WrittenDataQuotaExceeded => WakeupFailureKind::QuotaExceeded, + Reason::DataTransferQuotaExceeded => WakeupFailureKind::QuotaExceeded, + Reason::LogicalSizeQuotaExceeded => WakeupFailureKind::QuotaExceeded, + Reason::ConcurrencyLimitReached => WakeupFailureKind::ApiConsoleLocked, + Reason::LockAlreadyTaken => WakeupFailureKind::ApiConsoleLocked, + Reason::RunningOperations => WakeupFailureKind::ApiConsoleLocked, + Reason::Unknown => match e { ConsoleError { http_status_code: StatusCode::LOCKED, ref error, diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index be9072dd8c..222cd431d2 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -3,4 +3,8 @@ mod limiter; pub use limit_algorithm::{ aimd::Aimd, DynamicLimiter, Outcome, RateLimitAlgorithm, RateLimiterConfig, Token, }; -pub use limiter::{BucketRateLimiter, EndpointRateLimiter, GlobalRateLimiter, RateBucketInfo}; +pub use limiter::{BucketRateLimiter, GlobalRateLimiter, RateBucketInfo, WakeComputeRateLimiter}; +mod leaky_bucket; +pub use leaky_bucket::{ + EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter, LeakyBucketState, +}; diff --git a/proxy/src/rate_limiter/leaky_bucket.rs b/proxy/src/rate_limiter/leaky_bucket.rs new file mode 100644 index 0000000000..2d5e056540 --- /dev/null +++ b/proxy/src/rate_limiter/leaky_bucket.rs @@ -0,0 +1,171 @@ +use std::{ + hash::Hash, + sync::atomic::{AtomicUsize, Ordering}, +}; + +use ahash::RandomState; +use dashmap::DashMap; +use rand::{thread_rng, Rng}; +use tokio::time::Instant; +use tracing::info; + +use 
crate::intern::EndpointIdInt; + +// Simple per-endpoint rate limiter. +pub type EndpointRateLimiter = LeakyBucketRateLimiter; + +pub struct LeakyBucketRateLimiter { + map: DashMap, + config: LeakyBucketConfig, + access_count: AtomicUsize, +} + +impl LeakyBucketRateLimiter { + pub const DEFAULT: LeakyBucketConfig = LeakyBucketConfig { + rps: 600.0, + max: 1500.0, + }; + + pub fn new_with_shards(config: LeakyBucketConfig, shards: usize) -> Self { + Self { + map: DashMap::with_hasher_and_shard_amount(RandomState::new(), shards), + config, + access_count: AtomicUsize::new(0), + } + } + + /// Check that number of connections to the endpoint is below `max_rps` rps. + pub fn check(&self, key: K, n: u32) -> bool { + let now = Instant::now(); + + if self.access_count.fetch_add(1, Ordering::AcqRel) % 2048 == 0 { + self.do_gc(now); + } + + let mut entry = self.map.entry(key).or_insert_with(|| LeakyBucketState { + time: now, + filled: 0.0, + }); + + entry.check(&self.config, now, n as f64) + } + + fn do_gc(&self, now: Instant) { + info!( + "cleaning up bucket rate limiter, current size = {}", + self.map.len() + ); + let n = self.map.shards().len(); + let shard = thread_rng().gen_range(0..n); + self.map.shards()[shard] + .write() + .retain(|_, value| !value.get_mut().update(&self.config, now)); + } +} + +pub struct LeakyBucketConfig { + pub rps: f64, + pub max: f64, +} + +pub struct LeakyBucketState { + filled: f64, + time: Instant, +} + +impl LeakyBucketConfig { + pub fn new(rps: f64, max: f64) -> Self { + assert!(rps > 0.0, "rps must be positive"); + assert!(max > 0.0, "max must be positive"); + Self { rps, max } + } +} + +impl LeakyBucketState { + pub fn new() -> Self { + Self { + filled: 0.0, + time: Instant::now(), + } + } + + /// updates the timer and returns true if the bucket is empty + fn update(&mut self, info: &LeakyBucketConfig, now: Instant) -> bool { + let drain = now.duration_since(self.time); + let drain = drain.as_secs_f64() * info.rps; + + self.filled = 
(self.filled - drain).clamp(0.0, info.max); + self.time = now; + + self.filled == 0.0 + } + + pub fn check(&mut self, info: &LeakyBucketConfig, now: Instant, n: f64) -> bool { + self.update(info, now); + + if self.filled + n > info.max { + return false; + } + self.filled += n; + + true + } +} + +impl Default for LeakyBucketState { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use tokio::time::Instant; + + use super::{LeakyBucketConfig, LeakyBucketState}; + + #[tokio::test(start_paused = true)] + async fn check() { + let info = LeakyBucketConfig::new(500.0, 2000.0); + let mut bucket = LeakyBucketState::new(); + + // should work for 2000 requests this second + for _ in 0..2000 { + assert!(bucket.check(&info, Instant::now(), 1.0)); + } + assert!(!bucket.check(&info, Instant::now(), 1.0)); + assert_eq!(bucket.filled, 2000.0); + + // in 1ms we should drain 0.5 tokens. + // make sure we don't lose any tokens + tokio::time::advance(Duration::from_millis(1)).await; + assert!(!bucket.check(&info, Instant::now(), 1.0)); + tokio::time::advance(Duration::from_millis(1)).await; + assert!(bucket.check(&info, Instant::now(), 1.0)); + + // in 10ms we should drain 5 tokens + tokio::time::advance(Duration::from_millis(10)).await; + for _ in 0..5 { + assert!(bucket.check(&info, Instant::now(), 1.0)); + } + assert!(!bucket.check(&info, Instant::now(), 1.0)); + + // in 10s we should drain 5000 tokens + // but cap is only 2000 + tokio::time::advance(Duration::from_secs(10)).await; + for _ in 0..2000 { + assert!(bucket.check(&info, Instant::now(), 1.0)); + } + assert!(!bucket.check(&info, Instant::now(), 1.0)); + + // should sustain 500rps + for _ in 0..2000 { + tokio::time::advance(Duration::from_millis(10)).await; + for _ in 0..5 { + assert!(bucket.check(&info, Instant::now(), 1.0)); + } + } + } +} diff --git a/proxy/src/rate_limiter/limit_algorithm.rs b/proxy/src/rate_limiter/limit_algorithm.rs index 3842ce269e..80a62b2a76 
100644 --- a/proxy/src/rate_limiter/limit_algorithm.rs +++ b/proxy/src/rate_limiter/limit_algorithm.rs @@ -237,7 +237,7 @@ impl Token { } pub fn release(mut self, outcome: Outcome) { - self.release_mut(Some(outcome)) + self.release_mut(Some(outcome)); } pub fn release_mut(&mut self, outcome: Option) { @@ -249,7 +249,7 @@ impl Token { impl Drop for Token { fn drop(&mut self) { - self.release_mut(None) + self.release_mut(None); } } diff --git a/proxy/src/rate_limiter/limit_algorithm/aimd.rs b/proxy/src/rate_limiter/limit_algorithm/aimd.rs index b39740bb21..d669492fa6 100644 --- a/proxy/src/rate_limiter/limit_algorithm/aimd.rs +++ b/proxy/src/rate_limiter/limit_algorithm/aimd.rs @@ -25,9 +25,8 @@ pub struct Aimd { impl LimitAlgorithm for Aimd { fn update(&self, old_limit: usize, sample: Sample) -> usize { - use Outcome::*; match sample.outcome { - Success => { + Outcome::Success => { let utilisation = sample.in_flight as f32 / old_limit as f32; if utilisation > self.utilisation { @@ -42,7 +41,7 @@ impl LimitAlgorithm for Aimd { old_limit } } - Overload => { + Outcome::Overload => { let limit = old_limit as f32 * self.dec; // Floor instead of round, so the limit reduces even with small numbers. diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index b8c9490696..5db4efed37 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -61,7 +61,7 @@ impl GlobalRateLimiter { // Purposefully ignore user name and database name as clients can reconnect // with different names, so we'll end up sending some http requests to // the control plane. 
-pub type EndpointRateLimiter = BucketRateLimiter; +pub type WakeComputeRateLimiter = BucketRateLimiter; pub struct BucketRateLimiter { map: DashMap, Hasher>, @@ -103,7 +103,7 @@ pub struct RateBucketInfo { impl std::fmt::Display for RateBucketInfo { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let rps = (self.max_rpi as u64) * 1000 / self.interval.as_millis() as u64; + let rps = self.rps().floor() as u64; write!(f, "{rps}@{}", humantime::format_duration(self.interval)) } } @@ -140,6 +140,10 @@ impl RateBucketInfo { Self::new(200, Duration::from_secs(600)), ]; + pub fn rps(&self) -> f64 { + (self.max_rpi as f64) / self.interval.as_secs_f64() + } + pub fn validate(info: &mut [Self]) -> anyhow::Result<()> { info.sort_unstable_by_key(|info| info.interval); let invalid = info @@ -245,7 +249,7 @@ mod tests { use rustc_hash::FxHasher; use tokio::time; - use super::{BucketRateLimiter, EndpointRateLimiter}; + use super::{BucketRateLimiter, WakeComputeRateLimiter}; use crate::{intern::EndpointIdInt, rate_limiter::RateBucketInfo, EndpointId}; #[test] @@ -293,7 +297,7 @@ mod tests { .map(|s| s.parse().unwrap()) .collect(); RateBucketInfo::validate(&mut rates).unwrap(); - let limiter = EndpointRateLimiter::new(rates); + let limiter = WakeComputeRateLimiter::new(rates); let endpoint = EndpointId::from("ep-my-endpoint-1234"); let endpoint = EndpointIdInt::from(endpoint); diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs index 7baf104374..c9a946fa4a 100644 --- a/proxy/src/redis/cancellation_publisher.rs +++ b/proxy/src/redis/cancellation_publisher.rs @@ -106,7 +106,7 @@ impl RedisPublisherClient { cancel_key_data, session_id, }))?; - self.client.publish(PROXY_CHANNEL_NAME, payload).await?; + let _: () = self.client.publish(PROXY_CHANNEL_NAME, payload).await?; Ok(()) } pub async fn try_connect(&mut self) -> anyhow::Result<()> { diff --git a/proxy/src/redis/connection_with_credentials_provider.rs 
b/proxy/src/redis/connection_with_credentials_provider.rs index 3a90d911c2..c78ee166f1 100644 --- a/proxy/src/redis/connection_with_credentials_provider.rs +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -98,7 +98,7 @@ impl ConnectionWithCredentialsProvider { info!("Establishing a new connection..."); self.con = None; if let Some(f) = self.refresh_token_task.take() { - f.abort() + f.abort(); } let mut con = self .get_client() @@ -178,7 +178,7 @@ impl ConnectionWithCredentialsProvider { credentials_provider: Arc, ) -> anyhow::Result<()> { let (user, password) = credentials_provider.provide_credentials().await?; - redis::cmd("AUTH") + let _: () = redis::cmd("AUTH") .arg(user) .arg(password) .query_async(con) diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 87d723d17e..ad69246443 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -108,7 +108,6 @@ impl MessageHandler { } #[tracing::instrument(skip(self, msg), fields(session_id = tracing::field::Empty))] async fn handle_message(&self, msg: redis::Msg) -> anyhow::Result<()> { - use Notification::*; let payload: String = msg.get_payload()?; tracing::debug!(?payload, "received a message payload"); @@ -124,10 +123,10 @@ impl MessageHandler { }; tracing::debug!(?msg, "received a message"); match msg { - Cancel(cancel_session) => { + Notification::Cancel(cancel_session) => { tracing::Span::current().record( "session_id", - &tracing::field::display(cancel_session.session_id), + tracing::field::display(cancel_session.session_id), ); Metrics::get() .proxy @@ -153,12 +152,12 @@ impl MessageHandler { } _ => { invalidate_cache(self.cache.clone(), msg.clone()); - if matches!(msg, AllowedIpsUpdate { .. }) { + if matches!(msg, Notification::AllowedIpsUpdate { .. }) { Metrics::get() .proxy .redis_events_count .inc(RedisEventsCount::AllowedIpsUpdate); - } else if matches!(msg, PasswordUpdate { .. 
}) { + } else if matches!(msg, Notification::PasswordUpdate { .. }) { Metrics::get() .proxy .redis_events_count @@ -180,16 +179,16 @@ impl MessageHandler { } fn invalidate_cache(cache: Arc, msg: Notification) { - use Notification::*; match msg { - AllowedIpsUpdate { allowed_ips_update } => { - cache.invalidate_allowed_ips_for_project(allowed_ips_update.project_id) + Notification::AllowedIpsUpdate { allowed_ips_update } => { + cache.invalidate_allowed_ips_for_project(allowed_ips_update.project_id); } - PasswordUpdate { password_update } => cache.invalidate_role_secret_for_project( - password_update.project_id, - password_update.role_name, - ), - Cancel(_) => unreachable!("cancel message should be handled separately"), + Notification::PasswordUpdate { password_update } => cache + .invalidate_role_secret_for_project( + password_update.project_id, + password_update.role_name, + ), + Notification::Cancel(_) => unreachable!("cancel message should be handled separately"), } } diff --git a/proxy/src/sasl.rs b/proxy/src/sasl.rs index 0811416ca2..60207fc824 100644 --- a/proxy/src/sasl.rs +++ b/proxy/src/sasl.rs @@ -42,10 +42,9 @@ pub enum Error { impl UserFacingError for Error { fn to_string_client(&self) -> String { - use Error::*; match self { - ChannelBindingFailed(m) => m.to_string(), - ChannelBindingBadMethod(m) => format!("unsupported channel binding method {m}"), + Self::ChannelBindingFailed(m) => (*m).to_string(), + Self::ChannelBindingBadMethod(m) => format!("unsupported channel binding method {m}"), _ => "authentication protocol violation".to_string(), } } diff --git a/proxy/src/sasl/channel_binding.rs b/proxy/src/sasl/channel_binding.rs index 13d681de6d..6e2d3057ce 100644 --- a/proxy/src/sasl/channel_binding.rs +++ b/proxy/src/sasl/channel_binding.rs @@ -13,11 +13,10 @@ pub enum ChannelBinding { impl ChannelBinding { pub fn and_then(self, f: impl FnOnce(T) -> Result) -> Result, E> { - use ChannelBinding::*; Ok(match self { - NotSupportedClient => 
NotSupportedClient, - NotSupportedServer => NotSupportedServer, - Required(x) => Required(f(x)?), + Self::NotSupportedClient => ChannelBinding::NotSupportedClient, + Self::NotSupportedServer => ChannelBinding::NotSupportedServer, + Self::Required(x) => ChannelBinding::Required(f(x)?), }) } } @@ -25,11 +24,10 @@ impl ChannelBinding { impl<'a> ChannelBinding<&'a str> { // NB: FromStr doesn't work with lifetimes pub fn parse(input: &'a str) -> Option { - use ChannelBinding::*; Some(match input { - "n" => NotSupportedClient, - "y" => NotSupportedServer, - other => Required(other.strip_prefix("p=")?), + "n" => Self::NotSupportedClient, + "y" => Self::NotSupportedServer, + other => Self::Required(other.strip_prefix("p=")?), }) } } @@ -40,17 +38,16 @@ impl ChannelBinding { &self, get_cbind_data: impl FnOnce(&T) -> Result<&'a [u8], E>, ) -> Result, E> { - use ChannelBinding::*; Ok(match self { - NotSupportedClient => { + Self::NotSupportedClient => { // base64::encode("n,,") "biws".into() } - NotSupportedServer => { + Self::NotSupportedServer => { // base64::encode("y,,") "eSws".into() } - Required(mode) => { + Self::Required(mode) => { use std::io::Write; let mut cbind_input = vec![]; write!(&mut cbind_input, "p={mode},,",).unwrap(); diff --git a/proxy/src/sasl/messages.rs b/proxy/src/sasl/messages.rs index b9208f6f1f..2b5ae1785d 100644 --- a/proxy/src/sasl/messages.rs +++ b/proxy/src/sasl/messages.rs @@ -42,10 +42,9 @@ pub(super) enum ServerMessage { impl<'a> ServerMessage<&'a str> { pub(super) fn to_reply(&self) -> BeMessage<'a> { - use BeAuthenticationSaslMessage::*; BeMessage::AuthenticationSasl(match self { - ServerMessage::Continue(s) => Continue(s.as_bytes()), - ServerMessage::Final(s) => Final(s.as_bytes()), + ServerMessage::Continue(s) => BeAuthenticationSaslMessage::Continue(s.as_bytes()), + ServerMessage::Final(s) => BeAuthenticationSaslMessage::Final(s.as_bytes()), }) } } diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs index 862facb4e5..145e727a74 100644 
--- a/proxy/src/scram.rs +++ b/proxy/src/scram.rs @@ -137,12 +137,12 @@ mod tests { #[tokio::test] async fn round_trip() { - run_round_trip_test("pencil", "pencil").await + run_round_trip_test("pencil", "pencil").await; } #[tokio::test] #[should_panic(expected = "password doesn't match")] async fn failure() { - run_round_trip_test("pencil", "eraser").await + run_round_trip_test("pencil", "eraser").await; } } diff --git a/proxy/src/scram/countmin.rs b/proxy/src/scram/countmin.rs index f2b794e5fe..944bb3c83e 100644 --- a/proxy/src/scram/countmin.rs +++ b/proxy/src/scram/countmin.rs @@ -98,8 +98,6 @@ mod tests { // q% of counts will be within p of the actual value let mut sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q); - dbg!(sketch.buckets.len()); - // insert a bunch of entries in a random order let mut ids2 = ids.clone(); while !ids2.is_empty() { @@ -158,7 +156,7 @@ mod tests { let N = 1021 * 4096; let sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q); - let memory = std::mem::size_of::() * sketch.buckets.len(); + let memory = size_of::() * sketch.buckets.len(); let time = sketch.depth; (memory, time) } diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index d0adbc780e..f2494379a5 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -210,23 +210,23 @@ impl sasl::Mechanism for Exchange<'_> { type Output = super::ScramKey; fn exchange(mut self, input: &str) -> sasl::Result> { - use {sasl::Step::*, ExchangeState::*}; + use {sasl::Step, ExchangeState}; match &self.state { - Initial(init) => { + ExchangeState::Initial(init) => { match init.transition(self.secret, &self.tls_server_end_point, input)? 
{ - Continue(sent, msg) => { - self.state = SaltSent(sent); - Ok(Continue(self, msg)) + Step::Continue(sent, msg) => { + self.state = ExchangeState::SaltSent(sent); + Ok(Step::Continue(self, msg)) } - Success(x, _) => match x {}, - Failure(msg) => Ok(Failure(msg)), + Step::Success(x, _) => match x {}, + Step::Failure(msg) => Ok(Step::Failure(msg)), } } - SaltSent(sent) => { + ExchangeState::SaltSent(sent) => { match sent.transition(self.secret, &self.tls_server_end_point, input)? { - Success(keys, msg) => Ok(Success(keys, msg)), - Continue(x, _) => match x {}, - Failure(msg) => Ok(Failure(msg)), + Step::Success(keys, msg) => Ok(Step::Success(keys, msg)), + Step::Continue(x, _) => match x {}, + Step::Failure(msg) => Ok(Step::Failure(msg)), } } } diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs index cf677a3334..5ecbbf7004 100644 --- a/proxy/src/scram/messages.rs +++ b/proxy/src/scram/messages.rs @@ -59,7 +59,7 @@ impl<'a> ClientFirstMessage<'a> { // https://github.com/postgres/postgres/blob/f83908798f78c4cafda217ca875602c88ea2ae28/src/backend/libpq/auth-scram.c#L13-L14 if !username.is_empty() { - tracing::warn!(username, "scram username provided, but is not expected") + tracing::warn!(username, "scram username provided, but is not expected"); // TODO(conrad): // return None; } @@ -137,7 +137,7 @@ impl<'a> ClientFinalMessage<'a> { /// Build a response to [`ClientFinalMessage`]. 
pub fn build_server_final_message( &self, - signature_builder: SignatureBuilder, + signature_builder: SignatureBuilder<'_>, server_key: &ScramKey, ) -> String { let mut buf = String::from("v="); @@ -212,7 +212,7 @@ mod tests { #[test] fn parse_client_first_message_with_invalid_gs2_authz() { - assert!(ClientFirstMessage::parse("n,authzid,n=,r=nonce").is_none()) + assert!(ClientFirstMessage::parse("n,authzid,n=,r=nonce").is_none()); } #[test] diff --git a/proxy/src/scram/pbkdf2.rs b/proxy/src/scram/pbkdf2.rs index a803ba7e1b..f690cc7738 100644 --- a/proxy/src/scram/pbkdf2.rs +++ b/proxy/src/scram/pbkdf2.rs @@ -84,6 +84,6 @@ mod tests { }; let expected = pbkdf2_hmac_array::(pass, salt, 600000); - assert_eq!(hash, expected) + assert_eq!(hash, expected); } } diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs index 7701b869a3..fa3d3ccca2 100644 --- a/proxy/src/scram/threadpool.rs +++ b/proxy/src/scram/threadpool.rs @@ -270,7 +270,7 @@ fn thread_rt(pool: Arc, worker: Worker, index: usize) { .inc(ThreadPoolWorkerId(index)); // skip for now - worker.push(job) + worker.push(job); } } @@ -316,6 +316,6 @@ mod tests { 10, 114, 73, 188, 140, 222, 196, 156, 214, 184, 79, 157, 119, 242, 16, 31, 53, 242, 178, 43, 95, 8, 225, 182, 122, 40, 219, 21, 89, 147, 64, 140, ]; - assert_eq!(actual, expected) + assert_eq!(actual, expected); } } diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index efa999ed7d..b2bf93dc6d 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -10,6 +10,7 @@ mod json; mod sql_over_http; mod websocket; +use async_trait::async_trait; use atomic_take::AtomicTake; use bytes::Bytes; pub use conn_pool::GlobalConnPoolOptions; @@ -26,8 +27,9 @@ use rand::rngs::StdRng; use rand::SeedableRng; pub use reqwest_middleware::{ClientWithMiddleware, Error}; pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; +use tokio::io::{AsyncRead, AsyncWrite}; use tokio::time::timeout; -use 
tokio_rustls::{server::TlsStream, TlsAcceptor}; +use tokio_rustls::TlsAcceptor; use tokio_util::task::TaskTracker; use crate::cancellation::CancellationHandlerMain; @@ -41,7 +43,7 @@ use crate::serverless::backend::PoolingBackend; use crate::serverless::http_util::{api_error_into_response, json_response}; use std::net::{IpAddr, SocketAddr}; -use std::pin::pin; +use std::pin::{pin, Pin}; use std::sync::Arc; use tokio::net::{TcpListener, TcpStream}; use tokio_util::sync::CancellationToken; @@ -86,18 +88,18 @@ pub async fn task_main( config, endpoint_rate_limiter: Arc::clone(&endpoint_rate_limiter), }); - - let tls_config = match config.tls_config.as_ref() { - Some(config) => config, + let tls_acceptor: Arc = match config.tls_config.as_ref() { + Some(config) => { + let mut tls_server_config = rustls::ServerConfig::clone(&config.to_server_config()); + // prefer http2, but support http/1.1 + tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()]; + Arc::new(tls_server_config) as Arc<_> + } None => { - warn!("TLS config is missing, WebSocket Secure server will not be started"); - return Ok(()); + warn!("TLS config is missing"); + Arc::new(NoTls) as Arc<_> } }; - let mut tls_server_config = rustls::ServerConfig::clone(&tls_config.to_server_config()); - // prefer http2, but support http/1.1 - tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()]; - let tls_acceptor: tokio_rustls::TlsAcceptor = Arc::new(tls_server_config).into(); let connections = tokio_util::task::task_tracker::TaskTracker::new(); connections.close(); // allows `connections.wait to complete` @@ -120,7 +122,7 @@ pub async fn task_main( tracing::trace!("attempting to cancel a random connection"); if let Some(token) = config.http_config.cancel_set.take() { tracing::debug!("cancelling a random connection"); - token.cancel() + token.cancel(); } } @@ -176,16 +178,41 @@ pub async fn task_main( Ok(()) } +pub trait AsyncReadWrite: AsyncRead + AsyncWrite + Send + 
'static {} +impl AsyncReadWrite for T {} +pub type AsyncRW = Pin>; + +#[async_trait] +trait MaybeTlsAcceptor: Send + Sync + 'static { + async fn accept(self: Arc, conn: ChainRW) -> std::io::Result; +} + +#[async_trait] +impl MaybeTlsAcceptor for rustls::ServerConfig { + async fn accept(self: Arc, conn: ChainRW) -> std::io::Result { + Ok(Box::pin(TlsAcceptor::from(self).accept(conn).await?)) + } +} + +struct NoTls; + +#[async_trait] +impl MaybeTlsAcceptor for NoTls { + async fn accept(self: Arc, conn: ChainRW) -> std::io::Result { + Ok(Box::pin(conn)) + } +} + /// Handles the TCP startup lifecycle. /// 1. Parses PROXY protocol V2 /// 2. Handles TLS handshake async fn connection_startup( config: &ProxyConfig, - tls_acceptor: TlsAcceptor, + tls_acceptor: Arc, session_id: uuid::Uuid, conn: TcpStream, peer_addr: SocketAddr, -) -> Option<(TlsStream>, IpAddr)> { +) -> Option<(AsyncRW, IpAddr)> { // handle PROXY protocol let (conn, peer) = match read_proxy_protocol(conn).await { Ok(c) => c, @@ -198,7 +225,7 @@ async fn connection_startup( let peer_addr = peer.unwrap_or(peer_addr).ip(); let has_private_peer_addr = match peer_addr { IpAddr::V4(ip) => ip.is_private(), - _ => false, + IpAddr::V6(_) => false, }; info!(?session_id, %peer_addr, "accepted new TCP connection"); @@ -241,7 +268,7 @@ async fn connection_handler( cancellation_handler: Arc, endpoint_rate_limiter: Arc, cancellation_token: CancellationToken, - conn: TlsStream>, + conn: AsyncRW, peer_addr: IpAddr, session_id: uuid::Uuid, ) { @@ -326,7 +353,9 @@ async fn request_handler( .map(|s| s.to_string()); // Check if the request is a websocket upgrade request. 
- if framed_websockets::upgrade::is_upgrade_request(&request) { + if config.http_config.accept_websockets + && framed_websockets::upgrade::is_upgrade_request(&request) + { let ctx = RequestMonitoring::new( session_id, peer_addr, @@ -334,7 +363,7 @@ async fn request_handler( &config.region, ); - let span = ctx.span.clone(); + let span = ctx.span(); info!(parent: &span, "performing websocket upgrade"); let (response, websocket) = framed_websockets::upgrade::upgrade(&mut request) @@ -367,7 +396,7 @@ async fn request_handler( crate::metrics::Protocol::Http, &config.region, ); - let span = ctx.span.clone(); + let span = ctx.span(); sql_over_http::handle(config, ctx, request, backend, http_cancellation_token) .instrument(span) diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 86e64c0a38..295ea1a1c7 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -16,7 +16,10 @@ use crate::{ context::RequestMonitoring, error::{ErrorKind, ReportableError, UserFacingError}, intern::EndpointIdInt, - proxy::{connect_compute::ConnectMechanism, retry::ShouldRetry}, + proxy::{ + connect_compute::ConnectMechanism, + retry::{CouldRetry, ShouldRetryWakeCompute}, + }, rate_limiter::EndpointRateLimiter, Host, }; @@ -32,15 +35,15 @@ pub struct PoolingBackend { impl PoolingBackend { pub async fn authenticate( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, config: &AuthenticationConfig, conn_info: &ConnInfo, ) -> Result { let user_info = conn_info.user_info.clone(); let backend = self.config.auth_backend.as_ref().map(|_| user_info.clone()); let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?; - if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) { - return Err(AuthError::ip_address_not_allowed(ctx.peer_addr)); + if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) { + return Err(AuthError::ip_address_not_allowed(ctx.peer_addr())); } if !self .endpoint_rate_limiter @@ 
-97,7 +100,7 @@ impl PoolingBackend { #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] pub async fn connect_to_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, conn_info: ConnInfo, keys: ComputeCredentials, force_new: bool, @@ -179,7 +182,7 @@ impl UserFacingError for HttpConnError { } } -impl ShouldRetry for HttpConnError { +impl CouldRetry for HttpConnError { fn could_retry(&self) -> bool { match self { HttpConnError::ConnectionError(e) => e.could_retry(), @@ -190,9 +193,11 @@ impl ShouldRetry for HttpConnError { HttpConnError::TooManyConnectionAttempts(_) => false, } } - fn should_retry_database_address(&self) -> bool { +} +impl ShouldRetryWakeCompute for HttpConnError { + fn should_retry_wake_compute(&self) -> bool { match self { - HttpConnError::ConnectionError(e) => e.should_retry_database_address(), + HttpConnError::ConnectionError(e) => e.should_retry_wake_compute(), // we never checked cache validity HttpConnError::TooManyConnectionAttempts(_) => false, _ => true, @@ -217,7 +222,7 @@ impl ConnectMechanism for TokioMechanism { async fn connect_once( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, node_info: &CachedNodeInfo, timeout: Duration, ) -> Result { @@ -231,12 +236,12 @@ impl ConnectMechanism for TokioMechanism { .dbname(&self.conn_info.dbname) .connect_timeout(timeout); - let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let res = config.connect(tokio_postgres::NoTls).await; drop(pause); let (client, connection) = permit.release_result(res)?; - tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id())); + tracing::Span::current().record("pid", tracing::field::display(client.get_process_id())); Ok(poll_client( self.pool.clone(), ctx, diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 170bda062e..3478787995 100644 --- 
a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -377,7 +377,7 @@ impl GlobalConnPool { pub fn get( self: &Arc, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, conn_info: &ConnInfo, ) -> Result>, HttpConnError> { let mut client: Option> = None; @@ -390,7 +390,7 @@ impl GlobalConnPool { .write() .get_conn_entry(conn_info.db_and_user()) { - client = Some(entry.conn) + client = Some(entry.conn); } let endpoint_pool = Arc::downgrade(&endpoint_pool); @@ -403,15 +403,15 @@ impl GlobalConnPool { tracing::Span::current().record("conn_id", tracing::field::display(client.conn_id)); tracing::Span::current().record( "pid", - &tracing::field::display(client.inner.get_process_id()), + tracing::field::display(client.inner.get_process_id()), ); info!( cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), "pool: reusing connection '{conn_info}'" ); - client.session.send(ctx.session_id)?; + client.session.send(ctx.session_id())?; ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); - ctx.latency_timer.success(); + ctx.success(); return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool))); } } @@ -465,19 +465,19 @@ impl GlobalConnPool { pub fn poll_client( global_pool: Arc>, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, conn_info: ConnInfo, client: C, mut connection: tokio_postgres::Connection, conn_id: uuid::Uuid, aux: MetricsAuxInfo, ) -> Client { - let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol); - let mut session_id = ctx.session_id; + let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol()); + let mut session_id = ctx.session_id(); let (tx, mut rx) = tokio::sync::watch::channel(session_id); let span = info_span!(parent: None, "connection", %conn_id); - let cold_start_info = ctx.cold_start_info; + let cold_start_info = ctx.cold_start_info(); span.in_scope(|| { info!(cold_start_info = cold_start_info.as_str(), %conn_info, %session_id, "new connection"); }); @@ -662,13 +662,13 
@@ impl Discard<'_, C> { pub fn check_idle(&mut self, status: ReadyForQueryStatus) { let conn_info = &self.conn_info; if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 { - info!("pool: throwing away connection '{conn_info}' because connection is not idle") + info!("pool: throwing away connection '{conn_info}' because connection is not idle"); } } pub fn discard(&mut self) { let conn_info = &self.conn_info; if std::mem::take(self.pool).strong_count() > 0 { - info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state") + info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state"); } } } @@ -758,6 +758,7 @@ mod tests { async fn test_pool() { let _ = env_logger::try_init(); let config = Box::leak(Box::new(crate::config::HttpConfig { + accept_websockets: false, pool_options: GlobalConnPoolOptions { max_conns_per_endpoint: 2, gc_epoch: Duration::from_secs(1), @@ -766,7 +767,6 @@ mod tests { opt_in: false, max_total_conns: 3, }, - request_timeout: Duration::from_secs(1), cancel_set: CancelSet::new(0), client_conn_threshold: u64::MAX, })); diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 7a99aeb759..bbfed90f39 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -18,7 +18,7 @@ use hyper1::Response; use hyper1::StatusCode; use hyper1::{HeaderMap, Request}; use pq_proto::StartupMessageParamsBuilder; -use serde_json::json; +use serde::Serialize; use serde_json::Value; use tokio::time; use tokio_postgres::error::DbError; @@ -32,7 +32,9 @@ use tokio_postgres::Transaction; use tokio_util::sync::CancellationToken; use tracing::error; use tracing::info; +use typed_json::json; use url::Url; +use urlencoding; use utils::http::error::ApiError; use crate::auth::backend::ComputeUserInfo; @@ -143,9 +145,9 @@ impl UserFacingError for ConnInfoError { } fn get_conn_info( - 
ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, headers: &HeaderMap, - tls: &TlsConfig, + tls: Option<&TlsConfig>, ) -> Result { // HTTP only uses cleartext (for now and likely always) ctx.set_auth_method(crate::context::AuthMethod::Cleartext); @@ -167,7 +169,8 @@ fn get_conn_info( .path_segments() .ok_or(ConnInfoError::MissingDbName)?; - let dbname: DbName = url_path.next().ok_or(ConnInfoError::InvalidDbName)?.into(); + let dbname: DbName = + urlencoding::decode(url_path.next().ok_or(ConnInfoError::InvalidDbName)?)?.into(); ctx.set_dbname(dbname.clone()); let username = RoleName::from(urlencoding::decode(connection_url.username())?); @@ -181,12 +184,22 @@ fn get_conn_info( .ok_or(ConnInfoError::MissingPassword)?; let password = urlencoding::decode_binary(password.as_bytes()); - let hostname = connection_url - .host_str() - .ok_or(ConnInfoError::MissingHostname)?; - - let endpoint = - endpoint_sni(hostname, &tls.common_names)?.ok_or(ConnInfoError::MalformedEndpoint)?; + let endpoint = match connection_url.host() { + Some(url::Host::Domain(hostname)) => { + if let Some(tls) = tls { + endpoint_sni(hostname, &tls.common_names)? + .ok_or(ConnInfoError::MalformedEndpoint)? 
+ } else { + hostname + .split_once(".") + .map_or(hostname, |(prefix, _)| prefix) + .into() + } + } + Some(url::Host::Ipv4(_)) | Some(url::Host::Ipv6(_)) | None => { + return Err(ConnInfoError::MissingHostname) + } + }; ctx.set_endpoint_id(endpoint.clone()); let pairs = connection_url.query_pairs(); @@ -222,12 +235,12 @@ fn get_conn_info( // TODO: return different http error codes pub async fn handle( config: &'static ProxyConfig, - mut ctx: RequestMonitoring, + ctx: RequestMonitoring, request: Request, backend: Arc, cancel: CancellationToken, ) -> Result>, ApiError> { - let result = handle_inner(cancel, config, &mut ctx, request, backend).await; + let result = handle_inner(cancel, config, &ctx, request, backend).await; let mut response = match result { Ok(r) => { @@ -262,13 +275,8 @@ pub async fn handle( | SqlOverHttpError::Postgres(e) => e.as_db_error(), _ => None, }; - fn get<'a, T: serde::Serialize>( - db: Option<&'a DbError>, - x: impl FnOnce(&'a DbError) -> T, - ) -> Value { - db.map(x) - .and_then(|t| serde_json::to_value(t).ok()) - .unwrap_or_default() + fn get<'a, T: Default>(db: Option<&'a DbError>, x: impl FnOnce(&'a DbError) -> T) -> T { + db.map(x).unwrap_or_default() } if let Some(db_error) = db_error { @@ -277,17 +285,11 @@ pub async fn handle( let position = db_error.and_then(|db| db.position()); let (position, internal_position, internal_query) = match position { - Some(ErrorPosition::Original(position)) => ( - Value::String(position.to_string()), - Value::Null, - Value::Null, - ), - Some(ErrorPosition::Internal { position, query }) => ( - Value::Null, - Value::String(position.to_string()), - Value::String(query.clone()), - ), - None => (Value::Null, Value::Null, Value::Null), + Some(ErrorPosition::Original(position)) => (Some(position.to_string()), None, None), + Some(ErrorPosition::Internal { position, query }) => { + (None, Some(position.to_string()), Some(query.clone())) + } + None => (None, None, None), }; let code = get(db_error, |db| 
db.code().code()); @@ -491,13 +493,16 @@ fn map_isolation_level_to_headers(level: IsolationLevel) -> Option async fn handle_inner( cancel: CancellationToken, config: &'static ProxyConfig, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, request: Request, backend: Arc, ) -> Result>, SqlOverHttpError> { - let _requeset_gauge = Metrics::get().proxy.connection_requests.guard(ctx.protocol); + let _requeset_gauge = Metrics::get() + .proxy + .connection_requests + .guard(ctx.protocol()); info!( - protocol = %ctx.protocol, + protocol = %ctx.protocol(), "handling interactive connection from client" ); @@ -507,7 +512,7 @@ async fn handle_inner( let headers = request.headers(); // TLS config should be there. - let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref().unwrap())?; + let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref())?; info!(user = conn_info.user_info.user.as_str(), "credentials"); // Allow connection pooling only if explicitly requested @@ -553,7 +558,7 @@ async fn handle_inner( .await?; // not strictly necessary to mark success here, // but it's just insurance for if we forget it somewhere else - ctx.latency_timer.success(); + ctx.success(); Ok::<_, HttpConnError>(client) } .map_err(SqlOverHttpError::from), @@ -577,10 +582,8 @@ async fn handle_inner( .status(StatusCode::OK) .header(header::CONTENT_TYPE, "application/json"); - // - // Now execute the query and return the result - // - let result = match payload { + // Now execute the query and return the result. 
+ let json_output = match payload { Payload::Single(stmt) => stmt.process(cancel, &mut client, parsed_headers).await?, Payload::Batch(statements) => { if parsed_headers.txn_read_only { @@ -604,11 +607,9 @@ async fn handle_inner( let metrics = client.metrics(); - // how could this possibly fail - let body = serde_json::to_string(&result).expect("json serialization should not fail"); - let len = body.len(); + let len = json_output.len(); let response = response - .body(Full::new(Bytes::from(body))) + .body(Full::new(Bytes::from(json_output))) // only fails if invalid status code or invalid header/values are given. // these are not user configurable so it cannot fail dynamically .expect("building response payload should not fail"); @@ -630,7 +631,7 @@ impl QueryData { cancel: CancellationToken, client: &mut Client, parsed_headers: HttpHeaders, - ) -> Result { + ) -> Result { let (inner, mut discard) = client.inner(); let cancel_token = inner.cancel_token(); @@ -643,7 +644,10 @@ impl QueryData { // The query successfully completed. Either::Left((Ok((status, results)), __not_yet_cancelled)) => { discard.check_idle(status); - Ok(results) + + let json_output = + serde_json::to_string(&results).expect("json serialization should not fail"); + Ok(json_output) } // The query failed with an error Either::Left((Err(e), __not_yet_cancelled)) => { @@ -661,7 +665,10 @@ impl QueryData { // query successed before it was cancelled. Ok(Ok((status, results))) => { discard.check_idle(status); - Ok(results) + + let json_output = serde_json::to_string(&results) + .expect("json serialization should not fail"); + Ok(json_output) } // query failed or was cancelled. 
Ok(Err(error)) => { @@ -695,7 +702,7 @@ impl BatchQueryData { cancel: CancellationToken, client: &mut Client, parsed_headers: HttpHeaders, - ) -> Result { + ) -> Result { info!("starting transaction"); let (inner, mut discard) = client.inner(); let cancel_token = inner.cancel_token(); @@ -717,9 +724,9 @@ impl BatchQueryData { e })?; - let results = + let json_output = match query_batch(cancel.child_token(), &transaction, self, parsed_headers).await { - Ok(results) => { + Ok(json_output) => { info!("commit"); let status = transaction.commit().await.map_err(|e| { // if we cannot commit - for now don't return connection to pool @@ -728,7 +735,7 @@ impl BatchQueryData { e })?; discard.check_idle(status); - results + json_output } Err(SqlOverHttpError::Cancelled(_)) => { if let Err(err) = cancel_token.cancel_query(NoTls).await { @@ -752,7 +759,7 @@ impl BatchQueryData { } }; - Ok(json!({ "results": results })) + Ok(json_output) } } @@ -761,7 +768,7 @@ async fn query_batch( transaction: &Transaction<'_>, queries: BatchQueryData, parsed_headers: HttpHeaders, -) -> Result, SqlOverHttpError> { +) -> Result { let mut results = Vec::with_capacity(queries.queries.len()); let mut current_size = 0; for stmt in queries.queries { @@ -786,7 +793,11 @@ async fn query_batch( } } } - Ok(results) + + let results = json!({ "results": results }); + let json_output = serde_json::to_string(&results).expect("json serialization should not fail"); + + Ok(json_output) } async fn query_to_json( @@ -794,7 +805,7 @@ async fn query_to_json( data: QueryData, current_size: &mut usize, parsed_headers: HttpHeaders, -) -> Result<(ReadyForQueryStatus, Value), SqlOverHttpError> { +) -> Result<(ReadyForQueryStatus, impl Serialize), SqlOverHttpError> { info!("executing query"); let query_params = data.params; let mut row_stream = std::pin::pin!(client.query_raw_txt(&data.query, query_params).await?); @@ -837,13 +848,14 @@ async fn query_to_json( "finished reading rows" ); - let mut fields = vec![]; - let 
mut columns = vec![]; + let columns_len = row_stream.columns().len(); + let mut fields = Vec::with_capacity(columns_len); + let mut columns = Vec::with_capacity(columns_len); for c in row_stream.columns() { fields.push(json!({ - "name": Value::String(c.name().to_owned()), - "dataTypeID": Value::Number(c.type_().oid().into()), + "name": c.name().to_owned(), + "dataTypeID": c.type_().oid(), "tableID": c.table_oid(), "columnID": c.column_id(), "dataTypeSize": c.type_size(), @@ -861,15 +873,14 @@ async fn query_to_json( .map(|row| pg_text_row_to_json(row, &columns, parsed_headers.raw_output, array_mode)) .collect::, _>>()?; - // resulting JSON format is based on the format of node-postgres result - Ok(( - ready, - json!({ - "command": command_tag_name, - "rowCount": command_tag_count, - "rows": rows, - "fields": fields, - "rowAsArray": array_mode, - }), - )) + // Resulting JSON format is based on the format of node-postgres result. + let results = json!({ + "command": command_tag_name.to_string(), + "rowCount": command_tag_count, + "rows": rows, + "fields": fields, + "rowAsArray": array_mode, + }); + + Ok((ready, results)) } diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 0e9772733d..4fba4d141c 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -1,3 +1,4 @@ +use crate::proxy::ErrorSource; use crate::{ cancellation::CancellationHandlerMain, config::ProxyConfig, @@ -7,6 +8,7 @@ use crate::{ proxy::{handle_client, ClientMode}, rate_limiter::EndpointRateLimiter, }; +use anyhow::Context as _; use bytes::{Buf, BufMut, Bytes, BytesMut}; use framed_websockets::{Frame, OpCode, WebSocketServer}; use futures::{Sink, Stream}; @@ -127,7 +129,7 @@ impl AsyncBufRead for WebSocketRw { pub async fn serve_websocket( config: &'static ProxyConfig, - mut ctx: RequestMonitoring, + ctx: RequestMonitoring, websocket: OnUpgrade, cancellation_handler: Arc, endpoint_rate_limiter: Arc, @@ -143,7 +145,7 @@ pub async fn 
serve_websocket( let res = Box::pin(handle_client( config, - &mut ctx, + &ctx, cancellation_handler, WebSocketRw::new(websocket), ClientMode::Websockets { hostname }, @@ -165,7 +167,11 @@ pub async fn serve_websocket( Ok(Some(p)) => { ctx.set_success(); ctx.log_connect(); - p.proxy_pass().await + match p.proxy_pass().await { + Ok(()) => Ok(()), + Err(ErrorSource::Client(err)) => Err(err).context("client"), + Err(ErrorSource::Compute(err)) => Err(err).context("compute"), + } } } } diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index 690e92ffb1..7809d2e574 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -234,7 +234,7 @@ impl Stream { .await .inspect_err(|_| { if record_handshake_error { - Metrics::get().proxy.tls_handshake_failures.inc() + Metrics::get().proxy.tls_handshake_failures.inc(); } })?), Stream::Tls { .. } => Err(StreamUpgradeError::AlreadyTls), diff --git a/proxy/src/url.rs b/proxy/src/url.rs index 92c64bb8ad..202fe8de1f 100644 --- a/proxy/src/url.rs +++ b/proxy/src/url.rs @@ -12,7 +12,7 @@ impl ApiUrl { } /// See [`url::Url::path_segments_mut`]. - pub fn path_segments_mut(&mut self) -> url::PathSegmentsMut { + pub fn path_segments_mut(&mut self) -> url::PathSegmentsMut<'_> { // We've already verified that it works during construction. self.0.path_segments_mut().expect("bad API url") } diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index 56ed2145dc..a8735fe0bb 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -357,11 +357,15 @@ pub async fn task_backup( info!("metrics backup has shut down"); } // Even if the remote storage is not configured, we still want to clear the metrics. 
- let storage = backup_config - .remote_storage_config - .as_ref() - .map(|config| GenericRemoteStorage::from_config(config).context("remote storage init")) - .transpose()?; + let storage = if let Some(config) = backup_config.remote_storage_config.as_ref() { + Some( + GenericRemoteStorage::from_config(config) + .await + .context("remote storage init")?, + ) + } else { + None + }; let mut ticker = tokio::time::interval(backup_config.interval); let mut prev = Utc::now(); let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned(); diff --git a/proxy/src/waiters.rs b/proxy/src/waiters.rs index bba5494cfe..3bd8f4c8ef 100644 --- a/proxy/src/waiters.rs +++ b/proxy/src/waiters.rs @@ -36,7 +36,7 @@ impl Default for Waiters { } impl Waiters { - pub fn register(&self, key: String) -> Result, RegisterError> { + pub fn register(&self, key: String) -> Result, RegisterError> { let (tx, rx) = oneshot::channel(); self.0 @@ -111,7 +111,7 @@ mod tests { let waiters = Arc::clone(&waiters); let notifier = tokio::spawn(async move { - waiters.notify(key, Default::default())?; + waiters.notify(key, ())?; Ok(()) }); diff --git a/pyproject.toml b/pyproject.toml index c7f1a07512..ad3961ef55 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,7 @@ [tool.poetry] -name = "neon" -version = "0.1.0" description = "" authors = [] +package-mode = false [tool.poetry.dependencies] python = "^3.9" @@ -33,7 +32,7 @@ psutil = "^5.9.4" types-psutil = "^5.9.5.12" types-toml = "^0.10.8.6" pytest-httpserver = "^1.0.8" -aiohttp = "3.9.4" +aiohttp = "3.10.2" pytest-rerunfailures = "^13.0" types-pytest-lazy-fixture = "^0.6.3.3" pytest-split = "^0.8.1" @@ -41,6 +40,8 @@ zstandard = "^0.21.0" httpx = {extras = ["http2"], version = "^0.26.0"} pytest-repeat = "^0.9.3" websockets = "^12.0" +clickhouse-connect = "^0.7.16" +kafka-python = "^2.0.2" [tool.poetry.group.dev.dependencies] mypy = "==1.3.0" @@ -74,6 +75,7 @@ module = [ "allure.*", "allure_commons.*", "allure_pytest.*", + "kafka.*", ] 
ignore_missing_imports = true diff --git a/rust-toolchain.toml b/rust-toolchain.toml index dcae25a287..368b8d300a 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.79.0" +channel = "1.80.1" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index a650d5e207..0fdb3147bf 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -23,7 +23,6 @@ clap = { workspace = true, features = ["derive"] } const_format.workspace = true crc32c.workspace = true fail.workspace = true -fs2.workspace = true git-version.workspace = true hex.workspace = true humantime.workspace = true @@ -41,6 +40,8 @@ serde.workspace = true serde_json.workspace = true serde_with.workspace = true signal-hook.workspace = true +strum.workspace = true +strum_macros.workspace = true thiserror.workspace = true tokio = { workspace = true, features = ["fs"] } tokio-util = { workspace = true } diff --git a/safekeeper/src/auth.rs b/safekeeper/src/auth.rs index dd9058c468..b8bc3f3e06 100644 --- a/safekeeper/src/auth.rs +++ b/safekeeper/src/auth.rs @@ -12,13 +12,15 @@ pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result< } Ok(()) } - (Scope::Admin | Scope::PageServerApi | Scope::GenerationsApi, _) => Err(AuthError( - format!( - "JWT scope '{:?}' is ineligible for Safekeeper auth", - claims.scope - ) - .into(), - )), + (Scope::Admin | Scope::PageServerApi | Scope::GenerationsApi | Scope::Scrubber, _) => { + Err(AuthError( + format!( + "JWT scope '{:?}' is ineligible for Safekeeper auth", + claims.scope + ) + .into(), + )) + } (Scope::SafekeeperData, _) => Ok(()), } } diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 86238c7292..41c2d3fe08 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -12,7 
+12,6 @@ use sd_notify::NotifyState; use tokio::runtime::Handle; use tokio::signal::unix::{signal, SignalKind}; use tokio::task::JoinError; -use toml_edit::Document; use utils::logging::SecretString; use std::env::{var, VarError}; @@ -28,7 +27,8 @@ use utils::pid_file; use metrics::set_build_info_metric; use safekeeper::defaults::{ - DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, + DEFAULT_CONTROL_FILE_SAVE_INTERVAL, DEFAULT_EVICTION_MIN_RESIDENT, DEFAULT_HEARTBEAT_TIMEOUT, + DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_PARTIAL_BACKUP_CONCURRENCY, DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, }; use safekeeper::http; @@ -125,7 +125,7 @@ struct Args { peer_recovery: bool, /// Remote storage configuration for WAL backup (offloading to s3) as TOML /// inline table, e.g. - /// {"max_concurrent_syncs" = 17, "max_sync_errors": 13, "bucket_name": "", "bucket_region":"", "concurrency_limit": 119} + /// {max_concurrent_syncs = 17, max_sync_errors = 13, bucket_name = "", bucket_region = "", concurrency_limit = 119} /// Safekeeper offloads WAL to /// [prefix_in_bucket/]//, mirroring /// structure on the file system. @@ -170,10 +170,6 @@ struct Args { /// still needed for existing replication connection. #[arg(long)] walsenders_keep_horizon: bool, - /// Enable partial backup. If disabled, safekeeper will not upload partial - /// segments to remote storage. - #[arg(long)] - partial_backup_enabled: bool, /// Controls how long backup will wait until uploading the partial segment. #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_PARTIAL_BACKUP_TIMEOUT, verbatim_doc_comment)] partial_backup_timeout: Duration, @@ -181,6 +177,24 @@ struct Args { /// be used in tests. #[arg(long)] disable_periodic_broker_push: bool, + /// Enable automatic switching to offloaded state. + #[arg(long)] + enable_offload: bool, + /// Delete local WAL files after offloading. 
When disabled, they will be left on disk. + #[arg(long)] + delete_offloaded_wal: bool, + /// Pending updates to control file will be automatically saved after this interval. + #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_CONTROL_FILE_SAVE_INTERVAL)] + control_file_save_interval: Duration, + /// Number of allowed concurrent uploads of partial segments to remote storage. + #[arg(long, default_value = DEFAULT_PARTIAL_BACKUP_CONCURRENCY)] + partial_backup_concurrency: usize, + /// How long a timeline must be resident before it is eligible for eviction. + /// Usually, timeline eviction has to wait for `partial_backup_timeout` before being eligible for eviction, + /// but if a timeline is un-evicted and then _not_ written to, it would immediately flap to evicting again, + /// if it weren't for `eviction_min_resident` preventing that. + #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_EVICTION_MIN_RESIDENT)] + eviction_min_resident: Duration, } // Like PathBufValueParser, but allows empty string. 
@@ -328,9 +342,13 @@ async fn main() -> anyhow::Result<()> { sk_auth_token, current_thread_runtime: args.current_thread_runtime, walsenders_keep_horizon: args.walsenders_keep_horizon, - partial_backup_enabled: args.partial_backup_enabled, partial_backup_timeout: args.partial_backup_timeout, disable_periodic_broker_push: args.disable_periodic_broker_push, + enable_offload: args.enable_offload, + delete_offloaded_wal: args.delete_offloaded_wal, + control_file_save_interval: args.control_file_save_interval, + partial_backup_concurrency: args.partial_backup_concurrency, + eviction_min_resident: args.eviction_min_resident, }; // initialize sentry if SENTRY_DSN is provided @@ -394,7 +412,7 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { let timeline_collector = safekeeper::metrics::TimelineCollector::new(); metrics::register_internal(Box::new(timeline_collector))?; - wal_backup::init_remote_storage(&conf); + wal_backup::init_remote_storage(&conf).await; // Keep handles to main tasks to die if any of them disappears. let mut tasks_handles: FuturesUnordered> = @@ -428,6 +446,19 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { .map(|res| ("WAL service main".to_owned(), res)); tasks_handles.push(Box::pin(wal_service_handle)); + let timeline_housekeeping_handle = current_thread_rt + .as_ref() + .unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle()) + .spawn(async move { + const TOMBSTONE_TTL: Duration = Duration::from_secs(3600 * 24); + loop { + tokio::time::sleep(TOMBSTONE_TTL).await; + GlobalTimelines::housekeeping(&TOMBSTONE_TTL); + } + }) + .map(|res| ("Timeline map housekeeping".to_owned(), res)); + tasks_handles.push(Box::pin(timeline_housekeeping_handle)); + if let Some(pg_listener_tenant_only) = pg_listener_tenant_only { let conf_ = conf.clone(); let wal_service_handle = current_thread_rt @@ -535,16 +566,8 @@ fn set_id(workdir: &Utf8Path, given_id: Option) -> Result { Ok(my_id) } -// Parse RemoteStorage from TOML table. 
fn parse_remote_storage(storage_conf: &str) -> anyhow::Result { - // funny toml doesn't consider plain inline table as valid document, so wrap in a key to parse - let storage_conf_toml = format!("remote_storage = {storage_conf}"); - let parsed_toml = storage_conf_toml.parse::()?; // parse - let (_, storage_conf_parsed_toml) = parsed_toml.iter().next().unwrap(); // and strip key off again - RemoteStorageConfig::from_toml(storage_conf_parsed_toml).and_then(|parsed_config| { - // XXX: Don't print the original toml here, there might be some sensitive data - parsed_config.context("Incorrectly parsed remote storage toml as no remote storage config") - }) + RemoteStorageConfig::from_toml(&storage_conf.parse()?) } #[test] diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index 9d65187350..c551cd3122 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -12,21 +12,22 @@ use std::ops::Deref; use std::path::Path; use std::time::Instant; +use crate::control_file_upgrade::downgrade_v9_to_v8; use crate::metrics::PERSIST_CONTROL_FILE_SECONDS; -use crate::state::TimelinePersistentState; +use crate::state::{EvictionState, TimelinePersistentState}; use crate::{control_file_upgrade::upgrade_control_file, timeline::get_timeline_dir}; use utils::{bin_ser::LeSer, id::TenantTimelineId}; use crate::SafeKeeperConf; pub const SK_MAGIC: u32 = 0xcafeceefu32; -pub const SK_FORMAT_VERSION: u32 = 8; +pub const SK_FORMAT_VERSION: u32 = 9; // contains persistent metadata for safekeeper pub const CONTROL_FILE_NAME: &str = "safekeeper.control"; // needed to atomically update the state using `rename` const CONTROL_FILE_NAME_PARTIAL: &str = "safekeeper.control.partial"; -pub const CHECKSUM_SIZE: usize = std::mem::size_of::(); +pub const CHECKSUM_SIZE: usize = size_of::(); /// Storage should keep actual state inside of it. It should implement Deref /// trait to access state fields and have persist method for updating that state. 
@@ -71,6 +72,9 @@ impl FileStorage { conf: &SafeKeeperConf, state: TimelinePersistentState, ) -> Result { + // we don't support creating new timelines in offloaded state + assert!(matches!(state.eviction_state, EvictionState::Present)); + let store = FileStorage { timeline_dir, no_sync: conf.no_sync, @@ -102,7 +106,7 @@ impl FileStorage { } /// Load control file from given directory. - pub fn load_control_file_from_dir(timeline_dir: &Utf8Path) -> Result { + fn load_control_file_from_dir(timeline_dir: &Utf8Path) -> Result { let path = timeline_dir.join(CONTROL_FILE_NAME); Self::load_control_file(path) } @@ -160,6 +164,30 @@ impl Deref for FileStorage { } } +impl TimelinePersistentState { + pub(crate) fn write_to_buf(&self) -> Result> { + let mut buf: Vec = Vec::new(); + WriteBytesExt::write_u32::(&mut buf, SK_MAGIC)?; + + if self.eviction_state == EvictionState::Present { + // temp hack for forward compatibility + const PREV_FORMAT_VERSION: u32 = 8; + let prev = downgrade_v9_to_v8(self); + WriteBytesExt::write_u32::(&mut buf, PREV_FORMAT_VERSION)?; + prev.ser_into(&mut buf)?; + } else { + // otherwise, we write the current format version + WriteBytesExt::write_u32::(&mut buf, SK_FORMAT_VERSION)?; + self.ser_into(&mut buf)?; + } + + // calculate checksum before resize + let checksum = crc32c::crc32c(&buf); + buf.extend_from_slice(&checksum.to_le_bytes()); + Ok(buf) + } +} + #[async_trait::async_trait] impl Storage for FileStorage { /// Persists state durably to the underlying storage. 
@@ -176,14 +204,8 @@ impl Storage for FileStorage { &control_partial_path ) })?; - let mut buf: Vec = Vec::new(); - WriteBytesExt::write_u32::(&mut buf, SK_MAGIC)?; - WriteBytesExt::write_u32::(&mut buf, SK_FORMAT_VERSION)?; - s.ser_into(&mut buf)?; - // calculate checksum before resize - let checksum = crc32c::crc32c(&buf); - buf.extend_from_slice(&checksum.to_le_bytes()); + let buf: Vec = s.write_to_buf()?; control_partial.write_all(&buf).await.with_context(|| { format!( diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index 8f4dfe9b43..a4b4670e42 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -1,7 +1,7 @@ //! Code to deal with safekeeper control file upgrades use crate::{ safekeeper::{AcceptorState, PgUuid, ServerInfo, Term, TermHistory, TermLsn}, - state::{PersistedPeers, TimelinePersistentState}, + state::{EvictionState, PersistedPeers, TimelinePersistentState}, wal_backup_partial, }; use anyhow::{bail, Result}; @@ -183,6 +183,55 @@ pub struct SafeKeeperStateV7 { pub peers: PersistedPeers, } +/// Persistent information stored on safekeeper node about timeline. +/// On disk data is prefixed by magic and format version and followed by checksum. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct SafeKeeperStateV8 { + #[serde(with = "hex")] + pub tenant_id: TenantId, + #[serde(with = "hex")] + pub timeline_id: TimelineId, + /// persistent acceptor state + pub acceptor_state: AcceptorState, + /// information about server + pub server: ServerInfo, + /// Unique id of the last *elected* proposer we dealt with. Not needed + /// for correctness, exists for monitoring purposes. + #[serde(with = "hex")] + pub proposer_uuid: PgUuid, + /// Since which LSN this timeline generally starts. Safekeeper might have + /// joined later. + pub timeline_start_lsn: Lsn, + /// Since which LSN safekeeper has (had) WAL for this timeline. 
+ /// All WAL segments next to one containing local_start_lsn are + /// filled with data from the beginning. + pub local_start_lsn: Lsn, + /// Part of WAL acknowledged by quorum *and available locally*. Always points + /// to record boundary. + pub commit_lsn: Lsn, + /// LSN that points to the end of the last backed up segment. Useful to + /// persist to avoid finding out offloading progress on boot. + pub backup_lsn: Lsn, + /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn + /// of last record streamed to everyone). Persisting it helps skipping + /// recovery in walproposer, generally we compute it from peers. In + /// walproposer proto called 'truncate_lsn'. Updates are currently drived + /// only by walproposer. + pub peer_horizon_lsn: Lsn, + /// LSN of the oldest known checkpoint made by pageserver and successfully + /// pushed to s3. We don't remove WAL beyond it. Persisted only for + /// informational purposes, we receive it from pageserver (or broker). + pub remote_consistent_lsn: Lsn, + /// Peers and their state as we remember it. Knowing peers themselves is + /// fundamental; but state is saved here only for informational purposes and + /// obviously can be stale. (Currently not saved at all, but let's provision + /// place to have less file version upgrades). + pub peers: PersistedPeers, + /// Holds names of partial segments uploaded to remote storage. Used to + /// clean up old objects without leaving garbage in remote storage. 
+ pub partial_backup: wal_backup_partial::State, +} + pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result { // migrate to storing full term history if version == 1 { @@ -213,6 +262,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result Result Result Result Result Result SafeKeeperStateV8 { + assert!(state.eviction_state == EvictionState::Present); + SafeKeeperStateV8 { + tenant_id: state.tenant_id, + timeline_id: state.timeline_id, + acceptor_state: state.acceptor_state.clone(), + server: state.server.clone(), + proposer_uuid: state.proposer_uuid, + timeline_start_lsn: state.timeline_start_lsn, + local_start_lsn: state.local_start_lsn, + commit_lsn: state.commit_lsn, + backup_lsn: state.backup_lsn, + peer_horizon_lsn: state.peer_horizon_lsn, + remote_consistent_lsn: state.remote_consistent_lsn, + peers: state.peers.clone(), + partial_backup: state.partial_backup.clone(), + } +} + #[cfg(test)] mod tests { use std::str::FromStr; diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs index 51cf4db6b5..220988c3ce 100644 --- a/safekeeper/src/copy_timeline.rs +++ b/safekeeper/src/copy_timeline.rs @@ -15,7 +15,7 @@ use crate::{ control_file::{FileStorage, Storage}, pull_timeline::{create_temp_timeline_dir, load_temp_timeline, validate_temp_timeline}, state::TimelinePersistentState, - timeline::{FullAccessTimeline, Timeline, TimelineError}, + timeline::{Timeline, TimelineError, WalResidentTimeline}, wal_backup::copy_s3_segments, wal_storage::{wal_file_paths, WalReader}, GlobalTimelines, @@ -46,7 +46,7 @@ pub async fn handle_request(request: Request) -> Result<()> { } } - let source_tli = request.source.full_access_guard().await?; + let source_tli = request.source.wal_residence_guard().await?; let conf = &GlobalTimelines::get_global_config(); let ttid = request.destination_ttid; @@ -74,10 +74,16 @@ pub async fn handle_request(request: Request) -> Result<()> { assert!(flush_lsn >= start_lsn); if request.until_lsn > flush_lsn 
{ - bail!("requested LSN is beyond the end of the timeline"); + bail!(format!( + "requested LSN {} is beyond the end of the timeline {}", + request.until_lsn, flush_lsn + )); } if request.until_lsn < start_lsn { - bail!("requested LSN is before the start of the timeline"); + bail!(format!( + "requested LSN {} is before the start of the timeline {}", + request.until_lsn, start_lsn + )); } if request.until_lsn > commit_lsn { @@ -159,7 +165,7 @@ pub async fn handle_request(request: Request) -> Result<()> { } async fn copy_disk_segments( - tli: &FullAccessTimeline, + tli: &WalResidentTimeline, wal_seg_size: usize, start_lsn: Lsn, end_lsn: Lsn, @@ -183,7 +189,7 @@ async fn copy_disk_segments( let copy_end = copy_end - segment_start; let wal_file_path = { - let (normal, partial) = wal_file_paths(tli_dir_path, segment, wal_seg_size)?; + let (normal, partial) = wal_file_paths(tli_dir_path, segment, wal_seg_size); if segment == last_segment { partial diff --git a/safekeeper/src/debug_dump.rs b/safekeeper/src/debug_dump.rs index 062ff4b3db..15b0272cd9 100644 --- a/safekeeper/src/debug_dump.rs +++ b/safekeeper/src/debug_dump.rs @@ -28,7 +28,8 @@ use crate::send_wal::WalSenderState; use crate::state::TimelineMemState; use crate::state::TimelinePersistentState; use crate::timeline::get_timeline_dir; -use crate::timeline::FullAccessTimeline; +use crate::timeline::WalResidentTimeline; +use crate::timeline_manager; use crate::GlobalTimelines; use crate::SafeKeeperConf; @@ -168,6 +169,7 @@ pub struct Memory { pub last_removed_segno: XLogSegNo, pub epoch_start_lsn: Lsn, pub mem_state: TimelineMemState, + pub mgr_status: timeline_manager::Status, // PhysicalStorage state. 
pub write_lsn: Lsn, @@ -326,7 +328,7 @@ pub struct TimelineDigest { } pub async fn calculate_digest( - tli: &FullAccessTimeline, + tli: &WalResidentTimeline, request: TimelineDigestRequest, ) -> Result { if request.from_lsn > request.until_lsn { diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index f45bfb95fa..2c519433ef 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -143,7 +143,12 @@ impl postgres_backend::Handler self.tenant_id.unwrap_or(TenantId::from([0u8; 16])), self.timeline_id.unwrap_or(TimelineId::from([0u8; 16])), ); - tracing::Span::current().record("ttid", tracing::field::display(ttid)); + tracing::Span::current() + .record("ttid", tracing::field::display(ttid)) + .record( + "application_name", + tracing::field::debug(self.appname.clone()), + ); Ok(()) } else { diff --git a/safekeeper/src/http/client.rs b/safekeeper/src/http/client.rs index 0bb31c200d..c56f7880d4 100644 --- a/safekeeper/src/http/client.rs +++ b/safekeeper/src/http/client.rs @@ -10,7 +10,7 @@ use reqwest::{IntoUrl, Method, StatusCode}; use utils::{ http::error::HttpErrorBody, - id::{TenantId, TimelineId}, + id::{NodeId, TenantId, TimelineId}, logging::SecretString, }; @@ -97,10 +97,11 @@ impl Client { &self, tenant_id: TenantId, timeline_id: TimelineId, + stream_to: NodeId, ) -> Result { let uri = format!( - "{}/v1/tenant/{}/timeline/{}/snapshot", - self.mgmt_api_endpoint, tenant_id, timeline_id + "{}/v1/tenant/{}/timeline/{}/snapshot/{}", + self.mgmt_api_endpoint, tenant_id, timeline_id, stream_to.0 ); self.get(&uri).await } diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 3f2cd97ccd..d11815f6ef 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -114,6 +114,16 @@ fn check_permission(request: &Request, tenant_id: Option) -> Res }) } +/// List all (not deleted) timelines. 
+async fn timeline_list_handler(request: Request) -> Result, ApiError> { + check_permission(&request, None)?; + let res: Vec = GlobalTimelines::get_all() + .iter() + .map(|tli| tli.ttid) + .collect(); + json_response(StatusCode::OK, res) +} + /// Report info about timeline. async fn timeline_status_handler(request: Request) -> Result, ApiError> { let ttid = TenantTimelineId::new( @@ -205,6 +215,7 @@ async fn timeline_pull_handler(mut request: Request) -> Result) -> Result, ApiError> { + let destination = parse_request_param(&request, "destination_id")?; let ttid = TenantTimelineId::new( parse_request_param(&request, "tenant_id")?, parse_request_param(&request, "timeline_id")?, @@ -214,10 +225,10 @@ async fn timeline_snapshot_handler(request: Request) -> Result) -> Result) -> Result) -> Result RouterBuilder .post("/v1/tenant/timeline", |r| { request_span(r, timeline_create_handler) }) + .get("/v1/tenant/timeline", |r| { + request_span(r, timeline_list_handler) + }) .get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { request_span(r, timeline_status_handler) }) @@ -565,7 +585,7 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder request_span(r, tenant_delete_handler) }) .get( - "/v1/tenant/:tenant_id/timeline/:timeline_id/snapshot", + "/v1/tenant/:tenant_id/timeline/:timeline_id/snapshot/:destination_id", |r| request_span(r, timeline_snapshot_handler), ) .post("/v1/pull_timeline", |r| { diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index 27e54776e0..7fe924a08e 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -21,7 +21,7 @@ use crate::safekeeper::{ }; use crate::safekeeper::{Term, TermHistory, TermLsn}; use crate::state::TimelinePersistentState; -use crate::timeline::FullAccessTimeline; +use crate::timeline::WalResidentTimeline; use crate::GlobalTimelines; use postgres_backend::PostgresBackend; use postgres_ffi::encode_logical_message; @@ -102,7 +102,7 @@ pub async fn handle_json_ctrl( async fn 
prepare_safekeeper( ttid: TenantTimelineId, pg_version: u32, -) -> anyhow::Result { +) -> anyhow::Result { let tli = GlobalTimelines::create( ttid, ServerInfo { @@ -115,11 +115,11 @@ async fn prepare_safekeeper( ) .await?; - tli.full_access_guard().await + tli.wal_residence_guard().await } async fn send_proposer_elected( - tli: &FullAccessTimeline, + tli: &WalResidentTimeline, term: Term, lsn: Lsn, ) -> anyhow::Result<()> { @@ -151,7 +151,7 @@ pub struct InsertedWAL { /// Extend local WAL with new LogicalMessage record. To do that, /// create AppendRequest with new WAL and pass it to safekeeper. pub async fn append_logical_message( - tli: &FullAccessTimeline, + tli: &WalResidentTimeline, msg: &AppendLogicalMessage, ) -> anyhow::Result { let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message); diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index cbd67f0064..2e11a279ca 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -21,6 +21,7 @@ pub mod json_ctrl; pub mod metrics; pub mod patch_control_file; pub mod pull_timeline; +pub mod rate_limit; pub mod receive_wal; pub mod recovery; pub mod remove_wal; @@ -28,6 +29,8 @@ pub mod safekeeper; pub mod send_wal; pub mod state; pub mod timeline; +pub mod timeline_eviction; +pub mod timeline_guard; pub mod timeline_manager; pub mod timelines_set; pub mod wal_backup; @@ -49,6 +52,14 @@ pub mod defaults { pub const DEFAULT_HEARTBEAT_TIMEOUT: &str = "5000ms"; pub const DEFAULT_MAX_OFFLOADER_LAG_BYTES: u64 = 128 * (1 << 20); pub const DEFAULT_PARTIAL_BACKUP_TIMEOUT: &str = "15m"; + pub const DEFAULT_CONTROL_FILE_SAVE_INTERVAL: &str = "300s"; + pub const DEFAULT_PARTIAL_BACKUP_CONCURRENCY: &str = "5"; + pub const DEFAULT_EVICTION_CONCURRENCY: usize = 2; + + // By default, our required residency before eviction is the same as the period that passes + // before uploading a partial segment, so that in normal operation the eviction can happen + // as soon as we have done the partial segment 
upload. + pub const DEFAULT_EVICTION_MIN_RESIDENT: &str = DEFAULT_PARTIAL_BACKUP_TIMEOUT; } #[derive(Debug, Clone)] @@ -82,9 +93,13 @@ pub struct SafeKeeperConf { pub sk_auth_token: Option, pub current_thread_runtime: bool, pub walsenders_keep_horizon: bool, - pub partial_backup_enabled: bool, pub partial_backup_timeout: Duration, pub disable_periodic_broker_push: bool, + pub enable_offload: bool, + pub delete_offloaded_wal: bool, + pub control_file_save_interval: Duration, + pub partial_backup_concurrency: usize, + pub eviction_min_resident: Duration, } impl SafeKeeperConf { @@ -121,9 +136,13 @@ impl SafeKeeperConf { max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES, current_thread_runtime: false, walsenders_keep_horizon: false, - partial_backup_enabled: false, partial_backup_timeout: Duration::from_secs(0), disable_periodic_broker_push: false, + enable_offload: false, + delete_offloaded_wal: false, + control_file_save_interval: Duration::from_secs(1), + partial_backup_concurrency: 1, + eviction_min_resident: Duration::ZERO, } } } @@ -154,15 +173,6 @@ pub static BROKER_RUNTIME: Lazy = Lazy::new(|| { .expect("Failed to create broker runtime") }); -pub static WAL_REMOVER_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("WAL remover") - .worker_threads(1) - .enable_all() - .build() - .expect("Failed to create broker runtime") -}); - pub static WAL_BACKUP_RUNTIME: Lazy = Lazy::new(|| { tokio::runtime::Builder::new_multi_thread() .thread_name("WAL backup worker") @@ -170,12 +180,3 @@ pub static WAL_BACKUP_RUNTIME: Lazy = Lazy::new(|| { .build() .expect("Failed to create WAL backup runtime") }); - -pub static METRICS_SHIFTER_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("metric shifter") - .worker_threads(1) - .enable_all() - .build() - .expect("Failed to create broker runtime") -}); diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 
1e965393e3..aa2bafbe92 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -5,15 +5,15 @@ use std::{ time::{Instant, SystemTime}, }; -use ::metrics::{register_histogram, GaugeVec, Histogram, IntGauge, DISK_WRITE_SECONDS_BUCKETS}; +use ::metrics::{register_histogram, GaugeVec, Histogram, IntGauge, DISK_FSYNC_SECONDS_BUCKETS}; use anyhow::Result; use futures::Future; use metrics::{ core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts}, proto::MetricFamily, - register_int_counter, register_int_counter_pair, register_int_counter_pair_vec, - register_int_counter_vec, Gauge, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec, - IntGaugeVec, + register_histogram_vec, register_int_counter, register_int_counter_pair, + register_int_counter_pair_vec, register_int_counter_vec, Gauge, HistogramVec, IntCounter, + IntCounterPair, IntCounterPairVec, IntCounterVec, IntGaugeVec, }; use once_cell::sync::Lazy; @@ -48,7 +48,7 @@ pub static WRITE_WAL_SECONDS: Lazy = Lazy::new(|| { register_histogram!( "safekeeper_write_wal_seconds", "Seconds spent writing and syncing WAL to a disk in a single request", - DISK_WRITE_SECONDS_BUCKETS.to_vec() + DISK_FSYNC_SECONDS_BUCKETS.to_vec() ) .expect("Failed to register safekeeper_write_wal_seconds histogram") }); @@ -56,7 +56,7 @@ pub static FLUSH_WAL_SECONDS: Lazy = Lazy::new(|| { register_histogram!( "safekeeper_flush_wal_seconds", "Seconds spent syncing WAL to a disk", - DISK_WRITE_SECONDS_BUCKETS.to_vec() + DISK_FSYNC_SECONDS_BUCKETS.to_vec() ) .expect("Failed to register safekeeper_flush_wal_seconds histogram") }); @@ -64,10 +64,28 @@ pub static PERSIST_CONTROL_FILE_SECONDS: Lazy = Lazy::new(|| { register_histogram!( "safekeeper_persist_control_file_seconds", "Seconds to persist and sync control file", - DISK_WRITE_SECONDS_BUCKETS.to_vec() + DISK_FSYNC_SECONDS_BUCKETS.to_vec() ) .expect("Failed to register safekeeper_persist_control_file_seconds histogram vec") }); +pub static 
WAL_STORAGE_OPERATION_SECONDS: Lazy = Lazy::new(|| { + register_histogram_vec!( + "safekeeper_wal_storage_operation_seconds", + "Seconds spent on WAL storage operations", + &["operation"], + DISK_FSYNC_SECONDS_BUCKETS.to_vec() + ) + .expect("Failed to register safekeeper_wal_storage_operation_seconds histogram vec") +}); +pub static MISC_OPERATION_SECONDS: Lazy = Lazy::new(|| { + register_histogram_vec!( + "safekeeper_misc_operation_seconds", + "Seconds spent on miscellaneous operations", + &["operation"], + DISK_FSYNC_SECONDS_BUCKETS.to_vec() + ) + .expect("Failed to register safekeeper_misc_operation_seconds histogram vec") +}); pub static PG_IO_BYTES: Lazy = Lazy::new(|| { register_int_counter_vec!( "safekeeper_pg_io_bytes_total", @@ -126,7 +144,7 @@ pub static BROKER_PUSH_ALL_UPDATES_SECONDS: Lazy = Lazy::new(|| { register_histogram!( "safekeeper_broker_push_update_seconds", "Seconds to push all timeline updates to the broker", - DISK_WRITE_SECONDS_BUCKETS.to_vec() + DISK_FSYNC_SECONDS_BUCKETS.to_vec() ) .expect("Failed to register safekeeper_broker_push_update_seconds histogram vec") }); @@ -187,6 +205,32 @@ pub static WAL_BACKUP_TASKS: Lazy = Lazy::new(|| { .expect("Failed to register safekeeper_wal_backup_tasks_finished_total counter") }); +// Metrics collected on operations on the storage repository. 
+#[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)] +#[strum(serialize_all = "kebab_case")] +pub(crate) enum EvictionEvent { + Evict, + Restore, +} + +pub(crate) static EVICTION_EVENTS_STARTED: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "safekeeper_eviction_events_started_total", + "Number of eviction state changes, incremented when they start", + &["kind"] + ) + .expect("Failed to register metric") +}); + +pub(crate) static EVICTION_EVENTS_COMPLETED: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "safekeeper_eviction_events_completed_total", + "Number of eviction state changes, incremented when they complete", + &["kind"] + ) + .expect("Failed to register metric") +}); + pub const LABEL_UNKNOWN: &str = "unknown"; /// Labels for traffic metrics. diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 66c41f65ff..1eacec9981 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -11,13 +11,8 @@ use std::{ io::{self, ErrorKind}, sync::Arc, }; -use tokio::{ - fs::{File, OpenOptions}, - io::AsyncWrite, - sync::mpsc, - task, -}; -use tokio_tar::{Archive, Builder}; +use tokio::{fs::OpenOptions, io::AsyncWrite, sync::mpsc, task}; +use tokio_tar::{Archive, Builder, Header}; use tokio_util::{ io::{CopyToBytes, SinkWriter}, sync::PollSender, @@ -32,13 +27,15 @@ use crate::{ routes::TimelineStatus, }, safekeeper::Term, - timeline::{get_tenant_dir, get_timeline_dir, FullAccessTimeline, Timeline, TimelineError}, + state::TimelinePersistentState, + timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError, WalResidentTimeline}, + wal_backup, wal_storage::{self, open_wal_file, Storage}, GlobalTimelines, SafeKeeperConf, }; use utils::{ crashsafe::{durable_rename, fsync_async_opt}, - id::{TenantId, TenantTimelineId, TimelineId}, + id::{NodeId, TenantId, TenantTimelineId, TimelineId}, logging::SecretString, lsn::Lsn, pausable_failpoint, @@ -46,8 +43,13 @@ use 
utils::{ /// Stream tar archive of timeline to tx. #[instrument(name = "snapshot", skip_all, fields(ttid = %tli.ttid))] -pub async fn stream_snapshot(tli: FullAccessTimeline, tx: mpsc::Sender>) { - if let Err(e) = stream_snapshot_guts(tli, tx.clone()).await { +pub async fn stream_snapshot( + tli: WalResidentTimeline, + source: NodeId, + destination: NodeId, + tx: mpsc::Sender>, +) { + if let Err(e) = stream_snapshot_guts(tli, source, destination, tx.clone()).await { // Error type/contents don't matter as they won't can't reach the client // (hyper likely doesn't do anything with it), but http stream will be // prematurely terminated. It would be nice to try to send the error in @@ -66,7 +68,7 @@ pub struct SnapshotContext { pub flush_lsn: Lsn, pub wal_seg_size: usize, // used to remove WAL hold off in Drop. - pub tli: FullAccessTimeline, + pub tli: WalResidentTimeline, } impl Drop for SnapshotContext { @@ -80,7 +82,9 @@ impl Drop for SnapshotContext { } pub async fn stream_snapshot_guts( - tli: FullAccessTimeline, + tli: WalResidentTimeline, + source: NodeId, + destination: NodeId, tx: mpsc::Sender>, ) -> Result<()> { // tokio-tar wants Write implementor, but we have mpsc tx >; @@ -104,7 +108,7 @@ pub async fn stream_snapshot_guts( // which is also likely suboptimal. 
let mut ar = Builder::new_non_terminated(pinned_writer); - let bctx = tli.start_snapshot(&mut ar).await?; + let bctx = tli.start_snapshot(&mut ar, source, destination).await?; pausable_failpoint!("sk-snapshot-after-list-pausable"); let tli_dir = tli.get_timeline_dir(); @@ -135,7 +139,7 @@ pub async fn stream_snapshot_guts( Ok(()) } -impl FullAccessTimeline { +impl WalResidentTimeline { /// Start streaming tar archive with timeline: /// 1) stream control file under lock; /// 2) hold off WAL removal; @@ -158,12 +162,43 @@ impl FullAccessTimeline { async fn start_snapshot( &self, ar: &mut tokio_tar::Builder, + source: NodeId, + destination: NodeId, ) -> Result { let mut shared_state = self.write_shared_state().await; + let wal_seg_size = shared_state.get_wal_seg_size(); - let cf_path = self.get_timeline_dir().join(CONTROL_FILE_NAME); - let mut cf = File::open(cf_path).await?; - ar.append_file(CONTROL_FILE_NAME, &mut cf).await?; + let mut control_store = TimelinePersistentState::clone(shared_state.sk.state()); + // Modify the partial segment of the in-memory copy for the control file to + // point to the destination safekeeper. + let replace = control_store + .partial_backup + .replace_uploaded_segment(source, destination)?; + + if let Some(replace) = replace { + // The deserialized control file has an uploaded partial. We upload a copy + // of it to object storage for the destination safekeeper and send an updated + // control file in the snapshot. 
+ tracing::info!( + "Replacing uploaded partial segment in in-mem control file: {replace:?}" + ); + + let remote_timeline_path = wal_backup::remote_timeline_path(&self.tli.ttid)?; + wal_backup::copy_partial_segment( + &replace.previous.remote_path(&remote_timeline_path), + &replace.current.remote_path(&remote_timeline_path), + ) + .await?; + } + + let buf = control_store + .write_to_buf() + .with_context(|| "failed to serialize control store")?; + let mut header = Header::new_gnu(); + header.set_size(buf.len().try_into().expect("never breaches u64")); + ar.append_data(&mut header, CONTROL_FILE_NAME, buf.as_slice()) + .await + .with_context(|| "failed to append to archive")?; // We need to stream since the oldest segment someone (s3 or pageserver) // still needs. This duplicates calc_horizon_lsn logic. @@ -173,19 +208,19 @@ impl FullAccessTimeline { // lock and setting `wal_removal_on_hold` later, it guarantees that WAL // won't be removed until we're done. let from_lsn = min( - shared_state.sk.state.remote_consistent_lsn, - shared_state.sk.state.backup_lsn, + shared_state.sk.state().remote_consistent_lsn, + shared_state.sk.state().backup_lsn, ); if from_lsn == Lsn::INVALID { // this is possible if snapshot is called before handling first // elected message bail!("snapshot is called on uninitialized timeline"); } - let from_segno = from_lsn.segment_number(shared_state.get_wal_seg_size()); - let term = shared_state.sk.get_term(); - let last_log_term = shared_state.sk.get_last_log_term(); + let from_segno = from_lsn.segment_number(wal_seg_size); + let term = shared_state.sk.state().acceptor_state.term; + let last_log_term = shared_state.sk.last_log_term(); let flush_lsn = shared_state.sk.flush_lsn(); - let upto_segno = flush_lsn.segment_number(shared_state.get_wal_seg_size()); + let upto_segno = flush_lsn.segment_number(wal_seg_size); // have some limit on max number of segments as a sanity check const MAX_ALLOWED_SEGS: u64 = 1000; let num_segs = upto_segno - 
from_segno + 1; @@ -206,14 +241,18 @@ impl FullAccessTimeline { } shared_state.wal_removal_on_hold = true; + // Drop shared_state to release the lock, before calling wal_residence_guard(). + drop(shared_state); + + let tli_copy = self.wal_residence_guard().await?; let bctx = SnapshotContext { from_segno, upto_segno, term, last_log_term, flush_lsn, - wal_seg_size: shared_state.get_wal_seg_size(), - tli: self.clone(), + wal_seg_size, + tli: tli_copy, }; Ok(bctx) @@ -225,8 +264,8 @@ impl FullAccessTimeline { /// forget this if snapshotting fails mid the way. pub async fn finish_snapshot(&self, bctx: &SnapshotContext) -> Result<()> { let shared_state = self.read_shared_state().await; - let term = shared_state.sk.get_term(); - let last_log_term = shared_state.sk.get_last_log_term(); + let term = shared_state.sk.state().acceptor_state.term; + let last_log_term = shared_state.sk.last_log_term(); // There are some cases to relax this check (e.g. last_log_term might // change, but as long as older history is strictly part of new that's // fine), but there is no need to do it. @@ -337,7 +376,7 @@ async fn pull_timeline( let client = Client::new(host.clone(), sk_auth_token.clone()); // Request stream with basebackup archive. let bb_resp = client - .snapshot(status.tenant_id, status.timeline_id) + .snapshot(status.tenant_id, status.timeline_id, conf.my_id) .await?; // Make Stream of Bytes from it... diff --git a/safekeeper/src/rate_limit.rs b/safekeeper/src/rate_limit.rs new file mode 100644 index 0000000000..72373b5786 --- /dev/null +++ b/safekeeper/src/rate_limit.rs @@ -0,0 +1,49 @@ +use std::sync::Arc; + +use rand::Rng; + +use crate::metrics::MISC_OPERATION_SECONDS; + +/// Global rate limiter for background tasks. +#[derive(Clone)] +pub struct RateLimiter { + partial_backup: Arc, + eviction: Arc, +} + +impl RateLimiter { + /// Create a new rate limiter. + /// - `partial_backup_max`: maximum number of concurrent partial backups. 
+ /// - `eviction_max`: maximum number of concurrent timeline evictions. + pub fn new(partial_backup_max: usize, eviction_max: usize) -> Self { + Self { + partial_backup: Arc::new(tokio::sync::Semaphore::new(partial_backup_max)), + eviction: Arc::new(tokio::sync::Semaphore::new(eviction_max)), + } + } + + /// Get a permit for partial backup. This will block if the maximum number of concurrent + /// partial backups is reached. + pub async fn acquire_partial_backup(&self) -> tokio::sync::OwnedSemaphorePermit { + let _timer = MISC_OPERATION_SECONDS + .with_label_values(&["partial_permit_acquire"]) + .start_timer(); + self.partial_backup + .clone() + .acquire_owned() + .await + .expect("semaphore is closed") + } + + /// Try to get a permit for timeline eviction. This will return None if the maximum number of + /// concurrent timeline evictions is reached. + pub fn try_acquire_eviction(&self) -> Option { + self.eviction.clone().try_acquire_owned().ok() + } +} + +/// Generate a random duration that is a fraction of the given duration. 
+pub fn rand_duration(duration: &std::time::Duration) -> std::time::Duration { + let randf64 = rand::thread_rng().gen_range(0.0..1.0); + duration.mul_f64(randf64) +} diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 7943a2fd86..ab8c76dc17 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -6,7 +6,7 @@ use crate::handler::SafekeeperPostgresHandler; use crate::safekeeper::AcceptorProposerMessage; use crate::safekeeper::ProposerAcceptorMessage; use crate::safekeeper::ServerInfo; -use crate::timeline::FullAccessTimeline; +use crate::timeline::WalResidentTimeline; use crate::wal_service::ConnectionId; use crate::GlobalTimelines; use anyhow::{anyhow, Context}; @@ -213,7 +213,7 @@ impl SafekeeperPostgresHandler { &mut self, pgb: &mut PostgresBackend, ) -> Result<(), QueryError> { - let mut tli: Option = None; + let mut tli: Option = None; if let Err(end) = self.handle_start_wal_push_guts(pgb, &mut tli).await { // Log the result and probably send it to the client, closing the stream. let handle_end_fut = pgb.handle_copy_stream_end(end); @@ -233,7 +233,7 @@ impl SafekeeperPostgresHandler { pub async fn handle_start_wal_push_guts( &mut self, pgb: &mut PostgresBackend, - tli: &mut Option, + tli: &mut Option, ) -> Result<(), CopyStreamHandlerEnd> { // Notify the libpq client that it's allowed to send `CopyData` messages pgb.write_message(&BeMessage::CopyBothResponse).await?; @@ -269,11 +269,11 @@ impl SafekeeperPostgresHandler { .get_walreceivers() .pageserver_feedback_tx .subscribe(); - *tli = Some(timeline.clone()); + *tli = Some(timeline.wal_residence_guard().await?); tokio::select! 
{ // todo: add read|write .context to these errors - r = network_reader.run(msg_tx, msg_rx, reply_tx, timeline.clone(), next_msg) => r, + r = network_reader.run(msg_tx, msg_rx, reply_tx, timeline, next_msg) => r, r = network_write(pgb, reply_rx, pageserver_feedback_rx) => r, } } else { @@ -323,7 +323,7 @@ struct NetworkReader<'a, IO> { impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { async fn read_first_message( &mut self, - ) -> Result<(FullAccessTimeline, ProposerAcceptorMessage), CopyStreamHandlerEnd> { + ) -> Result<(WalResidentTimeline, ProposerAcceptorMessage), CopyStreamHandlerEnd> { // Receive information about server to create timeline, if not yet. let next_msg = read_message(self.pgb_reader).await?; let tli = match next_msg { @@ -340,7 +340,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { let tli = GlobalTimelines::create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID) .await?; - tli.full_access_guard().await? + tli.wal_residence_guard().await? } _ => { return Err(CopyStreamHandlerEnd::Other(anyhow::anyhow!( @@ -356,7 +356,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { msg_tx: Sender, msg_rx: Receiver, reply_tx: Sender, - tli: FullAccessTimeline, + tli: WalResidentTimeline, next_msg: ProposerAcceptorMessage, ) -> Result<(), CopyStreamHandlerEnd> { *self.acceptor_handle = Some(WalAcceptor::spawn( @@ -451,7 +451,7 @@ const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1); /// replies to reply_tx; reading from socket and writing to disk in parallel is /// beneficial for performance, this struct provides writing to disk part. pub struct WalAcceptor { - tli: FullAccessTimeline, + tli: WalResidentTimeline, msg_rx: Receiver, reply_tx: Sender, conn_id: Option, @@ -464,7 +464,7 @@ impl WalAcceptor { /// /// conn_id None means WalAcceptor is used by recovery initiated at this safekeeper. 
pub fn spawn( - tli: FullAccessTimeline, + tli: WalResidentTimeline, msg_rx: Receiver, reply_tx: Sender, conn_id: Option, diff --git a/safekeeper/src/recovery.rs b/safekeeper/src/recovery.rs index 80a630b1e1..a59ff07b96 100644 --- a/safekeeper/src/recovery.rs +++ b/safekeeper/src/recovery.rs @@ -21,7 +21,7 @@ use utils::{id::NodeId, lsn::Lsn, postgres_client::wal_stream_connection_config} use crate::receive_wal::{WalAcceptor, REPLY_QUEUE_SIZE}; use crate::safekeeper::{AppendRequest, AppendRequestHeader}; -use crate::timeline::FullAccessTimeline; +use crate::timeline::WalResidentTimeline; use crate::{ http::routes::TimelineStatus, receive_wal::MSG_QUEUE_SIZE, @@ -36,7 +36,7 @@ use crate::{ /// Entrypoint for per timeline task which always runs, checking whether /// recovery for this safekeeper is needed and starting it if so. #[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))] -pub async fn recovery_main(tli: FullAccessTimeline, conf: SafeKeeperConf) { +pub async fn recovery_main(tli: WalResidentTimeline, conf: SafeKeeperConf) { info!("started"); let cancel = tli.cancel.clone(); @@ -66,12 +66,12 @@ pub async fn recovery_main(tli: FullAccessTimeline, conf: SafeKeeperConf) { /// depending on assembled quorum (e.g. classic picture 8 from Raft paper). /// Thus we don't try to predict it here. async fn recovery_needed( - tli: &FullAccessTimeline, + tli: &WalResidentTimeline, heartbeat_timeout: Duration, ) -> RecoveryNeededInfo { let ss = tli.read_shared_state().await; - let term = ss.sk.state.acceptor_state.term; - let last_log_term = ss.sk.get_last_log_term(); + let term = ss.sk.state().acceptor_state.term; + let last_log_term = ss.sk.last_log_term(); let flush_lsn = ss.sk.flush_lsn(); // note that peers contain myself, but that's ok -- we are interested only in peers which are strictly ahead of us. 
let mut peers = ss.get_peers(heartbeat_timeout); @@ -195,7 +195,7 @@ impl From<&PeerInfo> for Donor { const CHECK_INTERVAL_MS: u64 = 2000; /// Check regularly whether we need to start recovery. -async fn recovery_main_loop(tli: FullAccessTimeline, conf: SafeKeeperConf) { +async fn recovery_main_loop(tli: WalResidentTimeline, conf: SafeKeeperConf) { let check_duration = Duration::from_millis(CHECK_INTERVAL_MS); loop { let recovery_needed_info = recovery_needed(&tli, conf.heartbeat_timeout).await; @@ -205,7 +205,12 @@ async fn recovery_main_loop(tli: FullAccessTimeline, conf: SafeKeeperConf) { "starting recovery from donor {}: {}", donor.sk_id, recovery_needed_info ); - match recover(tli.clone(), donor, &conf).await { + let res = tli.wal_residence_guard().await; + if let Err(e) = res { + warn!("failed to obtain guard: {}", e); + continue; + } + match recover(res.unwrap(), donor, &conf).await { // Note: 'write_wal rewrites WAL written before' error is // expected here and might happen if compute and recovery // concurrently write the same data. Eventually compute @@ -228,7 +233,7 @@ async fn recovery_main_loop(tli: FullAccessTimeline, conf: SafeKeeperConf) { /// Recover from the specified donor. Returns message explaining normal finish /// reason or error. async fn recover( - tli: FullAccessTimeline, + tli: WalResidentTimeline, donor: &Donor, conf: &SafeKeeperConf, ) -> anyhow::Result { @@ -314,7 +319,7 @@ async fn recover( // Pull WAL from donor, assuming handshake is already done. async fn recovery_stream( - tli: FullAccessTimeline, + tli: WalResidentTimeline, donor: &Donor, start_streaming_at: Lsn, conf: &SafeKeeperConf, @@ -364,10 +369,10 @@ async fn recovery_stream( // As in normal walreceiver, do networking and writing to disk in parallel. 
let (msg_tx, msg_rx) = channel(MSG_QUEUE_SIZE); let (reply_tx, reply_rx) = channel(REPLY_QUEUE_SIZE); - let wa = WalAcceptor::spawn(tli.clone(), msg_rx, reply_tx, None); + let wa = WalAcceptor::spawn(tli.wal_residence_guard().await?, msg_rx, reply_tx, None); let res = tokio::select! { - r = network_io(physical_stream, msg_tx, donor.clone(), tli.clone(), conf.clone()) => r, + r = network_io(physical_stream, msg_tx, donor.clone(), tli, conf.clone()) => r, r = read_replies(reply_rx, donor.term) => r.map(|()| None), }; @@ -398,7 +403,7 @@ async fn network_io( physical_stream: ReplicationStream, msg_tx: Sender, donor: Donor, - tli: FullAccessTimeline, + tli: WalResidentTimeline, conf: SafeKeeperConf, ) -> anyhow::Result> { let mut physical_stream = pin!(physical_stream); diff --git a/safekeeper/src/remove_wal.rs b/safekeeper/src/remove_wal.rs index b661e48cb5..16239d847b 100644 --- a/safekeeper/src/remove_wal.rs +++ b/safekeeper/src/remove_wal.rs @@ -8,7 +8,7 @@ use crate::timeline_manager::StateSnapshot; /// While it is safe to use inmem values for determining horizon, /// we use persistent to make possible normal states less surprising. /// All segments covering LSNs before horizon_lsn can be removed. -pub fn calc_horizon_lsn(state: &StateSnapshot, extra_horizon_lsn: Option) -> Lsn { +pub(crate) fn calc_horizon_lsn(state: &StateSnapshot, extra_horizon_lsn: Option) -> Lsn { use std::cmp::min; let mut horizon_lsn = min( diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index ae230960ae..0814d9ba67 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -15,6 +15,7 @@ use storage_broker::proto::SafekeeperTimelineInfo; use tracing::*; use crate::control_file; +use crate::metrics::MISC_OPERATION_SECONDS; use crate::send_wal::HotStandbyFeedback; use crate::state::TimelineState; @@ -91,7 +92,7 @@ impl TermHistory { } /// Find point of divergence between leader (walproposer) term history and - /// safekeeper. 
Arguments are not symmetrics as proposer history ends at + /// safekeeper. Arguments are not symmetric as proposer history ends at /// +infinity while safekeeper at flush_lsn. /// C version is at walproposer SendProposerElected. pub fn find_highest_common_point( @@ -499,7 +500,11 @@ where /// Accepts a control file storage containing the safekeeper state. /// State must be initialized, i.e. contain filled `tenant_id`, `timeline_id` /// and `server` (`wal_seg_size` inside it) fields. - pub fn new(state: CTRL, wal_store: WAL, node_id: NodeId) -> Result> { + pub fn new( + state: TimelineState, + wal_store: WAL, + node_id: NodeId, + ) -> Result> { if state.tenant_id == TenantId::from([0u8; 16]) || state.timeline_id == TimelineId::from([0u8; 16]) { @@ -512,7 +517,7 @@ where Ok(SafeKeeper { term_start_lsn: Lsn(0), - state: TimelineState::new(state), + state, wal_store, node_id, }) @@ -526,11 +531,6 @@ where .up_to(self.flush_lsn()) } - /// Get current term. - pub fn get_term(&self) -> Term { - self.state.acceptor_state.term - } - pub fn get_last_log_term(&self) -> Term { self.state .acceptor_state @@ -697,7 +697,17 @@ where &mut self, msg: &ProposerElected, ) -> Result> { - info!("received ProposerElected {:?}", msg); + let _timer = MISC_OPERATION_SECONDS + .with_label_values(&["handle_elected"]) + .start_timer(); + + info!( + "received ProposerElected {:?}, term={}, last_log_term={}, flush_lsn={}", + msg, + self.state.acceptor_state.term, + self.get_last_log_term(), + self.flush_lsn() + ); if self.state.acceptor_state.term < msg.term { let mut state = self.state.start_change(); state.acceptor_state.term = msg.term; @@ -709,22 +719,43 @@ where return Ok(None); } - // This might happen in a rare race when another (old) connection from - // the same walproposer writes + flushes WAL after this connection - // already sent flush_lsn in VoteRequest. 
It is generally safe to - // proceed, but to prevent commit_lsn surprisingly going down we should - // either refuse the session (simpler) or skip the part we already have - // from the stream (can be implemented). - if msg.term == self.get_last_log_term() && self.flush_lsn() > msg.start_streaming_at { - bail!("refusing ProposerElected which is going to overwrite correct WAL: term={}, flush_lsn={}, start_streaming_at={}; restarting the handshake should help", - msg.term, self.flush_lsn(), msg.start_streaming_at) + // Before truncating WAL, cross-check the divergence point received + // from the walproposer. + let sk_th = self.get_term_history(); + let last_common_point = match TermHistory::find_highest_common_point( + &msg.term_history, + &sk_th, + self.flush_lsn(), + ) { + // No common point. Expect streaming from the beginning of the + // history like walproposer while we don't have proper init. + None => *msg.term_history.0.first().ok_or(anyhow::anyhow!( + "empty walproposer term history {:?}", + msg.term_history + ))?, + Some(lcp) => lcp, + }; + // This is expected to happen in a rare race when another connection + // from the same walproposer writes + flushes WAL after this connection + // sent flush_lsn in VoteRequest; for instance, very late + // ProposerElected message delivery after another connection was + // established and wrote WAL. In such cases error is transient; + // reconnection makes safekeeper send newest term history and flush_lsn + // and walproposer recalculates the streaming point. OTOH repeating + // error indicates a serious bug. + if last_common_point.lsn != msg.start_streaming_at { + bail!("refusing ProposerElected with unexpected truncation point: lcp={:?} start_streaming_at={}, term={}, sk_th={:?} flush_lsn={}, wp_th={:?}", + last_common_point, msg.start_streaming_at, + self.state.acceptor_state.term, sk_th, self.flush_lsn(), msg.term_history, + ); } - // Otherwise we must never attempt to truncate committed data.
+ + // We are also expected to never attempt to truncate committed data. assert!( msg.start_streaming_at >= self.state.inmem.commit_lsn, - "attempt to truncate committed data: start_streaming_at={}, commit_lsn={}", - msg.start_streaming_at, - self.state.inmem.commit_lsn + "attempt to truncate committed data: start_streaming_at={}, commit_lsn={}, term={}, sk_th={:?} flush_lsn={}, wp_th={:?}", + msg.start_streaming_at, self.state.inmem.commit_lsn, + self.state.acceptor_state.term, sk_th, self.flush_lsn(), msg.term_history, ); // Before first WAL write initialize its segment. It makes first segment @@ -739,9 +770,6 @@ where .await?; } - // TODO: cross check divergence point, check if msg.start_streaming_at corresponds to - // intersection of our history and history from msg - // truncate wal, update the LSNs self.wal_store.truncate_wal(msg.start_streaming_at).await?; @@ -912,10 +940,8 @@ where ))) } - /// Update timeline state with peer safekeeper data. + /// Update commit_lsn from peer safekeeper data. 
pub async fn record_safekeeper_info(&mut self, sk_info: &SafekeeperTimelineInfo) -> Result<()> { - let mut sync_control_file = false; - if (Lsn(sk_info.commit_lsn) != Lsn::INVALID) && (sk_info.last_log_term != INVALID_TERM) { // Note: the check is too restrictive, generally we can update local // commit_lsn if our history matches (is part of) history of advanced @@ -924,29 +950,6 @@ where self.update_commit_lsn(Lsn(sk_info.commit_lsn)).await?; } } - - self.state.inmem.backup_lsn = max(Lsn(sk_info.backup_lsn), self.state.inmem.backup_lsn); - sync_control_file |= self.state.backup_lsn + (self.state.server.wal_seg_size as u64) - < self.state.inmem.backup_lsn; - - self.state.inmem.remote_consistent_lsn = max( - Lsn(sk_info.remote_consistent_lsn), - self.state.inmem.remote_consistent_lsn, - ); - sync_control_file |= self.state.remote_consistent_lsn - + (self.state.server.wal_seg_size as u64) - < self.state.inmem.remote_consistent_lsn; - - self.state.inmem.peer_horizon_lsn = max( - Lsn(sk_info.peer_horizon_lsn), - self.state.inmem.peer_horizon_lsn, - ); - sync_control_file |= self.state.peer_horizon_lsn + (self.state.server.wal_seg_size as u64) - < self.state.inmem.peer_horizon_lsn; - - if sync_control_file { - self.state.flush().await?; - } Ok(()) } } @@ -958,7 +961,7 @@ mod tests { use super::*; use crate::{ - state::{PersistedPeers, TimelinePersistentState}, + state::{EvictionState, PersistedPeers, TimelinePersistentState}, wal_storage::Storage, }; use std::{ops::Deref, str::FromStr, time::Instant}; @@ -1039,7 +1042,7 @@ mod tests { persisted_state: test_sk_state(), }; let wal_store = DummyWalStore { lsn: Lsn(0) }; - let mut sk = SafeKeeper::new(storage, wal_store, NodeId(0)).unwrap(); + let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap(); // check voting for 1 is ok let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 }); @@ -1055,7 +1058,7 @@ mod tests { persisted_state: state, }; - sk = 
SafeKeeper::new(storage, sk.wal_store, NodeId(0)).unwrap(); + sk = SafeKeeper::new(TimelineState::new(storage), sk.wal_store, NodeId(0)).unwrap(); // and ensure voting second time for 1 is not ok vote_resp = sk.process_msg(&vote_request).await; @@ -1072,7 +1075,7 @@ mod tests { }; let wal_store = DummyWalStore { lsn: Lsn(0) }; - let mut sk = SafeKeeper::new(storage, wal_store, NodeId(0)).unwrap(); + let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap(); let mut ar_hdr = AppendRequestHeader { term: 1, @@ -1090,7 +1093,7 @@ mod tests { let pem = ProposerElected { term: 1, - start_streaming_at: Lsn(1), + start_streaming_at: Lsn(3), term_history: TermHistory(vec![TermLsn { term: 1, lsn: Lsn(3), @@ -1225,6 +1228,7 @@ mod tests { }, )]), partial_backup: crate::wal_backup_partial::State::default(), + eviction_state: EvictionState::Present, }; let ser = state.ser().unwrap(); @@ -1272,6 +1276,8 @@ mod tests { 0xb0, 0x01, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, // partial_backup 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + // eviction_state + 0x00, 0x00, 0x00, 0x00, ]; assert_eq!(Hex(&ser), Hex(&expected)); diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index df75893838..90b1604adb 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -5,7 +5,7 @@ use crate::handler::SafekeeperPostgresHandler; use crate::metrics::RECEIVED_PS_FEEDBACKS; use crate::receive_wal::WalReceivers; use crate::safekeeper::{Term, TermLsn}; -use crate::timeline::FullAccessTimeline; +use crate::timeline::WalResidentTimeline; use crate::wal_service::ConnectionId; use crate::wal_storage::WalReader; use crate::GlobalTimelines; @@ -387,10 +387,10 @@ impl SafekeeperPostgresHandler { term: Option, ) -> Result<(), QueryError> { let tli = GlobalTimelines::get(self.ttid).map_err(|e| QueryError::Other(e.into()))?; - let full_access = tli.full_access_guard().await?; + let residence_guard = tli.wal_residence_guard().await?; if let 
Err(end) = self - .handle_start_replication_guts(pgb, start_pos, term, full_access) + .handle_start_replication_guts(pgb, start_pos, term, residence_guard) .await { let info = tli.get_safekeeper_info(&self.conf).await; @@ -407,7 +407,7 @@ impl SafekeeperPostgresHandler { pgb: &mut PostgresBackend, start_pos: Lsn, term: Option, - tli: FullAccessTimeline, + tli: WalResidentTimeline, ) -> Result<(), CopyStreamHandlerEnd> { let appname = self.appname.clone(); @@ -458,7 +458,8 @@ impl SafekeeperPostgresHandler { let mut sender = WalSender { pgb, - tli: tli.clone(), + // should succeed since we're already holding another guard + tli: tli.wal_residence_guard().await?, appname, start_pos, end_pos, @@ -527,7 +528,7 @@ impl EndWatch { /// A half driving sending WAL. struct WalSender<'a, IO> { pgb: &'a mut PostgresBackend, - tli: FullAccessTimeline, + tli: WalResidentTimeline, appname: Option, // Position since which we are sending next chunk. start_pos: Lsn, @@ -736,7 +737,7 @@ impl WalSender<'_, IO> { struct ReplyReader { reader: PostgresBackendReader, ws_guard: Arc, - tli: FullAccessTimeline, + tli: WalResidentTimeline, } impl ReplyReader { diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index be5e516296..dca6414082 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -63,11 +63,26 @@ pub struct TimelinePersistentState { /// Holds names of partial segments uploaded to remote storage. Used to /// clean up old objects without leaving garbage in remote storage. pub partial_backup: wal_backup_partial::State, + /// Eviction state of the timeline. If it's Offloaded, we should download + /// WAL files from remote storage to serve the timeline. + pub eviction_state: EvictionState, } #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub struct PersistedPeers(pub Vec<(NodeId, PersistedPeerInfo)>); +/// State of the local WAL files. 
Used to track current timeline state, +/// that can be either WAL files are present on disk or last partial segment +/// is offloaded to remote storage. +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)] +pub enum EvictionState { + /// WAL files are present on disk. + Present, + /// Last partial segment is offloaded to remote storage. + /// Contains flush_lsn of the last offloaded segment. + Offloaded(Lsn), +} + impl TimelinePersistentState { pub fn new( ttid: &TenantTimelineId, @@ -98,6 +113,7 @@ impl TimelinePersistentState { .collect(), ), partial_backup: wal_backup_partial::State::default(), + eviction_state: EvictionState::Present, } } @@ -173,7 +189,12 @@ where /// Persist given state. c.f. start_change. pub async fn finish_change(&mut self, s: &TimelinePersistentState) -> Result<()> { - self.pers.persist(s).await?; + if s.eq(&*self.pers) { + // nothing to do if state didn't change + } else { + self.pers.persist(s).await?; + } + // keep in memory values up to date self.inmem.commit_lsn = s.commit_lsn; self.inmem.backup_lsn = s.backup_lsn; diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 544ffdbb36..57935d879f 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -25,18 +25,22 @@ use utils::{ use storage_broker::proto::SafekeeperTimelineInfo; use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; +use crate::rate_limit::RateLimiter; use crate::receive_wal::WalReceivers; use crate::safekeeper::{ AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, ServerInfo, Term, TermLsn, INVALID_TERM, }; use crate::send_wal::WalSenders; -use crate::state::{TimelineMemState, TimelinePersistentState}; +use crate::state::{EvictionState, TimelineMemState, TimelinePersistentState, TimelineState}; +use crate::timeline_guard::ResidenceGuard; +use crate::timeline_manager::{AtomicStatus, ManagerCtl}; use crate::timelines_set::TimelinesSet; use crate::wal_backup::{self}; +use 
crate::wal_backup_partial::PartialRemoteSegment; use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION}; -use crate::metrics::FullTimelineInfo; +use crate::metrics::{FullTimelineInfo, WalStorageMetrics, MISC_OPERATION_SECONDS}; use crate::wal_storage::{Storage as wal_storage_iface, WalReader}; use crate::{debug_dump, timeline_manager, wal_storage}; use crate::{GlobalTimelines, SafeKeeperConf}; @@ -132,8 +136,9 @@ impl<'a> DerefMut for WriteGuardSharedState<'a> { impl<'a> Drop for WriteGuardSharedState<'a> { fn drop(&mut self) { - let term_flush_lsn = TermLsn::from((self.guard.sk.get_term(), self.guard.sk.flush_lsn())); - let commit_lsn = self.guard.sk.state.inmem.commit_lsn; + let term_flush_lsn = + TermLsn::from((self.guard.sk.last_log_term(), self.guard.sk.flush_lsn())); + let commit_lsn = self.guard.sk.state().inmem.commit_lsn; let _ = self.tli.term_flush_lsn_watch_tx.send_if_modified(|old| { if *old != term_flush_lsn { @@ -162,10 +167,150 @@ impl<'a> Drop for WriteGuardSharedState<'a> { } } +/// This structure is stored in shared state and represents the state of the timeline. +/// Usually it holds SafeKeeper, but it also supports offloaded timeline state. In this +/// case, SafeKeeper is not available (because WAL is not present on disk) and all +/// operations can be done only with control file. +pub enum StateSK { + Loaded(SafeKeeper), + Offloaded(Box>), + // Not used, required for moving between states. + Empty, +} + +impl StateSK { + pub fn flush_lsn(&self) -> Lsn { + match self { + StateSK::Loaded(sk) => sk.wal_store.flush_lsn(), + StateSK::Offloaded(state) => match state.eviction_state { + EvictionState::Offloaded(flush_lsn) => flush_lsn, + _ => panic!("StateSK::Offloaded mismatches with eviction_state from control_file"), + }, + StateSK::Empty => unreachable!(), + } + } + + /// Get a reference to the control file's timeline state. 
+ pub fn state(&self) -> &TimelineState { + match self { + StateSK::Loaded(sk) => &sk.state, + StateSK::Offloaded(ref s) => s, + StateSK::Empty => unreachable!(), + } + } + + pub fn state_mut(&mut self) -> &mut TimelineState { + match self { + StateSK::Loaded(sk) => &mut sk.state, + StateSK::Offloaded(ref mut s) => s, + StateSK::Empty => unreachable!(), + } + } + + pub fn last_log_term(&self) -> Term { + self.state() + .acceptor_state + .get_last_log_term(self.flush_lsn()) + } + + /// Close open WAL files to release FDs. + fn close_wal_store(&mut self) { + if let StateSK::Loaded(sk) = self { + sk.wal_store.close(); + } + } + + /// Update timeline state with peer safekeeper data. + pub async fn record_safekeeper_info(&mut self, sk_info: &SafekeeperTimelineInfo) -> Result<()> { + // update commit_lsn if safekeeper is loaded + match self { + StateSK::Loaded(sk) => sk.record_safekeeper_info(sk_info).await?, + StateSK::Offloaded(_) => {} + StateSK::Empty => unreachable!(), + } + + // update everything else, including remote_consistent_lsn and backup_lsn + let mut sync_control_file = false; + let state = self.state_mut(); + let wal_seg_size = state.server.wal_seg_size as u64; + + state.inmem.backup_lsn = max(Lsn(sk_info.backup_lsn), state.inmem.backup_lsn); + sync_control_file |= state.backup_lsn + wal_seg_size < state.inmem.backup_lsn; + + state.inmem.remote_consistent_lsn = max( + Lsn(sk_info.remote_consistent_lsn), + state.inmem.remote_consistent_lsn, + ); + sync_control_file |= + state.remote_consistent_lsn + wal_seg_size < state.inmem.remote_consistent_lsn; + + state.inmem.peer_horizon_lsn = + max(Lsn(sk_info.peer_horizon_lsn), state.inmem.peer_horizon_lsn); + sync_control_file |= state.peer_horizon_lsn + wal_seg_size < state.inmem.peer_horizon_lsn; + + if sync_control_file { + state.flush().await?; + } + Ok(()) + } + + /// Previously known as epoch_start_lsn. Needed only for reference in some APIs. 
+ pub fn term_start_lsn(&self) -> Lsn { + match self { + StateSK::Loaded(sk) => sk.term_start_lsn, + StateSK::Offloaded(_) => Lsn(0), + StateSK::Empty => unreachable!(), + } + } + + /// Used for metrics only. + pub fn wal_storage_metrics(&self) -> WalStorageMetrics { + match self { + StateSK::Loaded(sk) => sk.wal_store.get_metrics(), + StateSK::Offloaded(_) => WalStorageMetrics::default(), + StateSK::Empty => unreachable!(), + } + } + + /// Returns WAL storage internal LSNs for debug dump. + pub fn wal_storage_internal_state(&self) -> (Lsn, Lsn, Lsn, bool) { + match self { + StateSK::Loaded(sk) => sk.wal_store.internal_state(), + StateSK::Offloaded(_) => { + let flush_lsn = self.flush_lsn(); + (flush_lsn, flush_lsn, flush_lsn, false) + } + StateSK::Empty => unreachable!(), + } + } + + /// Access to SafeKeeper object. Panics if offloaded, should be good to use from WalResidentTimeline. + pub fn safekeeper( + &mut self, + ) -> &mut SafeKeeper { + match self { + StateSK::Loaded(sk) => sk, + StateSK::Offloaded(_) => { + panic!("safekeeper is offloaded, cannot be used") + } + StateSK::Empty => unreachable!(), + } + } + + /// Moves control file's state structure out of the enum. Used to switch states. + fn take_state(self) -> TimelineState { + match self { + StateSK::Loaded(sk) => sk.state, + StateSK::Offloaded(state) => *state, + StateSK::Empty => unreachable!(), + } + } +} + /// Shared state associated with database instance pub struct SharedState { /// Safekeeper object - pub(crate) sk: SafeKeeper, + pub(crate) sk: StateSK, /// In memory list containing state of peers sent in latest messages from them. pub(crate) peers_info: PeersInfo, // True value hinders old WAL removal; this is used by snapshotting. 
We @@ -203,10 +348,10 @@ impl SharedState { control_file::FileStorage::create_new(timeline_dir.clone(), conf, state)?; let wal_store = wal_storage::PhysicalStorage::new(ttid, timeline_dir, conf, &control_store)?; - let sk = SafeKeeper::new(control_store, wal_store, conf.my_id)?; + let sk = SafeKeeper::new(TimelineState::new(control_store), wal_store, conf.my_id)?; Ok(Self { - sk, + sk: StateSK::Loaded(sk), peers_info: PeersInfo(vec![]), wal_removal_on_hold: false, }) @@ -220,18 +365,30 @@ impl SharedState { bail!(TimelineError::UninitializedWalSegSize(*ttid)); } - let wal_store = - wal_storage::PhysicalStorage::new(ttid, timeline_dir, conf, &control_store)?; + let sk = match control_store.eviction_state { + EvictionState::Present => { + let wal_store = + wal_storage::PhysicalStorage::new(ttid, timeline_dir, conf, &control_store)?; + StateSK::Loaded(SafeKeeper::new( + TimelineState::new(control_store), + wal_store, + conf.my_id, + )?) + } + EvictionState::Offloaded(_) => { + StateSK::Offloaded(Box::new(TimelineState::new(control_store))) + } + }; Ok(Self { - sk: SafeKeeper::new(control_store, wal_store, conf.my_id)?, + sk, peers_info: PeersInfo(vec![]), wal_removal_on_hold: false, }) } pub(crate) fn get_wal_seg_size(&self) -> usize { - self.sk.state.server.wal_seg_size as usize + self.sk.state().server.wal_seg_size as usize } fn get_safekeeper_info( @@ -246,20 +403,20 @@ impl SharedState { tenant_id: ttid.tenant_id.as_ref().to_owned(), timeline_id: ttid.timeline_id.as_ref().to_owned(), }), - term: self.sk.state.acceptor_state.term, - last_log_term: self.sk.get_last_log_term(), + term: self.sk.state().acceptor_state.term, + last_log_term: self.sk.last_log_term(), flush_lsn: self.sk.flush_lsn().0, // note: this value is not flushed to control file yet and can be lost - commit_lsn: self.sk.state.inmem.commit_lsn.0, - remote_consistent_lsn: self.sk.state.inmem.remote_consistent_lsn.0, - peer_horizon_lsn: self.sk.state.inmem.peer_horizon_lsn.0, + commit_lsn: 
self.sk.state().inmem.commit_lsn.0, + remote_consistent_lsn: self.sk.state().inmem.remote_consistent_lsn.0, + peer_horizon_lsn: self.sk.state().inmem.peer_horizon_lsn.0, safekeeper_connstr: conf .advertise_pg_addr .to_owned() .unwrap_or(conf.listen_pg_addr.clone()), http_connstr: conf.listen_http_addr.to_owned(), - backup_lsn: self.sk.state.inmem.backup_lsn.0, - local_start_lsn: self.sk.state.local_start_lsn.0, + backup_lsn: self.sk.state().inmem.backup_lsn.0, + local_start_lsn: self.sk.state().local_start_lsn.0, availability_zone: conf.availability_zone.clone(), standby_horizon: standby_apply_lsn.0, } @@ -335,6 +492,7 @@ pub struct Timeline { walsenders: Arc, walreceivers: Arc, timeline_dir: Utf8PathBuf, + manager_ctl: ManagerCtl, /// Delete/cancel will trigger this, background tasks should drop out as soon as it fires pub(crate) cancel: CancellationToken, @@ -343,6 +501,7 @@ pub struct Timeline { pub(crate) broker_active: AtomicBool, pub(crate) wal_backup_active: AtomicBool, pub(crate) last_removed_segno: AtomicU64, + pub(crate) mgr_status: AtomicStatus, } impl Timeline { @@ -352,9 +511,9 @@ impl Timeline { let shared_state = SharedState::restore(conf, &ttid)?; let (commit_lsn_watch_tx, commit_lsn_watch_rx) = - watch::channel(shared_state.sk.state.commit_lsn); + watch::channel(shared_state.sk.state().commit_lsn); let (term_flush_lsn_watch_tx, term_flush_lsn_watch_rx) = watch::channel(TermLsn::from(( - shared_state.sk.get_term(), + shared_state.sk.last_log_term(), shared_state.sk.flush_lsn(), ))); let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0); @@ -373,9 +532,11 @@ impl Timeline { walreceivers, cancel: CancellationToken::default(), timeline_dir: get_timeline_dir(conf, &ttid), + manager_ctl: ManagerCtl::new(), broker_active: AtomicBool::new(false), wal_backup_active: AtomicBool::new(false), last_removed_segno: AtomicU64::new(0), + mgr_status: AtomicStatus::new(), }) } @@ -409,9 +570,11 @@ impl Timeline { walreceivers, cancel: 
CancellationToken::default(), timeline_dir: get_timeline_dir(conf, &ttid), + manager_ctl: ManagerCtl::new(), broker_active: AtomicBool::new(false), wal_backup_active: AtomicBool::new(false), last_removed_segno: AtomicU64::new(0), + mgr_status: AtomicStatus::new(), }) } @@ -425,6 +588,7 @@ impl Timeline { shared_state: &mut WriteGuardSharedState<'_>, conf: &SafeKeeperConf, broker_active_set: Arc, + partial_backup_rate_limiter: RateLimiter, ) -> Result<()> { match fs::metadata(&self.timeline_dir).await { Ok(_) => { @@ -442,7 +606,7 @@ impl Timeline { fs::create_dir_all(&self.timeline_dir).await?; // Write timeline to disk and start background tasks. - if let Err(e) = shared_state.sk.state.flush().await { + if let Err(e) = shared_state.sk.state_mut().flush().await { // Bootstrap failed, cancel timeline and remove timeline directory. self.cancel(shared_state); @@ -455,7 +619,7 @@ impl Timeline { return Err(e); } - self.bootstrap(conf, broker_active_set); + self.bootstrap(conf, broker_active_set, partial_backup_rate_limiter); Ok(()) } @@ -464,13 +628,19 @@ impl Timeline { self: &Arc, conf: &SafeKeeperConf, broker_active_set: Arc, + partial_backup_rate_limiter: RateLimiter, ) { + let (tx, rx) = self.manager_ctl.bootstrap_manager(); + // Start manager task which will monitor timeline state and update // background tasks. tokio::spawn(timeline_manager::main_task( - self.clone(), + ManagerTimeline { tli: self.clone() }, conf.clone(), broker_active_set, + tx, + rx, + partial_backup_rate_limiter, )); } @@ -507,7 +677,7 @@ impl Timeline { self.cancel.cancel(); // Close associated FDs. Nobody will be able to touch timeline data once // it is cancelled, so WAL storage won't be opened again. - shared_state.sk.wal_store.close(); + shared_state.sk.close_wal_store(); } /// Returns if timeline is cancelled. @@ -547,12 +717,15 @@ impl Timeline { /// Returns state of the timeline. 
pub async fn get_state(&self) -> (TimelineMemState, TimelinePersistentState) { let state = self.read_shared_state().await; - (state.sk.state.inmem.clone(), state.sk.state.clone()) + ( + state.sk.state().inmem.clone(), + TimelinePersistentState::clone(state.sk.state()), + ) } /// Returns latest backup_lsn. pub async fn get_wal_backup_lsn(&self) -> Lsn { - self.read_shared_state().await.sk.state.inmem.backup_lsn + self.read_shared_state().await.sk.state().inmem.backup_lsn } /// Sets backup_lsn to the given value. @@ -562,7 +735,7 @@ impl Timeline { } let mut state = self.write_shared_state().await; - state.sk.state.inmem.backup_lsn = max(state.sk.state.inmem.backup_lsn, backup_lsn); + state.sk.state_mut().inmem.backup_lsn = max(state.sk.state().inmem.backup_lsn, backup_lsn); // we should check whether to shut down offloader, but this will be done // soon by peer communication anyway. Ok(()) @@ -604,7 +777,7 @@ impl Timeline { /// Returns flush_lsn. pub async fn get_flush_lsn(&self) -> Lsn { - self.read_shared_state().await.sk.wal_store.flush_lsn() + self.read_shared_state().await.sk.flush_lsn() } /// Gather timeline data for metrics. 
@@ -623,11 +796,11 @@ impl Timeline { timeline_is_active: self.broker_active.load(Ordering::Relaxed), num_computes: self.walreceivers.get_num() as u32, last_removed_segno: self.last_removed_segno.load(Ordering::Relaxed), - epoch_start_lsn: state.sk.term_start_lsn, - mem_state: state.sk.state.inmem.clone(), - persisted_state: state.sk.state.clone(), - flush_lsn: state.sk.wal_store.flush_lsn(), - wal_storage: state.sk.wal_store.get_metrics(), + epoch_start_lsn: state.sk.term_start_lsn(), + mem_state: state.sk.state().inmem.clone(), + persisted_state: TimelinePersistentState::clone(state.sk.state()), + flush_lsn: state.sk.flush_lsn(), + wal_storage: state.sk.wal_storage_metrics(), }) } @@ -636,7 +809,7 @@ impl Timeline { let state = self.read_shared_state().await; let (write_lsn, write_record_lsn, flush_lsn, file_open) = - state.sk.wal_store.internal_state(); + state.sk.wal_storage_internal_state(); debug_dump::Memory { is_cancelled: self.is_cancelled(), @@ -646,8 +819,9 @@ impl Timeline { active: self.broker_active.load(Ordering::Relaxed), num_computes: self.walreceivers.get_num() as u32, last_removed_segno: self.last_removed_segno.load(Ordering::Relaxed), - epoch_start_lsn: state.sk.term_start_lsn, - mem_state: state.sk.state.inmem.clone(), + epoch_start_lsn: state.sk.term_start_lsn(), + mem_state: state.sk.state().inmem.clone(), + mgr_status: self.mgr_status.get(), write_lsn, write_record_lsn, flush_lsn, @@ -661,34 +835,89 @@ impl Timeline { f: impl FnOnce(&mut TimelinePersistentState) -> Result, ) -> Result { let mut state = self.write_shared_state().await; - let mut persistent_state = state.sk.state.start_change(); + let mut persistent_state = state.sk.state_mut().start_change(); // If f returns error, we abort the change and don't persist anything. let res = f(&mut persistent_state)?; // If persisting fails, we abort the change and return error. 
- state.sk.state.finish_change(&persistent_state).await?; + state + .sk + .state_mut() + .finish_change(&persistent_state) + .await?; Ok(res) } /// Get the timeline guard for reading/writing WAL files. - /// TODO: if WAL files are not present on disk (evicted), they will be - /// downloaded from S3. Also there will logic for preventing eviction - /// while someone is holding FullAccessTimeline guard. - pub async fn full_access_guard(self: &Arc) -> Result { + /// If WAL files are not present on disk (evicted), they will be automatically + /// downloaded from remote storage. This is done in the manager task, which is + /// responsible for issuing all guards. + /// + /// NB: don't use this function from timeline_manager, it will deadlock. + /// NB: don't use this function while holding shared_state lock. + pub async fn wal_residence_guard(self: &Arc) -> Result { if self.is_cancelled() { bail!(TimelineError::Cancelled(self.ttid)); } - Ok(FullAccessTimeline { tli: self.clone() }) + + debug!("requesting WalResidentTimeline guard"); + let started_at = Instant::now(); + let status_before = self.mgr_status.get(); + + // Wait 30 seconds for the guard to be acquired. It can time out if someone is + // holding the lock (e.g. during `SafeKeeper::process_msg()`) or manager task + // is stuck. 
+ let res = tokio::time::timeout_at( + started_at + Duration::from_secs(30), + self.manager_ctl.wal_residence_guard(), + ) + .await; + + let guard = match res { + Ok(Ok(guard)) => { + let finished_at = Instant::now(); + let elapsed = finished_at - started_at; + MISC_OPERATION_SECONDS + .with_label_values(&["wal_residence_guard"]) + .observe(elapsed.as_secs_f64()); + + guard + } + Ok(Err(e)) => { + warn!( + "error while acquiring WalResidentTimeline guard, statuses {:?} => {:?}", + status_before, + self.mgr_status.get() + ); + return Err(e); + } + Err(_) => { + warn!( + "timeout while acquiring WalResidentTimeline guard, statuses {:?} => {:?}", + status_before, + self.mgr_status.get() + ); + anyhow::bail!("timeout while acquiring WalResidentTimeline guard"); + } + }; + + Ok(WalResidentTimeline::new(self.clone(), guard)) } } /// This is a guard that allows to read/write disk timeline state. -/// All tasks that are using the disk should use this guard. -#[derive(Clone)] -pub struct FullAccessTimeline { +/// All tasks that are trying to read/write WAL from disk should use this guard. +pub struct WalResidentTimeline { pub tli: Arc, + _guard: ResidenceGuard, } -impl Deref for FullAccessTimeline { +impl WalResidentTimeline { + pub fn new(tli: Arc, _guard: ResidenceGuard) -> Self { + WalResidentTimeline { tli, _guard } + } +} + +impl Deref for WalResidentTimeline { type Target = Arc; fn deref(&self) -> &Self::Target { @@ -696,7 +925,7 @@ impl Deref for FullAccessTimeline { } } -impl FullAccessTimeline { +impl WalResidentTimeline { /// Returns true if walsender should stop sending WAL to pageserver. We /// terminate it if remote_consistent_lsn reached commit_lsn and there is no /// computes. 
While there might be nothing to stream already, we learn about @@ -708,8 +937,8 @@ impl FullAccessTimeline { } let shared_state = self.read_shared_state().await; if self.walreceivers.get_num() == 0 { - return shared_state.sk.state.inmem.commit_lsn == Lsn(0) || // no data at all yet - reported_remote_consistent_lsn >= shared_state.sk.state.inmem.commit_lsn; + return shared_state.sk.state().inmem.commit_lsn == Lsn(0) || // no data at all yet + reported_remote_consistent_lsn >= shared_state.sk.state().inmem.commit_lsn; } false } @@ -717,11 +946,11 @@ impl FullAccessTimeline { /// Ensure that current term is t, erroring otherwise, and lock the state. pub async fn acquire_term(&self, t: Term) -> Result { let ss = self.read_shared_state().await; - if ss.sk.state.acceptor_state.term != t { + if ss.sk.state().acceptor_state.term != t { bail!( "failed to acquire term {}, current term {}", t, - ss.sk.state.acceptor_state.term + ss.sk.state().acceptor_state.term ); } Ok(ss) @@ -739,7 +968,7 @@ impl FullAccessTimeline { let mut rmsg: Option; { let mut shared_state = self.write_shared_state().await; - rmsg = shared_state.sk.process_msg(msg).await?; + rmsg = shared_state.sk.safekeeper().process_msg(msg).await?; // if this is AppendResponse, fill in proper hot standby feedback. if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg { @@ -769,8 +998,141 @@ impl FullAccessTimeline { /// Update in memory remote consistent lsn. pub async fn update_remote_consistent_lsn(&self, candidate: Lsn) { let mut shared_state = self.write_shared_state().await; - shared_state.sk.state.inmem.remote_consistent_lsn = - max(shared_state.sk.state.inmem.remote_consistent_lsn, candidate); + shared_state.sk.state_mut().inmem.remote_consistent_lsn = max( + shared_state.sk.state().inmem.remote_consistent_lsn, + candidate, + ); + } +} + +/// This struct contains methods that are used by timeline manager task. 
+pub(crate) struct ManagerTimeline { + pub(crate) tli: Arc, +} + +impl Deref for ManagerTimeline { + type Target = Arc; + + fn deref(&self) -> &Self::Target { + &self.tli + } +} + +impl ManagerTimeline { + pub(crate) fn timeline_dir(&self) -> &Utf8PathBuf { + &self.tli.timeline_dir + } + + /// Manager requests this state on startup. + pub(crate) async fn bootstrap_mgr(&self) -> (bool, Option) { + let shared_state = self.read_shared_state().await; + let is_offloaded = matches!( + shared_state.sk.state().eviction_state, + EvictionState::Offloaded(_) + ); + let partial_backup_uploaded = shared_state.sk.state().partial_backup.uploaded_segment(); + + (is_offloaded, partial_backup_uploaded) + } + + /// Try to switch state Present->Offloaded. + pub(crate) async fn switch_to_offloaded( + &self, + partial: &PartialRemoteSegment, + ) -> anyhow::Result<()> { + let mut shared = self.write_shared_state().await; + + // updating control file + let mut pstate = shared.sk.state_mut().start_change(); + + if !matches!(pstate.eviction_state, EvictionState::Present) { + bail!( + "cannot switch to offloaded state, current state is {:?}", + pstate.eviction_state + ); + } + + if partial.flush_lsn != shared.sk.flush_lsn() { + bail!( + "flush_lsn mismatch in partial backup, expected {}, got {}", + shared.sk.flush_lsn(), + partial.flush_lsn + ); + } + + if partial.commit_lsn != pstate.commit_lsn { + bail!( + "commit_lsn mismatch in partial backup, expected {}, got {}", + pstate.commit_lsn, + partial.commit_lsn + ); + } + + if partial.term != shared.sk.last_log_term() { + bail!( + "term mismatch in partial backup, expected {}, got {}", + shared.sk.last_log_term(), + partial.term + ); + } + + pstate.eviction_state = EvictionState::Offloaded(shared.sk.flush_lsn()); + shared.sk.state_mut().finish_change(&pstate).await?; + // control file is now switched to Offloaded state + + // now we can switch shared.sk to Offloaded, shouldn't fail + let prev_sk = std::mem::replace(&mut shared.sk, 
StateSK::Empty); + let cfile_state = prev_sk.take_state(); + shared.sk = StateSK::Offloaded(Box::new(cfile_state)); + + Ok(()) + } + + /// Try to switch state Offloaded->Present. + pub(crate) async fn switch_to_present(&self) -> anyhow::Result<()> { + let conf = GlobalTimelines::get_global_config(); + let mut shared = self.write_shared_state().await; + + // trying to restore WAL storage + let wal_store = wal_storage::PhysicalStorage::new( + &self.ttid, + self.timeline_dir.clone(), + &conf, + shared.sk.state(), + )?; + + // updating control file + let mut pstate = shared.sk.state_mut().start_change(); + + if !matches!(pstate.eviction_state, EvictionState::Offloaded(_)) { + bail!( + "cannot switch to present state, current state is {:?}", + pstate.eviction_state + ); + } + + if wal_store.flush_lsn() != shared.sk.flush_lsn() { + bail!( + "flush_lsn mismatch in restored WAL, expected {}, got {}", + shared.sk.flush_lsn(), + wal_store.flush_lsn() + ); + } + + pstate.eviction_state = EvictionState::Present; + shared.sk.state_mut().finish_change(&pstate).await?; + + // now we can switch shared.sk to Present, shouldn't fail + let prev_sk = std::mem::replace(&mut shared.sk, StateSK::Empty); + let cfile_state = prev_sk.take_state(); + shared.sk = StateSK::Loaded(SafeKeeper::new(cfile_state, wal_store, conf.my_id)?); + + Ok(()) + } + + /// Update current manager state, useful for debugging manager deadlocks. + pub(crate) fn set_status(&self, status: timeline_manager::Status) { + self.mgr_status.store(status, Ordering::Relaxed); } } @@ -784,13 +1146,13 @@ async fn delete_dir(path: &Utf8PathBuf) -> Result { } /// Get a path to the tenant directory. If you just need to get a timeline directory, -/// use FullAccessTimeline::get_timeline_dir instead. +/// use WalResidentTimeline::get_timeline_dir instead. pub(crate) fn get_tenant_dir(conf: &SafeKeeperConf, tenant_id: &TenantId) -> Utf8PathBuf { conf.workdir.join(tenant_id.to_string()) } /// Get a path to the timeline directory. 
If you need to read WAL files from disk, -/// use FullAccessTimeline::get_timeline_dir instead. This function does not check +/// use WalResidentTimeline::get_timeline_dir instead. This function does not check /// timeline eviction status and WAL files might not be present on disk. pub(crate) fn get_timeline_dir(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Utf8PathBuf { get_tenant_dir(conf, &ttid.tenant_id).join(ttid.timeline_id.to_string()) diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs new file mode 100644 index 0000000000..ae6f3f4b7e --- /dev/null +++ b/safekeeper/src/timeline_eviction.rs @@ -0,0 +1,386 @@ +//! Code related to evicting WAL files to remote storage. The actual upload is done by the +//! partial WAL backup code. This file has code to delete and re-download WAL files, +//! cross-validate with partial WAL backup if local file is still present. + +use anyhow::Context; +use camino::Utf8PathBuf; +use remote_storage::RemotePath; +use tokio::{ + fs::File, + io::{AsyncRead, AsyncWriteExt}, +}; +use tracing::{debug, info, instrument, warn}; +use utils::crashsafe::durable_rename; + +use crate::{ + metrics::{EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED}, + rate_limit::rand_duration, + timeline_manager::{Manager, StateSnapshot}, + wal_backup, + wal_backup_partial::{self, PartialRemoteSegment}, + wal_storage::wal_file_paths, +}; + +impl Manager { + /// Returns true if the timeline is ready for eviction. 
+ /// Current criteria: + /// - no active tasks + /// - control file is flushed (no next event scheduled) + /// - no WAL residence guards + /// - no pushes to the broker + /// - partial WAL backup is uploaded + pub(crate) fn ready_for_eviction( + &self, + next_event: &Option, + state: &StateSnapshot, + ) -> bool { + self.backup_task.is_none() + && self.recovery_task.is_none() + && self.wal_removal_task.is_none() + && self.partial_backup_task.is_none() + && self.partial_backup_uploaded.is_some() + && next_event.is_none() + && self.access_service.is_empty() + && !self.tli_broker_active.get() + && !wal_backup_partial::needs_uploading(state, &self.partial_backup_uploaded) + && self + .partial_backup_uploaded + .as_ref() + .unwrap() + .flush_lsn + .segment_number(self.wal_seg_size) + == self.last_removed_segno + 1 + } + + /// Evict the timeline to remote storage. + #[instrument(name = "evict_timeline", skip_all)] + pub(crate) async fn evict_timeline(&mut self) { + assert!(!self.is_offloaded); + let partial_backup_uploaded = match &self.partial_backup_uploaded { + Some(p) => p.clone(), + None => { + warn!("no partial backup uploaded, skipping eviction"); + return; + } + }; + + info!("starting eviction, using {:?}", partial_backup_uploaded); + + EVICTION_EVENTS_STARTED + .with_label_values(&[EvictionEvent::Evict.into()]) + .inc(); + let _guard = scopeguard::guard((), |_| { + EVICTION_EVENTS_COMPLETED + .with_label_values(&[EvictionEvent::Evict.into()]) + .inc(); + }); + + if let Err(e) = do_eviction(self, &partial_backup_uploaded).await { + warn!("failed to evict timeline: {:?}", e); + return; + } + + info!("successfully evicted timeline"); + } + + /// Restore evicted timeline from remote storage. 
+ #[instrument(name = "unevict_timeline", skip_all)] + pub(crate) async fn unevict_timeline(&mut self) { + assert!(self.is_offloaded); + let partial_backup_uploaded = match &self.partial_backup_uploaded { + Some(p) => p.clone(), + None => { + warn!("no partial backup uploaded, cannot unevict"); + return; + } + }; + + info!("starting uneviction, using {:?}", partial_backup_uploaded); + + EVICTION_EVENTS_STARTED + .with_label_values(&[EvictionEvent::Restore.into()]) + .inc(); + let _guard = scopeguard::guard((), |_| { + EVICTION_EVENTS_COMPLETED + .with_label_values(&[EvictionEvent::Restore.into()]) + .inc(); + }); + + if let Err(e) = do_uneviction(self, &partial_backup_uploaded).await { + warn!("failed to unevict timeline: {:?}", e); + return; + } + + self.evict_not_before = + tokio::time::Instant::now() + rand_duration(&self.conf.eviction_min_resident); + + info!("successfully restored evicted timeline"); + } +} + +/// Ensure that content matches the remote partial backup, if local segment exists. +/// Then change state in control file and in-memory. If `delete_offloaded_wal` is set, +/// delete the local segment. +async fn do_eviction(mgr: &mut Manager, partial: &PartialRemoteSegment) -> anyhow::Result<()> { + compare_local_segment_with_remote(mgr, partial).await?; + + mgr.tli.switch_to_offloaded(partial).await?; + // switch manager state as soon as possible + mgr.is_offloaded = true; + + if mgr.conf.delete_offloaded_wal { + delete_local_segment(mgr, partial).await?; + } + + Ok(()) +} + +/// Ensure that content matches the remote partial backup, if local segment exists. +/// Then download segment to local disk and change state in control file and in-memory. 
+async fn do_uneviction(mgr: &mut Manager, partial: &PartialRemoteSegment) -> anyhow::Result<()> { + // if the local segment is present, validate it + compare_local_segment_with_remote(mgr, partial).await?; + + // atomically download the partial segment + redownload_partial_segment(mgr, partial).await?; + + mgr.tli.switch_to_present().await?; + // switch manager state as soon as possible + mgr.is_offloaded = false; + + Ok(()) +} + +/// Delete local WAL segment. +async fn delete_local_segment(mgr: &Manager, partial: &PartialRemoteSegment) -> anyhow::Result<()> { + let local_path = local_segment_path(mgr, partial); + + info!("deleting WAL file to evict: {}", local_path); + tokio::fs::remove_file(&local_path).await?; + Ok(()) +} + +/// Redownload partial segment from remote storage. +/// The segment is downloaded to a temporary file and then renamed to the final path. +async fn redownload_partial_segment( + mgr: &Manager, + partial: &PartialRemoteSegment, +) -> anyhow::Result<()> { + let tmp_file = mgr.tli.timeline_dir().join("remote_partial.tmp"); + let remote_segfile = remote_segment_path(mgr, partial)?; + + debug!( + "redownloading partial segment: {} -> {}", + remote_segfile, tmp_file + ); + + let mut reader = wal_backup::read_object(&remote_segfile, 0).await?; + let mut file = File::create(&tmp_file).await?; + + let actual_len = tokio::io::copy(&mut reader, &mut file).await?; + let expected_len = partial.flush_lsn.segment_offset(mgr.wal_seg_size); + + if actual_len != expected_len as u64 { + anyhow::bail!( + "partial downloaded {} bytes, expected {}", + actual_len, + expected_len + ); + } + + if actual_len > mgr.wal_seg_size as u64 { + anyhow::bail!( + "remote segment is too long: {} bytes, expected {}", + actual_len, + mgr.wal_seg_size + ); + } + file.set_len(mgr.wal_seg_size as u64).await?; + file.flush().await?; + + let final_path = local_segment_path(mgr, partial); + info!("downloaded {actual_len} bytes, renaming to {final_path}"); + if let Err(e) = 
durable_rename(&tmp_file, &final_path, !mgr.conf.no_sync).await { + // Probably rename succeeded, but fsync of it failed. Remove + // the file then to avoid using it. + tokio::fs::remove_file(tmp_file) + .await + .or_else(utils::fs_ext::ignore_not_found)?; + return Err(e.into()); + } + + Ok(()) +} + +/// Compare local WAL segment with partial WAL backup in remote storage. +/// If the local segment is not present, the function does nothing. +/// If the local segment is present, it compares the local segment with the remote one. +async fn compare_local_segment_with_remote( + mgr: &Manager, + partial: &PartialRemoteSegment, +) -> anyhow::Result<()> { + let local_path = local_segment_path(mgr, partial); + + match File::open(&local_path).await { + Ok(mut local_file) => do_validation(mgr, &mut local_file, mgr.wal_seg_size, partial) + .await + .context("validation failed"), + Err(_) => { + info!( + "local WAL file {} is not present, skipping validation", + local_path + ); + Ok(()) + } + } +} + +/// Compare opened local WAL segment with partial WAL backup in remote storage. +/// Validate full content of both files. 
+async fn do_validation( + mgr: &Manager, + file: &mut File, + wal_seg_size: usize, + partial: &PartialRemoteSegment, +) -> anyhow::Result<()> { + let local_size = file.metadata().await?.len() as usize; + if local_size != wal_seg_size { + anyhow::bail!( + "local segment size is invalid: found {}, expected {}", + local_size, + wal_seg_size + ); + } + + let remote_segfile = remote_segment_path(mgr, partial)?; + let mut remote_reader: std::pin::Pin> = + wal_backup::read_object(&remote_segfile, 0).await?; + + // remote segment should have bytes excatly up to `flush_lsn` + let expected_remote_size = partial.flush_lsn.segment_offset(mgr.wal_seg_size); + // let's compare the first `expected_remote_size` bytes + compare_n_bytes(&mut remote_reader, file, expected_remote_size).await?; + // and check that the remote segment ends here + check_end(&mut remote_reader).await?; + + // if local segment is longer, the rest should be zeroes + read_n_zeroes(file, mgr.wal_seg_size - expected_remote_size).await?; + // and check that the local segment ends here + check_end(file).await?; + + Ok(()) +} + +fn local_segment_path(mgr: &Manager, partial: &PartialRemoteSegment) -> Utf8PathBuf { + let flush_lsn = partial.flush_lsn; + let segno = flush_lsn.segment_number(mgr.wal_seg_size); + let (_, local_partial_segfile) = + wal_file_paths(mgr.tli.timeline_dir(), segno, mgr.wal_seg_size); + local_partial_segfile +} + +fn remote_segment_path( + mgr: &Manager, + partial: &PartialRemoteSegment, +) -> anyhow::Result { + let remote_timeline_path = wal_backup::remote_timeline_path(&mgr.tli.ttid)?; + Ok(partial.remote_path(&remote_timeline_path)) +} + +/// Compare first `n` bytes of two readers. If the bytes differ, return an error. +/// If the readers are shorter than `n`, return an error. 
+async fn compare_n_bytes(reader1: &mut R1, reader2: &mut R2, n: usize) -> anyhow::Result<()> +where + R1: AsyncRead + Unpin, + R2: AsyncRead + Unpin, +{ + use tokio::io::AsyncReadExt; + + const BUF_SIZE: usize = 32 * 1024; + + let mut buffer1 = vec![0u8; BUF_SIZE]; + let mut buffer2 = vec![0u8; BUF_SIZE]; + + let mut offset = 0; + + while offset < n { + let bytes_to_read = std::cmp::min(BUF_SIZE, n - offset); + + let bytes_read1 = reader1 + .read(&mut buffer1[..bytes_to_read]) + .await + .with_context(|| format!("failed to read from reader1 at offset {}", offset))?; + if bytes_read1 == 0 { + anyhow::bail!("unexpected EOF from reader1 at offset {}", offset); + } + + let bytes_read2 = reader2 + .read_exact(&mut buffer2[..bytes_read1]) + .await + .with_context(|| { + format!( + "failed to read {} bytes from reader2 at offset {}", + bytes_read1, offset + ) + })?; + assert!(bytes_read2 == bytes_read1); + + if buffer1[..bytes_read1] != buffer2[..bytes_read2] { + let diff_offset = buffer1[..bytes_read1] + .iter() + .zip(buffer2[..bytes_read2].iter()) + .position(|(a, b)| a != b) + .expect("mismatched buffers, but no difference found"); + anyhow::bail!("mismatch at offset {}", offset + diff_offset); + } + + offset += bytes_read1; + } + + Ok(()) +} + +async fn check_end(mut reader: R) -> anyhow::Result<()> +where + R: AsyncRead + Unpin, +{ + use tokio::io::AsyncReadExt; + + let mut buffer = [0u8; 1]; + let bytes_read = reader.read(&mut buffer).await?; + if bytes_read != 0 { + anyhow::bail!("expected EOF, found bytes"); + } + Ok(()) +} + +async fn read_n_zeroes(reader: &mut R, n: usize) -> anyhow::Result<()> +where + R: AsyncRead + Unpin, +{ + use tokio::io::AsyncReadExt; + + const BUF_SIZE: usize = 32 * 1024; + let mut buffer = vec![0u8; BUF_SIZE]; + let mut offset = 0; + + while offset < n { + let bytes_to_read = std::cmp::min(BUF_SIZE, n - offset); + + let bytes_read = reader + .read(&mut buffer[..bytes_to_read]) + .await + .context("expected zeroes, got read error")?; + 
if bytes_read == 0 { + anyhow::bail!("expected zeroes, got EOF"); + } + + if buffer[..bytes_read].iter().all(|&b| b == 0) { + offset += bytes_read; + } else { + anyhow::bail!("non-zero byte found"); + } + } + + Ok(()) +} diff --git a/safekeeper/src/timeline_guard.rs b/safekeeper/src/timeline_guard.rs new file mode 100644 index 0000000000..dbdf46412d --- /dev/null +++ b/safekeeper/src/timeline_guard.rs @@ -0,0 +1,71 @@ +//! Timeline residence guard is needed to ensure that WAL segments are present on disk, +//! as long as the code is holding the guard. This file implements guard logic, to issue +//! and drop guards, and to notify the manager when the guard is dropped. + +use std::collections::HashSet; + +use tracing::debug; + +use crate::timeline_manager::ManagerCtlMessage; + +#[derive(Debug, Clone, Copy)] +pub struct GuardId(u64); + +pub struct ResidenceGuard { + manager_tx: tokio::sync::mpsc::UnboundedSender, + guard_id: GuardId, +} + +impl Drop for ResidenceGuard { + fn drop(&mut self) { + // notify the manager that the guard is dropped + let res = self + .manager_tx + .send(ManagerCtlMessage::GuardDrop(self.guard_id)); + if let Err(e) = res { + debug!("failed to send GuardDrop message: {:?}", e); + } + } +} + +/// AccessService is responsible for issuing and dropping residence guards. +/// All guards are stored in the `guards` set. +/// TODO: it's possible to add `String` name to each guard, for better observability. 
+pub(crate) struct AccessService { + next_guard_id: u64, + guards: HashSet, + manager_tx: tokio::sync::mpsc::UnboundedSender, +} + +impl AccessService { + pub(crate) fn new(manager_tx: tokio::sync::mpsc::UnboundedSender) -> Self { + Self { + next_guard_id: 0, + guards: HashSet::new(), + manager_tx, + } + } + + pub(crate) fn is_empty(&self) -> bool { + self.guards.is_empty() + } + + pub(crate) fn create_guard(&mut self) -> ResidenceGuard { + let guard_id = self.next_guard_id; + self.next_guard_id += 1; + self.guards.insert(guard_id); + + let guard_id = GuardId(guard_id); + debug!("issued a new guard {:?}", guard_id); + + ResidenceGuard { + manager_tx: self.manager_tx.clone(), + guard_id, + } + } + + pub(crate) fn drop_guard(&mut self, guard_id: GuardId) { + debug!("dropping guard {:?}", guard_id); + assert!(self.guards.remove(&guard_id.0)); + } +} diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index 592426bba3..482614fac7 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -2,66 +2,84 @@ //! It is spawned alongside each timeline and exits when the timeline is deleted. //! It watches for changes in the timeline state and decides when to spawn or kill background tasks. //! It also can manage some reactive state, like should the timeline be active for broker pushes or not. +//! +//! Be aware that you need to be extra careful with manager code, because it is not respawned on panic. +//! Also, if it will stuck in some branch, it will prevent any further progress in the timeline. 
use std::{ - sync::Arc, - time::{Duration, Instant}, + sync::{atomic::AtomicUsize, Arc}, + time::Duration, }; use postgres_ffi::XLogSegNo; -use tokio::task::{JoinError, JoinHandle}; -use tracing::{info, info_span, instrument, warn, Instrument}; +use serde::{Deserialize, Serialize}; +use tokio::{ + task::{JoinError, JoinHandle}, + time::Instant, +}; +use tracing::{debug, info, info_span, instrument, warn, Instrument}; use utils::lsn::Lsn; use crate::{ - control_file::Storage, - metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL}, + control_file::{FileStorage, Storage}, + metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS}, + rate_limit::{rand_duration, RateLimiter}, recovery::recovery_main, remove_wal::calc_horizon_lsn, + safekeeper::Term, send_wal::WalSenders, - timeline::{PeerInfo, ReadGuardSharedState, Timeline}, + state::TimelineState, + timeline::{ManagerTimeline, PeerInfo, ReadGuardSharedState, StateSK, WalResidentTimeline}, + timeline_guard::{AccessService, GuardId, ResidenceGuard}, timelines_set::{TimelineSetGuard, TimelinesSet}, wal_backup::{self, WalBackupTaskHandle}, - wal_backup_partial, SafeKeeperConf, + wal_backup_partial::{self, PartialRemoteSegment}, + SafeKeeperConf, }; -pub struct StateSnapshot { +pub(crate) struct StateSnapshot { // inmem values - pub commit_lsn: Lsn, - pub backup_lsn: Lsn, - pub remote_consistent_lsn: Lsn, + pub(crate) commit_lsn: Lsn, + pub(crate) backup_lsn: Lsn, + pub(crate) remote_consistent_lsn: Lsn, // persistent control file values - pub cfile_peer_horizon_lsn: Lsn, - pub cfile_remote_consistent_lsn: Lsn, - pub cfile_backup_lsn: Lsn, + pub(crate) cfile_peer_horizon_lsn: Lsn, + pub(crate) cfile_remote_consistent_lsn: Lsn, + pub(crate) cfile_backup_lsn: Lsn, + + // latest state + pub(crate) flush_lsn: Lsn, + pub(crate) last_log_term: Term, // misc - pub cfile_last_persist_at: Instant, - pub inmem_flush_pending: bool, - pub wal_removal_on_hold: bool, - pub peers: Vec, + pub(crate) 
cfile_last_persist_at: std::time::Instant, + pub(crate) inmem_flush_pending: bool, + pub(crate) wal_removal_on_hold: bool, + pub(crate) peers: Vec, } impl StateSnapshot { /// Create a new snapshot of the timeline state. fn new(read_guard: ReadGuardSharedState, heartbeat_timeout: Duration) -> Self { + let state = read_guard.sk.state(); Self { - commit_lsn: read_guard.sk.state.inmem.commit_lsn, - backup_lsn: read_guard.sk.state.inmem.backup_lsn, - remote_consistent_lsn: read_guard.sk.state.inmem.remote_consistent_lsn, - cfile_peer_horizon_lsn: read_guard.sk.state.peer_horizon_lsn, - cfile_remote_consistent_lsn: read_guard.sk.state.remote_consistent_lsn, - cfile_backup_lsn: read_guard.sk.state.backup_lsn, - cfile_last_persist_at: read_guard.sk.state.pers.last_persist_at(), - inmem_flush_pending: Self::has_unflushed_inmem_state(&read_guard), + commit_lsn: state.inmem.commit_lsn, + backup_lsn: state.inmem.backup_lsn, + remote_consistent_lsn: state.inmem.remote_consistent_lsn, + cfile_peer_horizon_lsn: state.peer_horizon_lsn, + cfile_remote_consistent_lsn: state.remote_consistent_lsn, + cfile_backup_lsn: state.backup_lsn, + flush_lsn: read_guard.sk.flush_lsn(), + last_log_term: read_guard.sk.last_log_term(), + cfile_last_persist_at: state.pers.last_persist_at(), + inmem_flush_pending: Self::has_unflushed_inmem_state(state), wal_removal_on_hold: read_guard.wal_removal_on_hold, peers: read_guard.get_peers(heartbeat_timeout), } } - fn has_unflushed_inmem_state(read_guard: &ReadGuardSharedState) -> bool { - let state = &read_guard.sk.state; + fn has_unflushed_inmem_state(state: &TimelineState) -> bool { state.inmem.commit_lsn > state.commit_lsn || state.inmem.backup_lsn > state.backup_lsn || state.inmem.peer_horizon_lsn > state.peer_horizon_lsn @@ -73,314 +91,602 @@ impl StateSnapshot { /// There is no need to check for updates more often than this. 
const REFRESH_INTERVAL: Duration = Duration::from_millis(300); -/// How often to save the control file if the is no other activity. -const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300); +pub enum ManagerCtlMessage { + /// Request to get a guard for WalResidentTimeline, with WAL files available locally. + GuardRequest(tokio::sync::oneshot::Sender>), + /// Request to drop the guard. + GuardDrop(GuardId), +} + +impl std::fmt::Debug for ManagerCtlMessage { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + ManagerCtlMessage::GuardRequest(_) => write!(f, "GuardRequest"), + ManagerCtlMessage::GuardDrop(id) => write!(f, "GuardDrop({:?})", id), + } + } +} + +pub struct ManagerCtl { + manager_tx: tokio::sync::mpsc::UnboundedSender, + + // this is used to initialize manager, it will be moved out in bootstrap(). + init_manager_rx: + std::sync::Mutex>>, +} + +impl Default for ManagerCtl { + fn default() -> Self { + Self::new() + } +} + +impl ManagerCtl { + pub fn new() -> Self { + let (tx, rx) = tokio::sync::mpsc::unbounded_channel(); + Self { + manager_tx: tx, + init_manager_rx: std::sync::Mutex::new(Some(rx)), + } + } + + /// Issue a new guard and wait for manager to prepare the timeline. + /// Sends a message to the manager and waits for the response. + /// Can be blocked indefinitely if the manager is stuck. + pub async fn wal_residence_guard(&self) -> anyhow::Result { + let (tx, rx) = tokio::sync::oneshot::channel(); + self.manager_tx.send(ManagerCtlMessage::GuardRequest(tx))?; + + // wait for the manager to respond with the guard + rx.await + .map_err(|e| anyhow::anyhow!("response read fail: {:?}", e)) + .and_then(std::convert::identity) + } + + /// Must be called exactly once to bootstrap the manager. 
+ pub fn bootstrap_manager( + &self, + ) -> ( + tokio::sync::mpsc::UnboundedSender, + tokio::sync::mpsc::UnboundedReceiver, + ) { + let rx = self + .init_manager_rx + .lock() + .expect("mutex init_manager_rx poisoned") + .take() + .expect("manager already bootstrapped"); + + (self.manager_tx.clone(), rx) + } +} + +pub(crate) struct Manager { + // configuration & dependencies + pub(crate) tli: ManagerTimeline, + pub(crate) conf: SafeKeeperConf, + pub(crate) wal_seg_size: usize, + pub(crate) walsenders: Arc, + + // current state + pub(crate) state_version_rx: tokio::sync::watch::Receiver, + pub(crate) num_computes_rx: tokio::sync::watch::Receiver, + pub(crate) tli_broker_active: TimelineSetGuard, + pub(crate) last_removed_segno: XLogSegNo, + pub(crate) is_offloaded: bool, + + // background tasks + pub(crate) backup_task: Option, + pub(crate) recovery_task: Option>, + pub(crate) wal_removal_task: Option>>, + + // partial backup + pub(crate) partial_backup_task: Option>>, + pub(crate) partial_backup_uploaded: Option, + + // misc + pub(crate) access_service: AccessService, + pub(crate) global_rate_limiter: RateLimiter, + + // Anti-flapping state: we evict timelines eagerly if they are inactive, but should not + // evict them if they go inactive very soon after being restored. + pub(crate) evict_not_before: Instant, +} /// This task gets spawned alongside each timeline and is responsible for managing the timeline's /// background tasks. /// Be careful, this task is not respawned on panic, so it should not panic. #[instrument(name = "manager", skip_all, fields(ttid = %tli.ttid))] pub async fn main_task( - tli: Arc, + tli: ManagerTimeline, conf: SafeKeeperConf, broker_active_set: Arc, + manager_tx: tokio::sync::mpsc::UnboundedSender, + mut manager_rx: tokio::sync::mpsc::UnboundedReceiver, + global_rate_limiter: RateLimiter, ) { + tli.set_status(Status::Started); + + let defer_tli = tli.tli.clone(); scopeguard::defer! 
{ - if tli.is_cancelled() { + if defer_tli.is_cancelled() { info!("manager task finished"); } else { warn!("manager task finished prematurely"); } }; - // configuration & dependencies - let wal_seg_size = tli.get_wal_seg_size().await; - let heartbeat_timeout = conf.heartbeat_timeout; - let walsenders = tli.get_walsenders(); - let walreceivers = tli.get_walreceivers(); - - // current state - let mut state_version_rx = tli.get_state_version_rx(); - let mut num_computes_rx = walreceivers.get_num_rx(); - let mut tli_broker_active = broker_active_set.guard(tli.clone()); - let mut last_removed_segno = 0 as XLogSegNo; - - // list of background tasks - let mut backup_task: Option = None; - let mut recovery_task: Option> = None; - let mut partial_backup_task: Option> = None; - let mut wal_removal_task: Option>> = None; + let mut mgr = Manager::new( + tli, + conf, + broker_active_set, + manager_tx, + global_rate_limiter, + ) + .await; // Start recovery task which always runs on the timeline. - if conf.peer_recovery_enabled { - match tli.full_access_guard().await { - Ok(tli) => { - recovery_task = Some(tokio::spawn(recovery_main(tli, conf.clone()))); - } - Err(e) => { - warn!("failed to start recovery task: {:?}", e); - } - } - } - - // Start partial backup task which always runs on the timeline. 
- if conf.is_wal_backup_enabled() && conf.partial_backup_enabled { - match tli.full_access_guard().await { - Ok(tli) => { - partial_backup_task = Some(tokio::spawn(wal_backup_partial::main_task( - tli, - conf.clone(), - ))); - } - Err(e) => { - warn!("failed to start partial backup task: {:?}", e); - } - } + if !mgr.is_offloaded && mgr.conf.peer_recovery_enabled { + let tli = mgr.wal_resident_timeline(); + mgr.recovery_task = Some(tokio::spawn(recovery_main(tli, mgr.conf.clone()))); } let last_state = 'outer: loop { MANAGER_ITERATIONS_TOTAL.inc(); - let state_snapshot = StateSnapshot::new(tli.read_shared_state().await, heartbeat_timeout); - let num_computes = *num_computes_rx.borrow(); + mgr.set_status(Status::StateSnapshot); + let state_snapshot = mgr.state_snapshot().await; - let is_wal_backup_required = update_backup( - &conf, - &tli, - wal_seg_size, - num_computes, - &state_snapshot, - &mut backup_task, - ) - .await; + let mut next_event: Option = None; + if !mgr.is_offloaded { + let num_computes = *mgr.num_computes_rx.borrow(); - let _is_active = update_is_active( - is_wal_backup_required, - num_computes, - &state_snapshot, - &mut tli_broker_active, - &tli, - ); + mgr.set_status(Status::UpdateBackup); + let is_wal_backup_required = mgr.update_backup(num_computes, &state_snapshot).await; + mgr.update_is_active(is_wal_backup_required, num_computes, &state_snapshot); - let next_cfile_save = update_control_file_save(&state_snapshot, &tli).await; + mgr.set_status(Status::UpdateControlFile); + mgr.update_control_file_save(&state_snapshot, &mut next_event) + .await; - update_wal_removal( - &conf, - walsenders, - &tli, - wal_seg_size, - &state_snapshot, - last_removed_segno, - &mut wal_removal_task, - ) - .await; + mgr.set_status(Status::UpdateWalRemoval); + mgr.update_wal_removal(&state_snapshot).await; + mgr.set_status(Status::UpdatePartialBackup); + mgr.update_partial_backup(&state_snapshot).await; + + let now = Instant::now(); + if mgr.evict_not_before > now { + 
// we should wait until evict_not_before + update_next_event(&mut next_event, mgr.evict_not_before); + } + + if mgr.conf.enable_offload + && mgr.evict_not_before <= now + && mgr.ready_for_eviction(&next_event, &state_snapshot) + { + // check rate limiter and evict timeline if possible + match mgr.global_rate_limiter.try_acquire_eviction() { + Some(_permit) => { + mgr.set_status(Status::EvictTimeline); + mgr.evict_timeline().await; + } + None => { + // we can't evict timeline now, will try again later + mgr.evict_not_before = + Instant::now() + rand_duration(&mgr.conf.eviction_min_resident); + update_next_event(&mut next_event, mgr.evict_not_before); + } + } + } + } + + mgr.set_status(Status::Wait); // wait until something changes. tx channels are stored under Arc, so they will not be // dropped until the manager task is finished. tokio::select! { - _ = tli.cancel.cancelled() => { + _ = mgr.tli.cancel.cancelled() => { // timeline was deleted break 'outer state_snapshot; } _ = async { // don't wake up on every state change, but at most every REFRESH_INTERVAL tokio::time::sleep(REFRESH_INTERVAL).await; - let _ = state_version_rx.changed().await; + let _ = mgr.state_version_rx.changed().await; } => { // state was updated } - _ = num_computes_rx.changed() => { + _ = mgr.num_computes_rx.changed() => { // number of connected computes was updated } - _ = async { - if let Some(timeout) = next_cfile_save { - tokio::time::sleep_until(timeout).await - } else { - futures::future::pending().await - } - } => { - // it's time to save the control file + _ = sleep_until(&next_event) => { + // we were waiting for some event (e.g. 
cfile save) } - res = async { - if let Some(task) = &mut wal_removal_task { - task.await - } else { - futures::future::pending().await - } - } => { + res = await_task_finish(&mut mgr.wal_removal_task) => { // WAL removal task finished - wal_removal_task = None; - update_wal_removal_end(res, &tli, &mut last_removed_segno); + mgr.wal_removal_task = None; + mgr.update_wal_removal_end(res); + } + res = await_task_finish(&mut mgr.partial_backup_task) => { + // partial backup task finished + mgr.partial_backup_task = None; + mgr.update_partial_backup_end(res); + } + + msg = manager_rx.recv() => { + mgr.set_status(Status::HandleMessage); + mgr.handle_message(msg).await; } } }; + mgr.set_status(Status::Exiting); // remove timeline from the broker active set sooner, before waiting for background tasks - tli_broker_active.set(false); + mgr.tli_broker_active.set(false); // shutdown background tasks - if conf.is_wal_backup_enabled() { - wal_backup::update_task(&conf, &tli, false, &last_state, &mut backup_task).await; + if mgr.conf.is_wal_backup_enabled() { + wal_backup::update_task(&mut mgr, false, &last_state).await; } - if let Some(recovery_task) = recovery_task { + if let Some(recovery_task) = &mut mgr.recovery_task { if let Err(e) = recovery_task.await { warn!("recovery task failed: {:?}", e); } } - if let Some(partial_backup_task) = partial_backup_task { + if let Some(partial_backup_task) = &mut mgr.partial_backup_task { if let Err(e) = partial_backup_task.await { warn!("partial backup task failed: {:?}", e); } } - if let Some(wal_removal_task) = wal_removal_task { + if let Some(wal_removal_task) = &mut mgr.wal_removal_task { let res = wal_removal_task.await; - update_wal_removal_end(res, &tli, &mut last_removed_segno); + mgr.update_wal_removal_end(res); } + + mgr.set_status(Status::Finished); } -/// Spawns/kills backup task and returns true if backup is required. 
-async fn update_backup( - conf: &SafeKeeperConf, - tli: &Arc, - wal_seg_size: usize, - num_computes: usize, - state: &StateSnapshot, - backup_task: &mut Option, -) -> bool { - let is_wal_backup_required = - wal_backup::is_wal_backup_required(wal_seg_size, num_computes, state); - - if conf.is_wal_backup_enabled() { - wal_backup::update_task(conf, tli, is_wal_backup_required, state, backup_task).await; +impl Manager { + async fn new( + tli: ManagerTimeline, + conf: SafeKeeperConf, + broker_active_set: Arc, + manager_tx: tokio::sync::mpsc::UnboundedSender, + global_rate_limiter: RateLimiter, + ) -> Manager { + let (is_offloaded, partial_backup_uploaded) = tli.bootstrap_mgr().await; + Manager { + wal_seg_size: tli.get_wal_seg_size().await, + walsenders: tli.get_walsenders().clone(), + state_version_rx: tli.get_state_version_rx(), + num_computes_rx: tli.get_walreceivers().get_num_rx(), + tli_broker_active: broker_active_set.guard(tli.clone()), + last_removed_segno: 0, + is_offloaded, + backup_task: None, + recovery_task: None, + wal_removal_task: None, + partial_backup_task: None, + partial_backup_uploaded, + access_service: AccessService::new(manager_tx), + tli, + global_rate_limiter, + // to smooth out evictions spike after restart + evict_not_before: Instant::now() + rand_duration(&conf.eviction_min_resident), + conf, + } } - // update the state in Arc - tli.wal_backup_active - .store(backup_task.is_some(), std::sync::atomic::Ordering::Relaxed); - is_wal_backup_required -} - -/// Update is_active flag and returns its value. 
-fn update_is_active( - is_wal_backup_required: bool, - num_computes: usize, - state: &StateSnapshot, - tli_broker_active: &mut TimelineSetGuard, - tli: &Arc, -) -> bool { - let is_active = is_wal_backup_required - || num_computes > 0 - || state.remote_consistent_lsn < state.commit_lsn; - - // update the broker timeline set - if tli_broker_active.set(is_active) { - // write log if state has changed - info!( - "timeline active={} now, remote_consistent_lsn={}, commit_lsn={}", - is_active, state.remote_consistent_lsn, state.commit_lsn, - ); - - MANAGER_ACTIVE_CHANGES.inc(); + fn set_status(&self, status: Status) { + self.tli.set_status(status); } - // update the state in Arc - tli.broker_active - .store(is_active, std::sync::atomic::Ordering::Relaxed); - is_active -} - -/// Save control file if needed. Returns Instant if we should persist the control file in the future. -async fn update_control_file_save( - state: &StateSnapshot, - tli: &Arc, -) -> Option { - if !state.inmem_flush_pending { - return None; + /// Get a WalResidentTimeline. + /// Manager code must use this function instead of one from `Timeline` + /// directly, because it will deadlock. + pub(crate) fn wal_resident_timeline(&mut self) -> WalResidentTimeline { + assert!(!self.is_offloaded); + let guard = self.access_service.create_guard(); + WalResidentTimeline::new(self.tli.clone(), guard) } - if state.cfile_last_persist_at.elapsed() > CF_SAVE_INTERVAL { - let mut write_guard = tli.write_shared_state().await; - // this can be done in the background because it blocks manager task, but flush() should - // be fast enough not to be a problem now - if let Err(e) = write_guard.sk.state.flush().await { - warn!("failed to save control file: {:?}", e); + /// Get a snapshot of the timeline state. 
+ async fn state_snapshot(&self) -> StateSnapshot { + let _timer = MISC_OPERATION_SECONDS + .with_label_values(&["state_snapshot"]) + .start_timer(); + + StateSnapshot::new( + self.tli.read_shared_state().await, + self.conf.heartbeat_timeout, + ) + } + + /// Spawns/kills backup task and returns true if backup is required. + async fn update_backup(&mut self, num_computes: usize, state: &StateSnapshot) -> bool { + let is_wal_backup_required = + wal_backup::is_wal_backup_required(self.wal_seg_size, num_computes, state); + + if self.conf.is_wal_backup_enabled() { + wal_backup::update_task(self, is_wal_backup_required, state).await; } - None - } else { - // we should wait until next CF_SAVE_INTERVAL - Some((state.cfile_last_persist_at + CF_SAVE_INTERVAL).into()) - } -} - -/// Spawns WAL removal task if needed. -async fn update_wal_removal( - conf: &SafeKeeperConf, - walsenders: &Arc, - tli: &Arc, - wal_seg_size: usize, - state: &StateSnapshot, - last_removed_segno: u64, - wal_removal_task: &mut Option>>, -) { - if wal_removal_task.is_some() || state.wal_removal_on_hold { - // WAL removal is already in progress or hold off - return; - } - - // If enabled, we use LSN of the most lagging walsender as a WAL removal horizon. - // This allows to get better read speed for pageservers that are lagging behind, - // at the cost of keeping more WAL on disk. 
- let replication_horizon_lsn = if conf.walsenders_keep_horizon { - walsenders.laggard_lsn() - } else { - None - }; - - let removal_horizon_lsn = calc_horizon_lsn(state, replication_horizon_lsn); - let removal_horizon_segno = removal_horizon_lsn - .segment_number(wal_seg_size) - .saturating_sub(1); - - if removal_horizon_segno > last_removed_segno { - // we need to remove WAL - let remover = crate::wal_storage::Storage::remove_up_to( - &tli.read_shared_state().await.sk.wal_store, - removal_horizon_segno, + // update the state in Arc + self.tli.wal_backup_active.store( + self.backup_task.is_some(), + std::sync::atomic::Ordering::Relaxed, ); - *wal_removal_task = Some(tokio::spawn( - async move { - remover.await?; - Ok(removal_horizon_segno) + is_wal_backup_required + } + + /// Update is_active flag and returns its value. + fn update_is_active( + &mut self, + is_wal_backup_required: bool, + num_computes: usize, + state: &StateSnapshot, + ) { + let is_active = is_wal_backup_required + || num_computes > 0 + || state.remote_consistent_lsn < state.commit_lsn; + + // update the broker timeline set + if self.tli_broker_active.set(is_active) { + // write log if state has changed + info!( + "timeline active={} now, remote_consistent_lsn={}, commit_lsn={}", + is_active, state.remote_consistent_lsn, state.commit_lsn, + ); + + MANAGER_ACTIVE_CHANGES.inc(); + } + + // update the state in Arc + self.tli + .broker_active + .store(is_active, std::sync::atomic::Ordering::Relaxed); + } + + /// Save control file if needed. Returns Instant if we should persist the control file in the future. 
+ async fn update_control_file_save( + &self, + state: &StateSnapshot, + next_event: &mut Option, + ) { + if !state.inmem_flush_pending { + return; + } + + if state.cfile_last_persist_at.elapsed() > self.conf.control_file_save_interval { + let mut write_guard = self.tli.write_shared_state().await; + // it should be done in the background because it blocks manager task, but flush() should + // be fast enough not to be a problem now + if let Err(e) = write_guard.sk.state_mut().flush().await { + warn!("failed to save control file: {:?}", e); } - .instrument(info_span!("WAL removal", ttid=%tli.ttid)), - )); + } else { + // we should wait until some time passed until the next save + update_next_event( + next_event, + (state.cfile_last_persist_at + self.conf.control_file_save_interval).into(), + ); + } + } + + /// Spawns WAL removal task if needed. + async fn update_wal_removal(&mut self, state: &StateSnapshot) { + if self.wal_removal_task.is_some() || state.wal_removal_on_hold { + // WAL removal is already in progress or hold off + return; + } + + // If enabled, we use LSN of the most lagging walsender as a WAL removal horizon. + // This allows to get better read speed for pageservers that are lagging behind, + // at the cost of keeping more WAL on disk. 
+ let replication_horizon_lsn = if self.conf.walsenders_keep_horizon { + self.walsenders.laggard_lsn() + } else { + None + }; + + let removal_horizon_lsn = calc_horizon_lsn(state, replication_horizon_lsn); + let removal_horizon_segno = removal_horizon_lsn + .segment_number(self.wal_seg_size) + .saturating_sub(1); + + if removal_horizon_segno > self.last_removed_segno { + // we need to remove WAL + let remover = match self.tli.read_shared_state().await.sk { + StateSK::Loaded(ref sk) => { + crate::wal_storage::Storage::remove_up_to(&sk.wal_store, removal_horizon_segno) + } + StateSK::Offloaded(_) => { + // we can't remove WAL if it's not loaded + warn!("unexpectedly trying to run WAL removal on offloaded timeline"); + return; + } + StateSK::Empty => unreachable!(), + }; + + self.wal_removal_task = Some(tokio::spawn( + async move { + remover.await?; + Ok(removal_horizon_segno) + } + .instrument(info_span!("WAL removal", ttid=%self.tli.ttid)), + )); + } + } + + /// Update the state after WAL removal task finished. + fn update_wal_removal_end(&mut self, res: Result, JoinError>) { + let new_last_removed_segno = match res { + Ok(Ok(segno)) => segno, + Err(e) => { + warn!("WAL removal task failed: {:?}", e); + return; + } + Ok(Err(e)) => { + warn!("WAL removal task failed: {:?}", e); + return; + } + }; + + self.last_removed_segno = new_last_removed_segno; + // update the state in Arc + self.tli + .last_removed_segno + .store(new_last_removed_segno, std::sync::atomic::Ordering::Relaxed); + } + + /// Spawns partial WAL backup task if needed. 
+ async fn update_partial_backup(&mut self, state: &StateSnapshot) { + // check if WAL backup is enabled and should be started + if !self.conf.is_wal_backup_enabled() { + return; + } + + if self.partial_backup_task.is_some() { + // partial backup is already running + return; + } + + if !wal_backup_partial::needs_uploading(state, &self.partial_backup_uploaded) { + // nothing to upload + return; + } + + // Get WalResidentTimeline and start partial backup task. + self.partial_backup_task = Some(tokio::spawn(wal_backup_partial::main_task( + self.wal_resident_timeline(), + self.conf.clone(), + self.global_rate_limiter.clone(), + ))); + } + + /// Update the state after partial WAL backup task finished. + fn update_partial_backup_end(&mut self, res: Result, JoinError>) { + match res { + Ok(new_upload_state) => { + self.partial_backup_uploaded = new_upload_state; + } + Err(e) => { + warn!("partial backup task panicked: {:?}", e); + } + } + } + + /// Handle message arrived from ManagerCtl. + async fn handle_message(&mut self, msg: Option) { + debug!("received manager message: {:?}", msg); + match msg { + Some(ManagerCtlMessage::GuardRequest(tx)) => { + if self.is_offloaded { + // trying to unevict timeline, but without gurarantee that it will be successful + self.unevict_timeline().await; + } + + let guard = if self.is_offloaded { + Err(anyhow::anyhow!("timeline is offloaded, can't get a guard")) + } else { + Ok(self.access_service.create_guard()) + }; + + if tx.send(guard).is_err() { + warn!("failed to reply with a guard, receiver dropped"); + } + } + Some(ManagerCtlMessage::GuardDrop(guard_id)) => { + self.access_service.drop_guard(guard_id); + } + None => { + // can't happen, we're holding the sender + unreachable!(); + } + } } } -/// Update the state after WAL removal task finished. 
-fn update_wal_removal_end( - res: Result, JoinError>, - tli: &Arc, - last_removed_segno: &mut u64, -) { - let new_last_removed_segno = match res { - Ok(Ok(segno)) => segno, - Err(e) => { - warn!("WAL removal task failed: {:?}", e); - return; - } - Ok(Err(e)) => { - warn!("WAL removal task failed: {:?}", e); - return; - } - }; - - *last_removed_segno = new_last_removed_segno; - // update the state in Arc - tli.last_removed_segno - .store(new_last_removed_segno, std::sync::atomic::Ordering::Relaxed); +// utility functions +async fn sleep_until(option: &Option) { + if let Some(timeout) = option { + tokio::time::sleep_until(*timeout).await; + } else { + futures::future::pending::<()>().await; + } +} + +async fn await_task_finish(option: &mut Option>) -> Result { + if let Some(task) = option { + task.await + } else { + futures::future::pending().await + } +} + +/// Update next_event if candidate is earlier. +fn update_next_event(next_event: &mut Option, candidate: Instant) { + if let Some(next) = next_event { + if candidate < *next { + *next = candidate; + } + } else { + *next_event = Some(candidate); + } +} + +#[repr(usize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub enum Status { + NotStarted, + Started, + StateSnapshot, + UpdateBackup, + UpdateControlFile, + UpdateWalRemoval, + UpdatePartialBackup, + EvictTimeline, + Wait, + HandleMessage, + Exiting, + Finished, +} + +/// AtomicStatus is a wrapper around AtomicUsize adapted for the Status enum. +pub struct AtomicStatus { + inner: AtomicUsize, +} + +impl Default for AtomicStatus { + fn default() -> Self { + Self::new() + } +} + +impl AtomicStatus { + pub fn new() -> Self { + AtomicStatus { + inner: AtomicUsize::new(Status::NotStarted as usize), + } + } + + pub fn load(&self, order: std::sync::atomic::Ordering) -> Status { + // Safety: This line of code uses `std::mem::transmute` to reinterpret the loaded value as `Status`. 
+ // It is safe to use `transmute` in this context because `Status` is a repr(usize) enum, + // which means it has the same memory layout as usize. + // However, it is important to ensure that the loaded value is a valid variant of `Status`, + // otherwise, the behavior will be undefined. + unsafe { std::mem::transmute(self.inner.load(order)) } + } + + pub fn get(&self) -> Status { + self.load(std::sync::atomic::Ordering::Relaxed) + } + + pub fn store(&self, val: Status, order: std::sync::atomic::Ordering) { + self.inner.store(val as usize, order); + } } diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 45e08ede3c..6662e18817 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -2,6 +2,8 @@ //! All timelines should always be present in this map, this is done by loading them //! all from the disk on startup and keeping them in memory. +use crate::defaults::DEFAULT_EVICTION_CONCURRENCY; +use crate::rate_limit::RateLimiter; use crate::safekeeper::ServerInfo; use crate::timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError}; use crate::timelines_set::TimelinesSet; @@ -14,15 +16,23 @@ use std::collections::HashMap; use std::str::FromStr; use std::sync::atomic::Ordering; use std::sync::{Arc, Mutex}; +use std::time::{Duration, Instant}; use tracing::*; use utils::id::{TenantId, TenantTimelineId, TimelineId}; use utils::lsn::Lsn; struct GlobalTimelinesState { timelines: HashMap>, + + // A tombstone indicates this timeline used to exist has been deleted. These are used to prevent + // on-demand timeline creation from recreating deleted timelines. This is only soft-enforced, as + // this map is dropped on restart. + tombstones: HashMap, + conf: Option, broker_active_set: Arc, load_lock: Arc>, + global_rate_limiter: RateLimiter, } // Used to prevent concurrent timeline loading. 
@@ -37,8 +47,12 @@ impl GlobalTimelinesState { } /// Get dependencies for a timeline constructor. - fn get_dependencies(&self) -> (SafeKeeperConf, Arc) { - (self.get_conf().clone(), self.broker_active_set.clone()) + fn get_dependencies(&self) -> (SafeKeeperConf, Arc, RateLimiter) { + ( + self.get_conf().clone(), + self.broker_active_set.clone(), + self.global_rate_limiter.clone(), + ) } /// Insert timeline into the map. Returns error if timeline with the same id already exists. @@ -58,14 +72,21 @@ impl GlobalTimelinesState { .cloned() .ok_or(TimelineError::NotFound(*ttid)) } + + fn delete(&mut self, ttid: TenantTimelineId) { + self.timelines.remove(&ttid); + self.tombstones.insert(ttid, Instant::now()); + } } static TIMELINES_STATE: Lazy> = Lazy::new(|| { Mutex::new(GlobalTimelinesState { timelines: HashMap::new(), + tombstones: HashMap::new(), conf: None, broker_active_set: Arc::new(TimelinesSet::default()), load_lock: Arc::new(tokio::sync::Mutex::new(TimelineLoadLock)), + global_rate_limiter: RateLimiter::new(1, 1), }) }); @@ -79,6 +100,10 @@ impl GlobalTimelines { // lock, so use explicit block let tenants_dir = { let mut state = TIMELINES_STATE.lock().unwrap(); + state.global_rate_limiter = RateLimiter::new( + conf.partial_backup_concurrency, + DEFAULT_EVICTION_CONCURRENCY, + ); state.conf = Some(conf); // Iterate through all directories and load tenants for all directories @@ -122,7 +147,7 @@ impl GlobalTimelines { /// this function is called during init when nothing else is running, so /// this is fine. 
async fn load_tenant_timelines(tenant_id: TenantId) -> Result<()> { - let (conf, broker_active_set) = { + let (conf, broker_active_set, partial_backup_rate_limiter) = { let state = TIMELINES_STATE.lock().unwrap(); state.get_dependencies() }; @@ -145,7 +170,11 @@ impl GlobalTimelines { .unwrap() .timelines .insert(ttid, tli.clone()); - tli.bootstrap(&conf, broker_active_set.clone()); + tli.bootstrap( + &conf, + broker_active_set.clone(), + partial_backup_rate_limiter.clone(), + ); } // If we can't load a timeline, it's most likely because of a corrupted // directory. We will log an error and won't allow to delete/recreate @@ -178,20 +207,27 @@ impl GlobalTimelines { _guard: &tokio::sync::MutexGuard<'a, TimelineLoadLock>, ttid: TenantTimelineId, ) -> Result> { - let (conf, broker_active_set) = TIMELINES_STATE.lock().unwrap().get_dependencies(); + let (conf, broker_active_set, partial_backup_rate_limiter) = + TIMELINES_STATE.lock().unwrap().get_dependencies(); match Timeline::load_timeline(&conf, ttid) { Ok(timeline) => { let tli = Arc::new(timeline); // TODO: prevent concurrent timeline creation/loading - TIMELINES_STATE - .lock() - .unwrap() - .timelines - .insert(ttid, tli.clone()); + { + let mut state = TIMELINES_STATE.lock().unwrap(); - tli.bootstrap(&conf, broker_active_set); + // We may be have been asked to load a timeline that was previously deleted (e.g. from `pull_timeline.rs`). We trust + // that the human doing this manual intervention knows what they are doing, and remove its tombstone. + if state.tombstones.remove(&ttid).is_some() { + warn!("Un-deleted timeline {ttid}"); + } + + state.timelines.insert(ttid, tli.clone()); + } + + tli.bootstrap(&conf, broker_active_set, partial_backup_rate_limiter); Ok(tli) } @@ -216,18 +252,23 @@ impl GlobalTimelines { /// Create a new timeline with the given id. If the timeline already exists, returns /// an existing timeline. 
- pub async fn create( + pub(crate) async fn create( ttid: TenantTimelineId, server_info: ServerInfo, commit_lsn: Lsn, local_start_lsn: Lsn, ) -> Result> { - let (conf, broker_active_set) = { + let (conf, broker_active_set, partial_backup_rate_limiter) = { let state = TIMELINES_STATE.lock().unwrap(); if let Ok(timeline) = state.get(&ttid) { // Timeline already exists, return it. return Ok(timeline); } + + if state.tombstones.contains_key(&ttid) { + anyhow::bail!("Timeline {ttid} is deleted, refusing to recreate"); + } + state.get_dependencies() }; @@ -257,7 +298,12 @@ impl GlobalTimelines { // Bootstrap is transactional, so if it fails, the timeline will be deleted, // and the state on disk should remain unchanged. if let Err(e) = timeline - .init_new(&mut shared_state, &conf, broker_active_set) + .init_new( + &mut shared_state, + &conf, + broker_active_set, + partial_backup_rate_limiter, + ) .await { // Note: the most likely reason for init failure is that the timeline @@ -282,17 +328,19 @@ impl GlobalTimelines { /// Get a timeline from the global map. If it's not present, it doesn't exist on disk, /// or was corrupted and couldn't be loaded on startup. Returned timeline is always valid, /// i.e. loaded in memory and not cancelled. - pub fn get(ttid: TenantTimelineId) -> Result, TimelineError> { - let res = TIMELINES_STATE.lock().unwrap().get(&ttid); - - match res { + pub(crate) fn get(ttid: TenantTimelineId) -> Result, TimelineError> { + let tli_res = { + let state = TIMELINES_STATE.lock().unwrap(); + state.get(&ttid) + }; + match tli_res { Ok(tli) => { if tli.is_cancelled() { return Err(TimelineError::Cancelled(ttid)); } Ok(tli) } - _ => res, + _ => tli_res, } } @@ -321,12 +369,26 @@ impl GlobalTimelines { /// Cancels timeline, then deletes the corresponding data directory. /// If only_local, doesn't remove WAL segments in remote storage. 
- pub async fn delete( + pub(crate) async fn delete( ttid: &TenantTimelineId, only_local: bool, ) -> Result { - let tli_res = TIMELINES_STATE.lock().unwrap().get(ttid); - match tli_res { + let tli_res = { + let state = TIMELINES_STATE.lock().unwrap(); + + if state.tombstones.contains_key(ttid) { + // Presence of a tombstone guarantees that a previous deletion has completed and there is no work to do. + info!("Timeline {ttid} was already deleted"); + return Ok(TimelineDeleteForceResult { + dir_existed: false, + was_active: false, + }); + } + + state.get(ttid) + }; + + let result = match tli_res { Ok(timeline) => { let was_active = timeline.broker_active.load(Ordering::Relaxed); @@ -336,11 +398,6 @@ impl GlobalTimelines { info!("deleting timeline {}, only_local={}", ttid, only_local); let dir_existed = timeline.delete(&mut shared_state, only_local).await?; - // Remove timeline from the map. - // FIXME: re-enable it once we fix the issue with recreation of deleted timelines - // https://github.com/neondatabase/neon/issues/3146 - // TIMELINES_STATE.lock().unwrap().timelines.remove(ttid); - Ok(TimelineDeleteForceResult { dir_existed, was_active, // TODO: we probably should remove this field @@ -356,7 +413,14 @@ impl GlobalTimelines { was_active: false, }) } - } + }; + + // Finalize deletion, by dropping Timeline objects and storing smaller tombstones. The tombstones + // are used to prevent still-running computes from re-creating the same timeline when they send data, + // and to speed up repeated deletion calls by avoiding re-listing objects. + TIMELINES_STATE.lock().unwrap().delete(*ttid); + + result } /// Deactivates and deletes all timelines for the tenant. 
Returns map of all timelines which @@ -402,19 +466,20 @@ impl GlobalTimelines { tenant_id, ))?; - // FIXME: we temporarily disabled removing timelines from the map, see `delete_force` - // let tlis_after_delete = Self::get_all_for_tenant(*tenant_id); - // if !tlis_after_delete.is_empty() { - // // Some timelines were created while we were deleting them, returning error - // // to the caller, so it can retry later. - // bail!( - // "failed to delete all timelines for tenant {}: some timelines were created while we were deleting them", - // tenant_id - // ); - // } - Ok(deleted) } + + pub fn housekeeping(tombstone_ttl: &Duration) { + let mut state = TIMELINES_STATE.lock().unwrap(); + + // We keep tombstones long enough to have a good chance of preventing rogue computes from re-creating deleted + // timelines. If a compute kept running for longer than this TTL (or across a safekeeper restart) then they + // may recreate a deleted timeline. + let now = Instant::now(); + state + .tombstones + .retain(|_, v| now.duration_since(*v) < *tombstone_ttl); + } } #[derive(Clone, Copy, Serialize)] diff --git a/safekeeper/src/timelines_set.rs b/safekeeper/src/timelines_set.rs index ea8e23bb72..d6eea79f82 100644 --- a/safekeeper/src/timelines_set.rs +++ b/safekeeper/src/timelines_set.rs @@ -80,6 +80,10 @@ impl TimelineSetGuard { self.timelines_set.set_present(self.tli.clone(), present); true } + + pub fn get(&self) -> bool { + self.is_present + } } impl Drop for TimelineSetGuard { diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 58591aecfa..aa1a6696a1 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -12,7 +12,6 @@ use std::cmp::min; use std::collections::HashSet; use std::num::NonZeroU32; use std::pin::Pin; -use std::sync::Arc; use std::time::Duration; use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr; @@ -23,19 +22,17 @@ use tokio::fs::File; use tokio::select; use tokio::sync::mpsc::{self, Receiver, Sender}; 
-use tokio::sync::watch; +use tokio::sync::{watch, OnceCell}; use tokio::time::sleep; use tracing::*; use utils::{id::TenantTimelineId, lsn::Lsn}; use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS, WAL_BACKUP_TASKS}; -use crate::timeline::{FullAccessTimeline, PeerInfo, Timeline}; -use crate::timeline_manager::StateSnapshot; +use crate::timeline::{PeerInfo, WalResidentTimeline}; +use crate::timeline_manager::{Manager, StateSnapshot}; use crate::{SafeKeeperConf, WAL_BACKUP_RUNTIME}; -use once_cell::sync::OnceCell; - const UPLOAD_FAILURE_RETRY_MIN_MS: u64 = 10; const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000; @@ -48,7 +45,7 @@ pub struct WalBackupTaskHandle { } /// Do we have anything to upload to S3, i.e. should safekeepers run backup activity? -pub fn is_wal_backup_required( +pub(crate) fn is_wal_backup_required( wal_seg_size: usize, num_computes: usize, state: &StateSnapshot, @@ -61,35 +58,33 @@ pub fn is_wal_backup_required( /// Based on peer information determine which safekeeper should offload; if it /// is me, run (per timeline) task, if not yet. OTOH, if it is not me and task /// is running, kill it. 
-pub async fn update_task( - conf: &SafeKeeperConf, - tli: &Arc, - need_backup: bool, - state: &StateSnapshot, - entry: &mut Option, -) { +pub(crate) async fn update_task(mgr: &mut Manager, need_backup: bool, state: &StateSnapshot) { let (offloader, election_dbg_str) = - determine_offloader(&state.peers, state.backup_lsn, tli.ttid, conf); - let elected_me = Some(conf.my_id) == offloader; + determine_offloader(&state.peers, state.backup_lsn, mgr.tli.ttid, &mgr.conf); + let elected_me = Some(mgr.conf.my_id) == offloader; let should_task_run = need_backup && elected_me; // start or stop the task - if should_task_run != (entry.is_some()) { + if should_task_run != (mgr.backup_task.is_some()) { if should_task_run { info!("elected for backup: {}", election_dbg_str); let (shutdown_tx, shutdown_rx) = mpsc::channel(1); - let async_task = backup_task_main(tli.clone(), conf.backup_parallel_jobs, shutdown_rx); + let async_task = backup_task_main( + mgr.wal_resident_timeline(), + mgr.conf.backup_parallel_jobs, + shutdown_rx, + ); - let handle = if conf.current_thread_runtime { + let handle = if mgr.conf.current_thread_runtime { tokio::spawn(async_task) } else { WAL_BACKUP_RUNTIME.spawn(async_task) }; - *entry = Some(WalBackupTaskHandle { + mgr.backup_task = Some(WalBackupTaskHandle { shutdown_tx, handle, }); @@ -101,7 +96,7 @@ pub async fn update_task( // someone else has been elected info!("stepping down from backup: {}", election_dbg_str); } - shut_down_task(entry).await; + shut_down_task(&mut mgr.backup_task).await; } } } @@ -122,6 +117,7 @@ async fn shut_down_task(entry: &mut Option) { /// time we have several ones as they PUT the same files. Also, /// - frequently changing the offloader would be bad; /// - electing seriously lagging safekeeper is undesirable; +/// /// So we deterministically choose among the reasonably caught up candidates. /// TODO: take into account failed attempts to deal with hypothetical situation /// where s3 is unreachable only for some sks. 
@@ -169,7 +165,7 @@ fn determine_offloader( } } -static REMOTE_STORAGE: OnceCell> = OnceCell::new(); +static REMOTE_STORAGE: OnceCell> = OnceCell::const_new(); // Storage must be configured and initialized when this is called. fn get_configured_remote_storage() -> &'static GenericRemoteStorage { @@ -180,18 +176,26 @@ fn get_configured_remote_storage() -> &'static GenericRemoteStorage { .unwrap() } -pub fn init_remote_storage(conf: &SafeKeeperConf) { +pub async fn init_remote_storage(conf: &SafeKeeperConf) { // TODO: refactor REMOTE_STORAGE to avoid using global variables, and provide // dependencies to all tasks instead. - REMOTE_STORAGE.get_or_init(|| { - conf.remote_storage - .as_ref() - .map(|c| GenericRemoteStorage::from_config(c).expect("failed to create remote storage")) - }); + REMOTE_STORAGE + .get_or_init(|| async { + if let Some(conf) = conf.remote_storage.as_ref() { + Some( + GenericRemoteStorage::from_config(conf) + .await + .expect("failed to create remote storage"), + ) + } else { + None + } + }) + .await; } struct WalBackupTask { - timeline: FullAccessTimeline, + timeline: WalResidentTimeline, timeline_dir: Utf8PathBuf, wal_seg_size: usize, parallel_jobs: usize, @@ -200,16 +204,12 @@ struct WalBackupTask { /// Offload single timeline. 
#[instrument(name = "WAL backup", skip_all, fields(ttid = %tli.ttid))] -async fn backup_task_main(tli: Arc, parallel_jobs: usize, mut shutdown_rx: Receiver<()>) { +async fn backup_task_main( + tli: WalResidentTimeline, + parallel_jobs: usize, + mut shutdown_rx: Receiver<()>, +) { let _guard = WAL_BACKUP_TASKS.guard(); - - let tli = match tli.full_access_guard().await { - Ok(tli) => tli, - Err(e) => { - error!("backup error: {}", e); - return; - } - }; info!("started"); let mut wb = WalBackupTask { @@ -304,7 +304,7 @@ impl WalBackupTask { } async fn backup_lsn_range( - timeline: &FullAccessTimeline, + timeline: &WalResidentTimeline, backup_lsn: &mut Lsn, end_lsn: Lsn, wal_seg_size: usize, @@ -483,6 +483,16 @@ pub(crate) async fn backup_partial_segment( .await } +pub(crate) async fn copy_partial_segment( + source: &RemotePath, + destination: &RemotePath, +) -> Result<()> { + let storage = get_configured_remote_storage(); + let cancel = CancellationToken::new(); + + storage.copy_object(source, destination, &cancel).await +} + pub async fn read_object( file_path: &RemotePath, offset: u64, @@ -545,7 +555,10 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { &cancel, ) .await? - .keys; + .keys + .into_iter() + .map(|o| o.key) + .collect::>(); if files.is_empty() { return Ok(()); // done } @@ -613,7 +626,7 @@ pub async fn copy_s3_segments( let uploaded_segments = &files .iter() - .filter_map(|file| file.object_name().map(ToOwned::to_owned)) + .filter_map(|o| o.key.object_name().map(ToOwned::to_owned)) .collect::>(); debug!( diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index ed5ddb71f5..675a051887 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -17,30 +17,34 @@ //! file. Code updates state in the control file before doing any S3 operations. //! This way control file stores information about all potentially existing //! 
remote partial segments and can clean them up after uploading a newer version. - use camino::Utf8PathBuf; use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; use remote_storage::RemotePath; use serde::{Deserialize, Serialize}; use tracing::{debug, error, info, instrument, warn}; -use utils::lsn::Lsn; +use utils::{id::NodeId, lsn::Lsn}; use crate::{ - metrics::{PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS}, + metrics::{MISC_OPERATION_SECONDS, PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS}, + rate_limit::{rand_duration, RateLimiter}, safekeeper::Term, - timeline::FullAccessTimeline, + timeline::WalResidentTimeline, + timeline_manager::StateSnapshot, wal_backup::{self, remote_timeline_path}, SafeKeeperConf, }; #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub enum UploadStatus { - /// Upload is in progress + /// Upload is in progress. This status should be used only for garbage collection, + /// don't read data from the remote storage with this status. InProgress, - /// Upload is finished + /// Upload is finished. There is always at most one segment with this status. + /// It means that the segment is actual and can be used. Uploaded, - /// Deletion is in progress + /// Deletion is in progress. This status should be used only for garbage collection, + /// don't read data from the remote storage with this status. Deleting, } @@ -50,6 +54,10 @@ pub struct PartialRemoteSegment { pub name: String, pub commit_lsn: Lsn, pub flush_lsn: Lsn, + // We should use last_log_term here, otherwise it's possible to have inconsistent data in the + // remote storage. 
+ // + // More info here: https://github.com/neondatabase/neon/pull/8022#discussion_r1654738405 pub term: Term, } @@ -60,6 +68,10 @@ impl PartialRemoteSegment { && self.flush_lsn == other.flush_lsn && self.term == other.term } + + pub(crate) fn remote_path(&self, remote_timeline_path: &RemotePath) -> RemotePath { + remote_timeline_path.join(&self.name) + } } // NB: these structures are a part of a control_file, you can't change them without @@ -69,19 +81,73 @@ pub struct State { pub segments: Vec, } +#[derive(Debug)] +pub(crate) struct ReplaceUploadedSegment { + pub(crate) previous: PartialRemoteSegment, + pub(crate) current: PartialRemoteSegment, +} + impl State { /// Find an Uploaded segment. There should be only one Uploaded segment at a time. - fn uploaded_segment(&self) -> Option { + pub(crate) fn uploaded_segment(&self) -> Option { self.segments .iter() .find(|seg| seg.status == UploadStatus::Uploaded) .cloned() } + + /// Replace the name of the Uploaded segment (if one exists) in order to match + /// it with `destination` safekeeper. Returns a description of the change or None + /// wrapped in anyhow::Result. + pub(crate) fn replace_uploaded_segment( + &mut self, + source: NodeId, + destination: NodeId, + ) -> anyhow::Result> { + let current = self + .segments + .iter_mut() + .find(|seg| seg.status == UploadStatus::Uploaded); + + let current = match current { + Some(some) => some, + None => { + return anyhow::Ok(None); + } + }; + + // Sanity check that the partial segment we are replacing is belongs + // to the `source` SK. 
+ if !current + .name + .ends_with(format!("sk{}.partial", source.0).as_str()) + { + anyhow::bail!( + "Partial segment name ({}) doesn't match self node id ({})", + current.name, + source + ); + } + + let previous = current.clone(); + + let new_name = current.name.replace( + format!("_sk{}", source.0).as_str(), + format!("_sk{}", destination.0).as_str(), + ); + + current.name = new_name; + + anyhow::Ok(Some(ReplaceUploadedSegment { + previous, + current: current.clone(), + })) + } } struct PartialBackup { wal_seg_size: usize, - tli: FullAccessTimeline, + tli: WalResidentTimeline, conf: SafeKeeperConf, local_prefix: Utf8PathBuf, remote_timeline_path: RemotePath, @@ -128,17 +194,17 @@ impl PartialBackup { let sk_info = self.tli.get_safekeeper_info(&self.conf).await; let flush_lsn = Lsn(sk_info.flush_lsn); let commit_lsn = Lsn(sk_info.commit_lsn); - let term = sk_info.term; + let last_log_term = sk_info.last_log_term; let segno = self.segno(flush_lsn); - let name = self.remote_segment_name(segno, term, commit_lsn, flush_lsn); + let name = self.remote_segment_name(segno, last_log_term, commit_lsn, flush_lsn); PartialRemoteSegment { status: UploadStatus::InProgress, name, commit_lsn, flush_lsn, - term, + term: last_log_term, } } @@ -151,7 +217,7 @@ impl PartialBackup { let backup_bytes = flush_lsn.segment_offset(self.wal_seg_size); let local_path = self.local_prefix.join(self.local_segment_name(segno)); - let remote_path = self.remote_timeline_path.join(&prepared.name); + let remote_path = prepared.remote_path(&self.remote_timeline_path); // Upload first `backup_bytes` bytes of the segment to the remote storage. wal_backup::backup_partial_segment(&local_path, &remote_path, backup_bytes).await?; @@ -161,7 +227,7 @@ impl PartialBackup { // If the term changed, we cannot guarantee the validity of the uploaded data. // If the term is the same, we know the data is not corrupted. 
let sk_info = self.tli.get_safekeeper_info(&self.conf).await; - if sk_info.term != prepared.term { + if sk_info.last_log_term != prepared.term { anyhow::bail!("term changed during upload"); } assert!(prepared.commit_lsn <= Lsn(sk_info.commit_lsn)); @@ -196,6 +262,9 @@ impl PartialBackup { /// Upload the latest version of the partial segment and garbage collect older versions. #[instrument(name = "upload", skip_all, fields(name = %prepared.name))] async fn do_upload(&mut self, prepared: &PartialRemoteSegment) -> anyhow::Result<()> { + let _timer = MISC_OPERATION_SECONDS + .with_label_values(&["partial_do_upload"]) + .start_timer(); info!("starting upload {:?}", prepared); let state_0 = self.state.clone(); @@ -248,6 +317,18 @@ impl PartialBackup { }) .collect(); + if new_segments.len() == 1 { + // we have an uploaded segment, it must not be deleted from remote storage + segments_to_delete.retain(|name| name != &new_segments[0].name); + } else { + // there should always be zero or one uploaded segment + assert!( + new_segments.is_empty(), + "too many uploaded segments: {:?}", + new_segments + ); + } + info!("deleting objects: {:?}", segments_to_delete); let mut objects_to_delete = vec![]; for seg in segments_to_delete.iter() { @@ -270,10 +351,36 @@ impl PartialBackup { } } +/// Check if everything is uploaded and partial backup task doesn't need to run. +pub(crate) fn needs_uploading( + state: &StateSnapshot, + uploaded: &Option, +) -> bool { + match uploaded { + Some(uploaded) => { + uploaded.status != UploadStatus::Uploaded + || uploaded.flush_lsn != state.flush_lsn + || uploaded.commit_lsn != state.commit_lsn + || uploaded.term != state.last_log_term + } + None => true, + } +} + +/// Main task for partial backup. It waits for the flush_lsn to change and then uploads the +/// partial segment to the remote storage. It also does garbage collection of old segments. 
+/// +/// When there is nothing more to do and the last segment was successfully uploaded, the task +/// returns PartialRemoteSegment, to signal readiness for offloading the timeline. #[instrument(name = "Partial backup", skip_all, fields(ttid = %tli.ttid))] -pub async fn main_task(tli: FullAccessTimeline, conf: SafeKeeperConf) { +pub async fn main_task( + tli: WalResidentTimeline, + conf: SafeKeeperConf, + limiter: RateLimiter, +) -> Option { debug!("started"); let await_duration = conf.partial_backup_timeout; + let mut first_iteration = true; let (_, persistent_state) = tli.get_state().await; let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx(); @@ -285,7 +392,7 @@ pub async fn main_task(tli: FullAccessTimeline, conf: SafeKeeperConf) { Ok(path) => path, Err(e) => { error!("failed to create remote path: {:?}", e); - return; + return None; } }; @@ -320,19 +427,13 @@ pub async fn main_task(tli: FullAccessTimeline, conf: SafeKeeperConf) { // wait until we have something to upload let uploaded_segment = backup.state.uploaded_segment(); if let Some(seg) = &uploaded_segment { - // if we already uploaded something, wait until we have something new - while flush_lsn_rx.borrow().lsn == seg.flush_lsn + // check if uploaded segment matches the current state + if flush_lsn_rx.borrow().lsn == seg.flush_lsn && *commit_lsn_rx.borrow() == seg.commit_lsn && flush_lsn_rx.borrow().term == seg.term { - tokio::select! { - _ = backup.tli.cancel.cancelled() => { - info!("timeline canceled"); - return; - } - _ = commit_lsn_rx.changed() => {} - _ = flush_lsn_rx.changed() => {} - } + // we have nothing to do, the last segment is already uploaded + return Some(seg.clone()); } } @@ -341,12 +442,21 @@ pub async fn main_task(tli: FullAccessTimeline, conf: SafeKeeperConf) { tokio::select! { _ = backup.tli.cancel.cancelled() => { info!("timeline canceled"); - return; + return None; } _ = flush_lsn_rx.changed() => {} } } + // smoothing the load after restart, by sleeping for a random time. 
+ // if this is not the first iteration, we will wait for the full await_duration + let await_duration = if first_iteration { + first_iteration = false; + rand_duration(&await_duration) + } else { + await_duration + }; + // fixing the segno and waiting some time to prevent reuploading the same segment too often let pending_segno = backup.segno(flush_lsn_rx.borrow().lsn); let timeout = tokio::time::sleep(await_duration); @@ -358,7 +468,7 @@ pub async fn main_task(tli: FullAccessTimeline, conf: SafeKeeperConf) { tokio::select! { _ = backup.tli.cancel.cancelled() => { info!("timeline canceled"); - return; + return None; } _ = commit_lsn_rx.changed() => {} _ = flush_lsn_rx.changed() => { @@ -381,6 +491,9 @@ pub async fn main_task(tli: FullAccessTimeline, conf: SafeKeeperConf) { continue 'outer; } + // limit concurrent uploads + let _upload_permit = limiter.acquire_partial_backup().await; + let prepared = backup.prepare_upload().await; if let Some(seg) = &uploaded_segment { if seg.eq_without_status(&prepared) { diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index 4a97eb3993..16f7748eb4 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -4,9 +4,10 @@ //! use anyhow::{Context, Result}; use postgres_backend::QueryError; -use std::{future, time::Duration}; +use std::time::Duration; use tokio::net::TcpStream; use tokio_io_timeout::TimeoutReader; +use tokio_util::sync::CancellationToken; use tracing::*; use utils::{auth::Scope, measured_stream::MeasuredStream}; @@ -42,7 +43,7 @@ pub async fn task_main( error!("connection handler exited: {}", err); } } - .instrument(info_span!("", cid = %conn_id, ttid = field::Empty)), + .instrument(info_span!("", cid = %conn_id, ttid = field::Empty, application_name = field::Empty)), ); } } @@ -100,7 +101,7 @@ async fn handle_socket( // libpq protocol between safekeeper and walproposer / pageserver // We don't use shutdown. 
pgbackend - .run(&mut conn_handler, future::pending::<()>) + .run(&mut conn_handler, &CancellationToken::new()) .await } diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 0c1731937c..ded8571a3e 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -23,7 +23,9 @@ use tokio::io::{AsyncReadExt, AsyncSeekExt}; use tracing::*; use utils::crashsafe::durable_rename; -use crate::metrics::{time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS}; +use crate::metrics::{ + time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS, WAL_STORAGE_OPERATION_SECONDS, +}; use crate::state::TimelinePersistentState; use crate::wal_backup::{read_object, remote_timeline_path}; use crate::SafeKeeperConf; @@ -211,7 +213,7 @@ impl PhysicalStorage { /// Returns `file` and `is_partial`. async fn open_or_create(&mut self, segno: XLogSegNo) -> Result<(File, bool)> { let (wal_file_path, wal_file_partial_path) = - wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?; + wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size); // Try to open already completed segment if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path).await { @@ -231,11 +233,7 @@ impl PhysicalStorage { // half initialized segment, first bake it under tmp filename and // then rename. 
let tmp_path = self.timeline_dir.join("waltmp"); - #[allow(clippy::suspicious_open_options)] - let mut file = OpenOptions::new() - .create(true) - .write(true) - .open(&tmp_path) + let mut file = File::create(&tmp_path) .await .with_context(|| format!("Failed to open tmp wal file {:?}", &tmp_path))?; @@ -280,7 +278,7 @@ impl PhysicalStorage { // Rename partial file to completed file let (wal_file_path, wal_file_partial_path) = - wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?; + wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size); fs::rename(wal_file_partial_path, wal_file_path).await?; } else { // otherwise, file can be reused later @@ -335,6 +333,10 @@ impl Storage for PhysicalStorage { } async fn initialize_first_segment(&mut self, init_lsn: Lsn) -> Result<()> { + let _timer = WAL_STORAGE_OPERATION_SECONDS + .with_label_values(&["initialize_first_segment"]) + .start_timer(); + let segno = init_lsn.segment_number(self.wal_seg_size); let (mut file, _) = self.open_or_create(segno).await?; let major_pg_version = self.pg_version / 10000; @@ -426,6 +428,10 @@ impl Storage for PhysicalStorage { /// Truncate written WAL by removing all WAL segments after the given LSN. /// end_pos must point to the end of the WAL record. 
async fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> { + let _timer = WAL_STORAGE_OPERATION_SECONDS + .with_label_values(&["truncate_wal"]) + .start_timer(); + // Streaming must not create a hole, so truncate cannot be called on non-written lsn if self.write_lsn != Lsn(0) && end_pos > self.write_lsn { bail!( @@ -465,7 +471,7 @@ impl Storage for PhysicalStorage { if !is_partial { // Make segment partial once again let (wal_file_path, wal_file_partial_path) = - wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?; + wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size); fs::rename(wal_file_path, wal_file_partial_path).await?; } @@ -501,6 +507,10 @@ async fn remove_segments_from_disk( wal_seg_size: usize, remove_predicate: impl Fn(XLogSegNo) -> bool, ) -> Result<()> { + let _timer = WAL_STORAGE_OPERATION_SECONDS + .with_label_values(&["remove_segments_from_disk"]) + .start_timer(); + let mut n_removed = 0; let mut min_removed = u64::MAX; let mut max_removed = u64::MIN; @@ -745,7 +755,7 @@ pub(crate) async fn open_wal_file( segno: XLogSegNo, wal_seg_size: usize, ) -> Result<(tokio::fs::File, bool)> { - let (wal_file_path, wal_file_partial_path) = wal_file_paths(timeline_dir, segno, wal_seg_size)?; + let (wal_file_path, wal_file_partial_path) = wal_file_paths(timeline_dir, segno, wal_seg_size); // First try to open the .partial file. 
let mut partial_path = wal_file_path.to_owned(); @@ -771,9 +781,9 @@ pub fn wal_file_paths( timeline_dir: &Utf8Path, segno: XLogSegNo, wal_seg_size: usize, -) -> Result<(Utf8PathBuf, Utf8PathBuf)> { +) -> (Utf8PathBuf, Utf8PathBuf) { let wal_file_name = XLogFileName(PG_TLI, segno, wal_seg_size); let wal_file_path = timeline_dir.join(wal_file_name.clone()); let wal_file_partial_path = timeline_dir.join(wal_file_name + ".partial"); - Ok((wal_file_path, wal_file_partial_path)) + (wal_file_path, wal_file_partial_path) } diff --git a/safekeeper/tests/random_test.rs b/safekeeper/tests/random_test.rs index 6c6f6a8c96..7bdee35cd7 100644 --- a/safekeeper/tests/random_test.rs +++ b/safekeeper/tests/random_test.rs @@ -10,7 +10,7 @@ use crate::walproposer_sim::{ pub mod walproposer_sim; // Generates 2000 random seeds and runs a schedule for each of them. -// If you seed this test fail, please report the last seed to the +// If you see this test fail, please report the last seed to the // @safekeeper team. 
#[test] fn test_random_schedules() -> anyhow::Result<()> { diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index 47539872a6..771d905c90 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -16,12 +16,12 @@ use desim::{ use hyper::Uri; use safekeeper::{ safekeeper::{ProposerAcceptorMessage, SafeKeeper, ServerInfo, UNKNOWN_SERVER_VERSION}, - state::TimelinePersistentState, + state::{TimelinePersistentState, TimelineState}, timeline::TimelineError, wal_storage::Storage, SafeKeeperConf, }; -use tracing::{debug, info_span}; +use tracing::{debug, info_span, warn}; use utils::{ id::{NodeId, TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, @@ -68,7 +68,7 @@ impl GlobalMap { let control_store = DiskStateStorage::new(disk.clone()); let wal_store = DiskWALStorage::new(disk.clone(), &control_store)?; - let sk = SafeKeeper::new(control_store, wal_store, conf.my_id)?; + let sk = SafeKeeper::new(TimelineState::new(control_store), wal_store, conf.my_id)?; timelines.insert( ttid, SharedState { @@ -118,7 +118,11 @@ impl GlobalMap { let control_store = DiskStateStorage::new(disk_timeline.clone()); let wal_store = DiskWALStorage::new(disk_timeline.clone(), &control_store)?; - let sk = SafeKeeper::new(control_store, wal_store, self.conf.my_id)?; + let sk = SafeKeeper::new( + TimelineState::new(control_store), + wal_store, + self.conf.my_id, + )?; self.timelines.insert( ttid, @@ -177,9 +181,13 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { sk_auth_token: None, current_thread_runtime: false, walsenders_keep_horizon: false, - partial_backup_enabled: false, partial_backup_timeout: Duration::from_secs(0), disable_periodic_broker_push: false, + enable_offload: false, + delete_offloaded_wal: false, + control_file_save_interval: Duration::from_secs(1), + partial_backup_concurrency: 1, + eviction_min_resident: Duration::ZERO, }; let mut global = 
GlobalMap::new(disk, conf.clone())?; @@ -247,7 +255,12 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { NetEvent::Message(msg) => { let res = conn.process_any(msg, &mut global); if res.is_err() { - debug!("conn {:?} error: {:#}", connection_id, res.unwrap_err()); + let e = res.unwrap_err(); + let estr = e.to_string(); + if !estr.contains("finished processing START_REPLICATION") { + warn!("conn {:?} error: {:?}", connection_id, e); + panic!("unexpected error at safekeeper: {:#}", e); + } conns.remove(&connection_id); break; } diff --git a/safekeeper/tests/walproposer_sim/walproposer_disk.rs b/safekeeper/tests/walproposer_sim/walproposer_disk.rs index aa329bd2f0..123cd6bad6 100644 --- a/safekeeper/tests/walproposer_sim/walproposer_disk.rs +++ b/safekeeper/tests/walproposer_sim/walproposer_disk.rs @@ -172,7 +172,7 @@ fn write_walrecord_to_disk( let mut freespace = insert_freespace(curr_ptr); let mut written: usize = 0; - assert!(freespace >= std::mem::size_of::()); + assert!(freespace >= size_of::()); for mut rdata in rdatas { while rdata.len() >= freespace { diff --git a/scripts/benchmark_durations.py b/scripts/benchmark_durations.py index 01f34a1b96..4ca433679a 100755 --- a/scripts/benchmark_durations.py +++ b/scripts/benchmark_durations.py @@ -67,6 +67,7 @@ FALLBACK_DURATION = { "test_runner/performance/test_copy.py::test_copy[neon]": 13.817, "test_runner/performance/test_copy.py::test_copy[vanilla]": 11.736, "test_runner/performance/test_gc_feedback.py::test_gc_feedback": 575.735, + "test_runner/performance/test_gc_feedback.py::test_gc_feedback_with_snapshots": 575.735, "test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 14.868, "test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 14.393, "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 20.588, diff --git a/scripts/ingest_regress_test_result-new-format.py 
b/scripts/ingest_regress_test_result-new-format.py index cff1d9875f..40d7254e00 100644 --- a/scripts/ingest_regress_test_result-new-format.py +++ b/scripts/ingest_regress_test_result-new-format.py @@ -18,6 +18,7 @@ import psycopg2 from psycopg2.extras import execute_values CREATE_TABLE = """ +CREATE TYPE arch AS ENUM ('ARM64', 'X64', 'UNKNOWN'); CREATE TABLE IF NOT EXISTS results ( id BIGSERIAL PRIMARY KEY, parent_suite TEXT NOT NULL, @@ -28,6 +29,7 @@ CREATE TABLE IF NOT EXISTS results ( stopped_at TIMESTAMPTZ NOT NULL, duration INT NOT NULL, flaky BOOLEAN NOT NULL, + arch arch DEFAULT 'X64', build_type TEXT NOT NULL, pg_version INT NOT NULL, run_id BIGINT NOT NULL, @@ -35,7 +37,7 @@ CREATE TABLE IF NOT EXISTS results ( reference TEXT NOT NULL, revision CHAR(40) NOT NULL, raw JSONB COMPRESSION lz4 NOT NULL, - UNIQUE (parent_suite, suite, name, build_type, pg_version, started_at, stopped_at, run_id) + UNIQUE (parent_suite, suite, name, arch, build_type, pg_version, started_at, stopped_at, run_id) ); """ @@ -50,6 +52,7 @@ class Row: stopped_at: datetime duration: int flaky: bool + arch: str build_type: str pg_version: int run_id: int @@ -121,6 +124,14 @@ def ingest_test_result( raw.pop("labels") raw.pop("extra") + # All allure parameters are prefixed with "__", see test_runner/fixtures/parametrize.py + parameters = { + p["name"].removeprefix("__"): p["value"] + for p in test["parameters"] + if p["name"].startswith("__") + } + arch = parameters.get("arch", "UNKNOWN").strip("'") + build_type, pg_version, unparametrized_name = parse_test_name(test["name"]) labels = {label["name"]: label["value"] for label in test["labels"]} row = Row( @@ -132,6 +143,7 @@ def ingest_test_result( stopped_at=datetime.fromtimestamp(test["time"]["stop"] / 1000, tz=timezone.utc), duration=test["time"]["duration"], flaky=test["flaky"] or test["retriesStatusChange"], + arch=arch, build_type=build_type, pg_version=pg_version, run_id=run_id, diff --git a/scripts/ps_ec2_setup_instance_store 
b/scripts/ps_ec2_setup_instance_store index 1f88f252eb..7c383e322f 100755 --- a/scripts/ps_ec2_setup_instance_store +++ b/scripts/ps_ec2_setup_instance_store @@ -44,7 +44,7 @@ run the following commands from the top of the neon.git checkout # test suite run export TEST_OUTPUT="$TEST_OUTPUT" - DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest test_runner/performance/test_latency.py + DEFAULT_PG_VERSION=16 BUILD_TYPE=release ./scripts/pytest test_runner/performance/test_latency.py # for interactive use export NEON_REPO_DIR="$NEON_REPO_DIR" diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 0a4af543ab..15acd0e49c 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -642,8 +642,7 @@ async fn main() -> Result<(), Box> { logging::replace_panic_hook_with_tracing_panic_hook().forget(); // initialize sentry if SENTRY_DSN is provided let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); - info!("version: {GIT_VERSION}"); - info!("build_tag: {BUILD_TAG}"); + info!("version: {GIT_VERSION} build_tag: {BUILD_TAG}"); metrics::set_build_info_metric(GIT_VERSION, BUILD_TAG); // On any shutdown signal, log receival and exit. 
diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index b54dea5d47..ecaac04915 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -18,6 +18,7 @@ anyhow.workspace = true aws-config.workspace = true bytes.workspace = true camino.workspace = true +chrono.workspace = true clap.workspace = true fail.workspace = true futures.workspace = true @@ -31,6 +32,7 @@ once_cell.workspace = true pageserver_api.workspace = true pageserver_client.workspace = true postgres_connection.workspace = true +rand.workspace = true reqwest = { workspace = true, features = ["stream"] } routerify.workspace = true serde.workspace = true @@ -44,7 +46,12 @@ scopeguard.workspace = true strum.workspace = true strum_macros.workspace = true -diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] } +diesel = { version = "2.1.4", features = [ + "serde_json", + "postgres", + "r2d2", + "chrono", +] } diesel_migrations = { version = "2.1.0" } r2d2 = { version = "0.8.10" } @@ -52,4 +59,3 @@ utils = { path = "../libs/utils/" } metrics = { path = "../libs/metrics/" } control_plane = { path = "../control_plane" } workspace_hack = { version = "0.1", path = "../workspace_hack" } - diff --git a/storage_controller/client/Cargo.toml b/storage_controller/client/Cargo.toml new file mode 100644 index 0000000000..c3bfe2bfd2 --- /dev/null +++ b/storage_controller/client/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "storage_controller_client" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +pageserver_api.workspace = true +pageserver_client.workspace = true +thiserror.workspace = true +async-trait.workspace = true +reqwest.workspace = true +utils.workspace = true +serde.workspace = true +workspace_hack = { version = "0.1", path = "../../workspace_hack" } +tokio-postgres.workspace = true +tokio-stream.workspace = true +tokio.workspace = true +futures.workspace = true +tokio-util.workspace = true 
+anyhow.workspace = true +postgres.workspace = true +bytes.workspace = true diff --git a/storage_controller/client/src/control_api.rs b/storage_controller/client/src/control_api.rs new file mode 100644 index 0000000000..a981b5020e --- /dev/null +++ b/storage_controller/client/src/control_api.rs @@ -0,0 +1,62 @@ +use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt}; +use reqwest::{Method, Url}; +use serde::{de::DeserializeOwned, Serialize}; +use std::str::FromStr; + +pub struct Client { + base_url: Url, + jwt_token: Option, + client: reqwest::Client, +} + +impl Client { + pub fn new(base_url: Url, jwt_token: Option) -> Self { + Self { + base_url, + jwt_token, + client: reqwest::ClientBuilder::new() + .build() + .expect("Failed to construct http client"), + } + } + + /// Simple HTTP request wrapper for calling into storage controller + pub async fn dispatch( + &self, + method: Method, + path: String, + body: Option, + ) -> mgmt_api::Result + where + RQ: Serialize + Sized, + RS: DeserializeOwned + Sized, + { + // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out + // for general purpose API access. 
+ let url = Url::from_str(&format!( + "http://{}:{}/{path}", + self.base_url.host_str().unwrap(), + self.base_url.port().unwrap() + )) + .unwrap(); + + let mut builder = self.client.request(method, url); + if let Some(body) = body { + builder = builder.json(&body) + } + if let Some(jwt_token) = &self.jwt_token { + builder = builder.header( + reqwest::header::AUTHORIZATION, + format!("Bearer {jwt_token}"), + ); + } + + let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?; + let response = response.error_from_body().await?; + + response + .json() + .await + .map_err(pageserver_client::mgmt_api::Error::ReceiveBody) + } +} diff --git a/storage_controller/client/src/lib.rs b/storage_controller/client/src/lib.rs new file mode 100644 index 0000000000..6d5e202942 --- /dev/null +++ b/storage_controller/client/src/lib.rs @@ -0,0 +1 @@ +pub mod control_api; diff --git a/storage_controller/migrations/2024-07-23-191537_create_metadata_health/down.sql b/storage_controller/migrations/2024-07-23-191537_create_metadata_health/down.sql new file mode 100644 index 0000000000..1ecfc8786f --- /dev/null +++ b/storage_controller/migrations/2024-07-23-191537_create_metadata_health/down.sql @@ -0,0 +1 @@ +DROP TABLE metadata_health; \ No newline at end of file diff --git a/storage_controller/migrations/2024-07-23-191537_create_metadata_health/up.sql b/storage_controller/migrations/2024-07-23-191537_create_metadata_health/up.sql new file mode 100644 index 0000000000..fa87eda119 --- /dev/null +++ b/storage_controller/migrations/2024-07-23-191537_create_metadata_health/up.sql @@ -0,0 +1,14 @@ +CREATE TABLE metadata_health ( + tenant_id VARCHAR NOT NULL, + shard_number INTEGER NOT NULL, + shard_count INTEGER NOT NULL, + PRIMARY KEY(tenant_id, shard_number, shard_count), + -- Rely on cascade behavior for delete + FOREIGN KEY(tenant_id, shard_number, shard_count) REFERENCES tenant_shards ON DELETE CASCADE, + healthy BOOLEAN NOT NULL DEFAULT TRUE, + last_scrubbed_at 
TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + + +INSERT INTO metadata_health(tenant_id, shard_number, shard_count) +SELECT tenant_id, shard_number, shard_count FROM tenant_shards; diff --git a/storage_controller/migrations/2024-07-26-140924_create_leader/down.sql b/storage_controller/migrations/2024-07-26-140924_create_leader/down.sql new file mode 100644 index 0000000000..53222c614e --- /dev/null +++ b/storage_controller/migrations/2024-07-26-140924_create_leader/down.sql @@ -0,0 +1 @@ +DROP TABLE controllers; diff --git a/storage_controller/migrations/2024-07-26-140924_create_leader/up.sql b/storage_controller/migrations/2024-07-26-140924_create_leader/up.sql new file mode 100644 index 0000000000..90546948cb --- /dev/null +++ b/storage_controller/migrations/2024-07-26-140924_create_leader/up.sql @@ -0,0 +1,5 @@ +CREATE TABLE controllers ( + address VARCHAR NOT NULL, + started_at TIMESTAMPTZ NOT NULL, + PRIMARY KEY(address, started_at) +); diff --git a/storage_controller/src/background_node_operations.rs b/storage_controller/src/background_node_operations.rs index 74b7e7c849..6f1355eb68 100644 --- a/storage_controller/src/background_node_operations.rs +++ b/storage_controller/src/background_node_operations.rs @@ -3,7 +3,7 @@ use std::{borrow::Cow, fmt::Debug, fmt::Display}; use tokio_util::sync::CancellationToken; use utils::id::NodeId; -pub(crate) const MAX_RECONCILES_PER_OPERATION: usize = 10; +pub(crate) const MAX_RECONCILES_PER_OPERATION: usize = 32; #[derive(Copy, Clone)] pub(crate) struct Drain { diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index a1d051f150..c46539485c 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -146,6 +146,9 @@ pub(crate) enum NotifyError { // A response indicates we will never succeed, such as 400 or 404 #[error("Non-retryable error {0}")] Fatal(StatusCode), + + #[error("neon_local error: {0}")] + NeonLocal(anyhow::Error), } enum 
MaybeSendResult { @@ -278,7 +281,7 @@ impl ComputeHook { async fn do_notify_local( &self, reconfigure_request: &ComputeHookNotifyRequest, - ) -> anyhow::Result<()> { + ) -> Result<(), NotifyError> { // neon_local updates are not safe to call concurrently, use a lock to serialize // all calls to this function let _locked = self.neon_local_lock.lock().await; @@ -320,8 +323,9 @@ impl ComputeHook { if endpoint.tenant_id == *tenant_id && endpoint.status() == EndpointStatus::Running { tracing::info!("Reconfiguring endpoint {}", endpoint_name,); endpoint - .reconfigure(compute_pageservers.clone(), *stripe_size) - .await?; + .reconfigure(compute_pageservers.clone(), *stripe_size, None) + .await + .map_err(NotifyError::NeonLocal)?; } } @@ -510,7 +514,7 @@ impl ComputeHook { } else { self.do_notify_local(&request).await.map_err(|e| { // This path is for testing only, so munge the error into our prod-style error type. - tracing::error!("Local notification hook failed: {e}"); + tracing::error!("neon_local notification hook failed: {e}"); NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR) }) }; diff --git a/storage_controller/src/drain_utils.rs b/storage_controller/src/drain_utils.rs new file mode 100644 index 0000000000..dea1f04649 --- /dev/null +++ b/storage_controller/src/drain_utils.rs @@ -0,0 +1,225 @@ +use std::{ + collections::{BTreeMap, HashMap}, + sync::Arc, +}; + +use pageserver_api::controller_api::NodeSchedulingPolicy; +use utils::{id::NodeId, shard::TenantShardId}; + +use crate::{ + background_node_operations::OperationError, node::Node, scheduler::Scheduler, + tenant_shard::TenantShard, +}; + +pub(crate) struct TenantShardIterator { + tenants_accessor: F, + inspected_all_shards: bool, + last_inspected_shard: Option, +} + +/// A simple iterator which can be used in tandem with [`crate::service::Service`] +/// to iterate over all known tenant shard ids without holding the lock on the +/// service state at all times. 
+impl TenantShardIterator +where + F: Fn(Option) -> Option, +{ + pub(crate) fn new(tenants_accessor: F) -> Self { + Self { + tenants_accessor, + inspected_all_shards: false, + last_inspected_shard: None, + } + } + + /// Returns the next tenant shard id if one exists + pub(crate) fn next(&mut self) -> Option { + if self.inspected_all_shards { + return None; + } + + match (self.tenants_accessor)(self.last_inspected_shard) { + Some(tid) => { + self.last_inspected_shard = Some(tid); + Some(tid) + } + None => { + self.inspected_all_shards = true; + None + } + } + } + + /// Returns true when the end of the iterator is reached and false otherwise + pub(crate) fn finished(&self) -> bool { + self.inspected_all_shards + } +} + +/// Check that the state of the node being drained is as expected: +/// node is present in memory and scheduling policy is set to [`NodeSchedulingPolicy::Draining`] +pub(crate) fn validate_node_state( + node_id: &NodeId, + nodes: Arc>, +) -> Result<(), OperationError> { + let node = nodes.get(node_id).ok_or(OperationError::NodeStateChanged( + format!("node {} was removed", node_id).into(), + ))?; + + let current_policy = node.get_scheduling(); + if !matches!(current_policy, NodeSchedulingPolicy::Draining) { + // TODO(vlad): maybe cancel pending reconciles before erroring out. 
need to think + // about it + return Err(OperationError::NodeStateChanged( + format!("node {} changed state to {:?}", node_id, current_policy).into(), + )); + } + + Ok(()) +} + +/// Struct that houses a few utility methods for draining pageserver nodes +pub(crate) struct TenantShardDrain { + pub(crate) drained_node: NodeId, + pub(crate) tenant_shard_id: TenantShardId, +} + +impl TenantShardDrain { + /// Check if the tenant shard under question is eligible for draining: + /// its primary attachment is on the node being drained + pub(crate) fn tenant_shard_eligible_for_drain( + &self, + tenants: &BTreeMap, + scheduler: &Scheduler, + ) -> Option { + let tenant_shard = tenants.get(&self.tenant_shard_id)?; + + if *tenant_shard.intent.get_attached() != Some(self.drained_node) { + return None; + } + + match scheduler.node_preferred(tenant_shard.intent.get_secondary()) { + Some(node) => Some(node), + None => { + tracing::warn!( + tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), + "No eligible secondary while draining {}", self.drained_node + ); + + None + } + } + } + + /// Attempt to reschedule the tenant shard under question to one of its secondary locations + /// Returns an Err when the operation should be aborted and Ok(None) when the tenant shard + /// should be skipped. + pub(crate) fn reschedule_to_secondary<'a>( + &self, + destination: NodeId, + tenants: &'a mut BTreeMap, + scheduler: &mut Scheduler, + nodes: &Arc>, + ) -> Result, OperationError> { + let tenant_shard = match tenants.get_mut(&self.tenant_shard_id) { + Some(some) => some, + None => { + // Tenant shard was removed in the meantime.
+ // Skip to the next one, but don't fail the overall operation + return Ok(None); + } + }; + + if !nodes.contains_key(&destination) { + return Err(OperationError::NodeStateChanged( + format!("node {} was removed", destination).into(), + )); + } + + if !tenant_shard.intent.get_secondary().contains(&destination) { + tracing::info!( + tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), + "Secondary moved away from {destination} during drain" + ); + + return Ok(None); + } + + match tenant_shard.reschedule_to_secondary(Some(destination), scheduler) { + Err(e) => { + tracing::warn!( + tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), + "Scheduling error when draining pageserver {} : {}", self.drained_node, e + ); + + Ok(None) + } + Ok(()) => { + let scheduled_to = tenant_shard.intent.get_attached(); + tracing::info!( + tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), + "Rescheduled shard while draining node {}: {} -> {:?}", + self.drained_node, + self.drained_node, + scheduled_to + ); + + Ok(Some(tenant_shard)) + } + } + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use utils::{ + id::TenantId, + shard::{ShardCount, ShardNumber, TenantShardId}, + }; + + use super::TenantShardIterator; + + #[test] + fn test_tenant_shard_iterator() { + let tenant_id = TenantId::generate(); + let shard_count = ShardCount(8); + + let mut tenant_shards = Vec::default(); + for i in 0..shard_count.0 { + tenant_shards.push(( + TenantShardId { + tenant_id, + shard_number: ShardNumber(i), + shard_count, + }, + (), + )) + } + + let tenant_shards = Arc::new(tenant_shards); + + let mut tid_iter = TenantShardIterator::new({ + let tenants = tenant_shards.clone(); + move |last_inspected_shard: Option| { + let entry = match last_inspected_shard { + Some(skip_past) => { + let mut cursor = tenants.iter().skip_while(|(tid, _)| *tid != skip_past); + cursor.nth(1) + } + None => 
tenants.first(), + }; + + entry.map(|(tid, _)| tid).copied() + } + }); + + let mut iterated_over = Vec::default(); + while let Some(tid) = tid_iter.next() { + iterated_over.push((tid, ())); + } + + assert_eq!(iterated_over, *tenant_shards); + } +} diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs index 14cda0a289..c0e27bafdb 100644 --- a/storage_controller/src/heartbeater.rs +++ b/storage_controller/src/heartbeater.rs @@ -22,7 +22,8 @@ struct HeartbeaterTask { state: HashMap, - max_unavailable_interval: Duration, + max_offline_interval: Duration, + max_warming_up_interval: Duration, jwt_token: Option, } @@ -31,7 +32,9 @@ pub(crate) enum PageserverState { Available { last_seen_at: Instant, utilization: PageserverUtilization, - new: bool, + }, + WarmingUp { + started_at: Instant, }, Offline, } @@ -57,12 +60,18 @@ pub(crate) struct Heartbeater { impl Heartbeater { pub(crate) fn new( jwt_token: Option, - max_unavailable_interval: Duration, + max_offline_interval: Duration, + max_warming_up_interval: Duration, cancel: CancellationToken, ) -> Self { let (sender, receiver) = tokio::sync::mpsc::unbounded_channel::(); - let mut heartbeater = - HeartbeaterTask::new(receiver, jwt_token, max_unavailable_interval, cancel); + let mut heartbeater = HeartbeaterTask::new( + receiver, + jwt_token, + max_offline_interval, + max_warming_up_interval, + cancel, + ); tokio::task::spawn(async move { heartbeater.run().await }); Self { sender } @@ -78,9 +87,12 @@ impl Heartbeater { pageservers, reply: sender, }) - .unwrap(); + .map_err(|_| HeartbeaterError::Cancel)?; - receiver.await.unwrap() + receiver + .await + .map_err(|_| HeartbeaterError::Cancel) + .and_then(|x| x) } } @@ -88,14 +100,16 @@ impl HeartbeaterTask { fn new( receiver: tokio::sync::mpsc::UnboundedReceiver, jwt_token: Option, - max_unavailable_interval: Duration, + max_offline_interval: Duration, + max_warming_up_interval: Duration, cancel: CancellationToken, ) -> Self { Self { 
receiver, cancel, state: HashMap::new(), - max_unavailable_interval, + max_offline_interval, + max_warming_up_interval, jwt_token, } } @@ -128,16 +142,15 @@ impl HeartbeaterTask { heartbeat_futs.push({ let jwt_token = self.jwt_token.clone(); let cancel = self.cancel.clone(); - let new_node = !self.state.contains_key(node_id); // Clone the node and mark it as available such that the request // goes through to the pageserver even when the node is marked offline. // This doesn't impact the availability observed by [`crate::service::Service`]. - let mut node = node.clone(); - node.set_availability(NodeAvailability::Active(UtilizationScore::worst())); + let mut node_clone = node.clone(); + node_clone.set_availability(NodeAvailability::Active(UtilizationScore::worst())); async move { - let response = node + let response = node_clone .with_client_retries( |client| async move { client.get_utilization().await }, &jwt_token, @@ -161,7 +174,12 @@ impl HeartbeaterTask { PageserverState::Available { last_seen_at: Instant::now(), utilization, - new: new_node, + } + } else if let NodeAvailability::WarmingUp(last_seen_at) = + node.get_availability() + { + PageserverState::WarmingUp { + started_at: last_seen_at, } } else { PageserverState::Offline @@ -187,53 +205,67 @@ impl HeartbeaterTask { } } } + + let mut warming_up = 0; + let mut offline = 0; + for state in new_state.values() { + match state { + PageserverState::WarmingUp { .. } => { + warming_up += 1; + } + PageserverState::Offline { .. } => offline += 1, + PageserverState::Available { .. } => {} + } + } + tracing::info!( - "Heartbeat round complete for {} nodes, {} offline", + "Heartbeat round complete for {} nodes, {} warming-up, {} offline", new_state.len(), - new_state - .values() - .filter(|s| match s { - PageserverState::Available { .. 
} => { - false - } - PageserverState::Offline => true, - }) - .count() + warming_up, + offline ); let mut deltas = Vec::new(); let now = Instant::now(); - for (node_id, ps_state) in new_state { + for (node_id, ps_state) in new_state.iter_mut() { use std::collections::hash_map::Entry::*; - let entry = self.state.entry(node_id); + let entry = self.state.entry(*node_id); let mut needs_update = false; match entry { Occupied(ref occ) => match (occ.get(), &ps_state) { (PageserverState::Offline, PageserverState::Offline) => {} (PageserverState::Available { last_seen_at, .. }, PageserverState::Offline) => { - if now - *last_seen_at >= self.max_unavailable_interval { - deltas.push((node_id, ps_state.clone())); + if now - *last_seen_at >= self.max_offline_interval { + deltas.push((*node_id, ps_state.clone())); needs_update = true; } } + (_, PageserverState::WarmingUp { started_at }) => { + if now - *started_at >= self.max_warming_up_interval { + *ps_state = PageserverState::Offline; + } + + deltas.push((*node_id, ps_state.clone())); + needs_update = true; + } _ => { - deltas.push((node_id, ps_state.clone())); + deltas.push((*node_id, ps_state.clone())); needs_update = true; } }, Vacant(_) => { // This is a new node. Don't generate a delta for it. 
- deltas.push((node_id, ps_state.clone())); + deltas.push((*node_id, ps_state.clone())); } } match entry { Occupied(mut occ) if needs_update => { - (*occ.get_mut()) = ps_state; + (*occ.get_mut()) = ps_state.clone(); } Vacant(vac) => { - vac.insert(ps_state); + vac.insert(ps_state.clone()); } _ => {} } diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 3e9951fb9e..7bbd1541cf 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -3,15 +3,20 @@ use crate::metrics::{ METRICS_REGISTRY, }; use crate::reconciler::ReconcileError; -use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT}; +use crate::service::{LeadershipStatus, Service, STARTUP_RECONCILE_TIMEOUT}; use anyhow::Context; use futures::Future; use hyper::header::CONTENT_TYPE; use hyper::{Body, Request, Response}; use hyper::{StatusCode, Uri}; use metrics::{BuildInfo, NeonMetrics}; +use pageserver_api::controller_api::{ + MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse, + MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse, + TenantCreateRequest, +}; use pageserver_api::models::{ - TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest, + TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest, TenantTimeTravelRequest, TimelineCreateRequest, }; use pageserver_api::shard::TenantShardId; @@ -329,6 +334,22 @@ async fn handle_tenant_timeline_delete( .await } +async fn handle_tenant_timeline_detach_ancestor( + service: Arc, + req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + + let res = service + .tenant_timeline_detach_ancestor(tenant_id, timeline_id) + .await?; + + json_response(StatusCode::OK, res) +} + async fn handle_tenant_timeline_passthrough( 
service: Arc, req: Request, @@ -413,7 +434,7 @@ async fn handle_tenant_describe( service: Arc, req: Request, ) -> Result, ApiError> { - check_permissions(&req, Scope::Admin)?; + check_permissions(&req, Scope::Scrubber)?; let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; json_response(StatusCode::OK, service.tenant_describe(tenant_id)?) @@ -455,6 +476,14 @@ async fn handle_node_drop(req: Request) -> Result, ApiError json_response(StatusCode::OK, state.service.node_drop(node_id).await?) } +async fn handle_node_delete(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + let node_id: NodeId = parse_request_param(&req, "node_id")?; + json_response(StatusCode::OK, state.service.node_delete(node_id).await?) +} + async fn handle_node_configure(mut req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; @@ -471,7 +500,7 @@ async fn handle_node_configure(mut req: Request) -> Result, StatusCode::OK, state .service - .node_configure( + .external_node_configure( config_req.node_id, config_req.availability.map(NodeAvailability::from), config_req.scheduling, @@ -491,6 +520,19 @@ async fn handle_node_status(req: Request) -> Result, ApiErr json_response(StatusCode::OK, node_status) } +async fn handle_get_leader(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + let leader = state.service.get_leader().await.map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!( + "Failed to read leader from database: {err}" + )) + })?; + + json_response(StatusCode::OK, leader) +} + async fn handle_node_drain(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; @@ -502,6 +544,17 @@ async fn handle_node_drain(req: Request) -> Result, ApiErro json_response(StatusCode::ACCEPTED, ()) } +async fn handle_cancel_node_drain(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state 
= get_state(&req); + let node_id: NodeId = parse_request_param(&req, "node_id")?; + + state.service.cancel_node_drain(node_id).await?; + + json_response(StatusCode::ACCEPTED, ()) +} + async fn handle_node_fill(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; @@ -513,6 +566,62 @@ async fn handle_node_fill(req: Request) -> Result, ApiError json_response(StatusCode::ACCEPTED, ()) } +async fn handle_cancel_node_fill(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + let node_id: NodeId = parse_request_param(&req, "node_id")?; + + state.service.cancel_node_fill(node_id).await?; + + json_response(StatusCode::ACCEPTED, ()) +} + +async fn handle_metadata_health_update(mut req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Scrubber)?; + + let update_req = json_request::(&mut req).await?; + let state = get_state(&req); + + state.service.metadata_health_update(update_req).await?; + + json_response(StatusCode::OK, MetadataHealthUpdateResponse {}) +} + +async fn handle_metadata_health_list_unhealthy( + req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + let unhealthy_tenant_shards = state.service.metadata_health_list_unhealthy().await?; + + json_response( + StatusCode::OK, + MetadataHealthListUnhealthyResponse { + unhealthy_tenant_shards, + }, + ) +} + +async fn handle_metadata_health_list_outdated( + mut req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let list_outdated_req = json_request::(&mut req).await?; + let state = get_state(&req); + let health_records = state + .service + .metadata_health_list_outdated(list_outdated_req.not_scrubbed_for) + .await?; + + json_response( + StatusCode::OK, + MetadataHealthListOutdatedResponse { health_records }, + ) +} + async fn handle_tenant_shard_split( service: Arc, mut req: Request, @@ -560,6 +669,13 @@ async fn 
handle_tenant_update_policy(mut req: Request) -> Result) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + json_response(StatusCode::OK, state.service.step_down().await) +} + async fn handle_tenant_drop(req: Request) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; @@ -687,6 +803,47 @@ struct RequestMeta { at: Instant, } +pub fn prologue_leadership_status_check_middleware< + B: hyper::body::HttpBody + Send + Sync + 'static, +>() -> Middleware { + Middleware::pre(move |req| async move { + let state = get_state(&req); + let leadership_status = state.service.get_leadership_status(); + + enum AllowedRoutes<'a> { + All, + Some(Vec<&'a str>), + } + + let allowed_routes = match leadership_status { + LeadershipStatus::Leader => AllowedRoutes::All, + LeadershipStatus::SteppedDown => { + // TODO: does it make sense to allow /status here? + AllowedRoutes::Some(["/control/v1/step_down", "/status", "/metrics"].to_vec()) + } + LeadershipStatus::Candidate => { + AllowedRoutes::Some(["/ready", "/status", "/metrics"].to_vec()) + } + }; + + let uri = req.uri().to_string(); + match allowed_routes { + AllowedRoutes::All => Ok(req), + AllowedRoutes::Some(allowed) if allowed.contains(&uri.as_str()) => Ok(req), + _ => { + tracing::info!( + "Request {} not allowed due to current leadership state", + req.uri() + ); + + Err(ApiError::ResourceUnavailable( + format!("Current leadership status is {leadership_status}").into(), + )) + } + } + }) +} + fn prologue_metrics_middleware( ) -> Middleware { Middleware::pre(move |req| async move { @@ -773,6 +930,7 @@ pub fn make_router( build_info: BuildInfo, ) -> RouterBuilder { let mut router = endpoint::make_router() + .middleware(prologue_leadership_status_check_middleware()) .middleware(prologue_metrics_middleware()) .middleware(epilogue_metrics_middleware()); if auth.is_some() { @@ -855,6 +1013,9 @@ pub fn 
make_router( .post("/control/v1/node", |r| { named_request_span(r, handle_node_register, RequestName("control_v1_node")) }) + .delete("/control/v1/node/:node_id", |r| { + named_request_span(r, handle_node_delete, RequestName("control_v1_node_delete")) + }) .get("/control/v1/node", |r| { named_request_span(r, handle_node_list, RequestName("control_v1_node")) }) @@ -868,12 +1029,51 @@ pub fn make_router( .get("/control/v1/node/:node_id", |r| { named_request_span(r, handle_node_status, RequestName("control_v1_node_status")) }) + .get("/control/v1/leader", |r| { + named_request_span(r, handle_get_leader, RequestName("control_v1_get_leader")) + }) .put("/control/v1/node/:node_id/drain", |r| { named_request_span(r, handle_node_drain, RequestName("control_v1_node_drain")) }) + .delete("/control/v1/node/:node_id/drain", |r| { + named_request_span( + r, + handle_cancel_node_drain, + RequestName("control_v1_cancel_node_drain"), + ) + }) .put("/control/v1/node/:node_id/fill", |r| { named_request_span(r, handle_node_fill, RequestName("control_v1_node_fill")) }) + .delete("/control/v1/node/:node_id/fill", |r| { + named_request_span( + r, + handle_cancel_node_fill, + RequestName("control_v1_cancel_node_fill"), + ) + }) + // Metadata health operations + .post("/control/v1/metadata_health/update", |r| { + named_request_span( + r, + handle_metadata_health_update, + RequestName("control_v1_metadata_health_update"), + ) + }) + .get("/control/v1/metadata_health/unhealthy", |r| { + named_request_span( + r, + handle_metadata_health_list_unhealthy, + RequestName("control_v1_metadata_health_list_unhealthy"), + ) + }) + .post("/control/v1/metadata_health/outdated", |r| { + named_request_span( + r, + handle_metadata_health_list_outdated, + RequestName("control_v1_metadata_health_list_outdated"), + ) + }) // TODO(vlad): endpoint for cancelling drain and fill // Tenant Shard operations .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| { @@ -907,6 +1107,9 @@ pub fn make_router( 
RequestName("control_v1_tenant_policy"), ) }) + .put("/control/v1/step_down", |r| { + named_request_span(r, handle_step_down, RequestName("control_v1_step_down")) + }) // Tenant operations // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity. @@ -958,6 +1161,16 @@ pub fn make_router( RequestName("v1_tenant_timeline"), ) }) + .put( + "/v1/tenant/:tenant_id/timeline/:timeline_id/detach_ancestor", + |r| { + tenant_service_handler( + r, + handle_tenant_timeline_detach_ancestor, + RequestName("v1_tenant_timeline_detach_ancestor"), + ) + }, + ) // Tenant detail GET passthrough to shard zero: .get("/v1/tenant/:tenant_id", |r| { tenant_service_handler( diff --git a/storage_controller/src/id_lock_map.rs b/storage_controller/src/id_lock_map.rs index dff793289f..fcd3eb57e2 100644 --- a/storage_controller/src/id_lock_map.rs +++ b/storage_controller/src/id_lock_map.rs @@ -8,14 +8,15 @@ use crate::service::RECONCILE_TIMEOUT; const LOCK_TIMEOUT_ALERT_THRESHOLD: Duration = RECONCILE_TIMEOUT; -/// A wrapper around `OwnedRwLockWriteGuard` that when dropped changes the -/// current holding operation in lock. 
-pub struct WrappedWriteGuard { +/// A wrapper around `OwnedRwLockWriteGuard` used for tracking the +/// operation that holds the lock, and print a warning if it exceeds +/// the LOCK_TIMEOUT_ALERT_THRESHOLD time +pub struct TracingExclusiveGuard { guard: tokio::sync::OwnedRwLockWriteGuard>, start: Instant, } -impl WrappedWriteGuard { +impl TracingExclusiveGuard { pub fn new(guard: tokio::sync::OwnedRwLockWriteGuard>) -> Self { Self { guard, @@ -24,12 +25,12 @@ impl WrappedWriteGuard { } } -impl Drop for WrappedWriteGuard { +impl Drop for TracingExclusiveGuard { fn drop(&mut self) { let duration = self.start.elapsed(); if duration > LOCK_TIMEOUT_ALERT_THRESHOLD { tracing::warn!( - "Lock on {} was held for {:?}", + "Exclusive lock by {} was held for {:?}", self.guard.as_ref().unwrap(), duration ); @@ -38,6 +39,38 @@ impl Drop for WrappedWriteGuard { } } +// A wrapper around `OwnedRwLockReadGuard` used for tracking the +/// operation that holds the lock, and print a warning if it exceeds +/// the LOCK_TIMEOUT_ALERT_THRESHOLD time +pub struct TracingSharedGuard { + _guard: tokio::sync::OwnedRwLockReadGuard>, + operation: T, + start: Instant, +} + +impl TracingSharedGuard { + pub fn new(guard: tokio::sync::OwnedRwLockReadGuard>, operation: T) -> Self { + Self { + _guard: guard, + operation, + start: Instant::now(), + } + } +} + +impl Drop for TracingSharedGuard { + fn drop(&mut self) { + let duration = self.start.elapsed(); + if duration > LOCK_TIMEOUT_ALERT_THRESHOLD { + tracing::warn!( + "Shared lock by {} was held for {:?}", + self.operation, + duration + ); + } + } +} + /// A map of locks covering some arbitrary identifiers. Useful if you have a collection of objects but don't /// want to embed a lock in each one, or if your locking granularity is different to your object granularity. 
/// For example, used in the storage controller where the objects are tenant shards, but sometimes locking @@ -58,21 +91,22 @@ where pub(crate) fn shared( &self, key: T, - ) -> impl std::future::Future>> { + operation: I, + ) -> impl std::future::Future> { let mut locked = self.entities.lock().unwrap(); - let entry = locked.entry(key).or_default(); - entry.clone().read_owned() + let entry = locked.entry(key).or_default().clone(); + async move { TracingSharedGuard::new(entry.read_owned().await, operation) } } pub(crate) fn exclusive( &self, key: T, operation: I, - ) -> impl std::future::Future> { + ) -> impl std::future::Future> { let mut locked = self.entities.lock().unwrap(); let entry = locked.entry(key).or_default().clone(); async move { - let mut guard = WrappedWriteGuard::new(entry.clone().write_owned().await); + let mut guard = TracingExclusiveGuard::new(entry.write_owned().await); *guard.guard = Some(operation); guard } @@ -99,12 +133,12 @@ where pub async fn trace_exclusive_lock< T: Clone + Display + Eq + PartialEq + std::hash::Hash, - I: Display + Clone, + I: Clone + Display, >( op_locks: &IdLockMap, key: T, operation: I, -) -> WrappedWriteGuard { +) -> TracingExclusiveGuard { let start = Instant::now(); let guard = op_locks.exclusive(key.clone(), operation.clone()).await; @@ -123,14 +157,14 @@ pub async fn trace_exclusive_lock< pub async fn trace_shared_lock< T: Clone + Display + Eq + PartialEq + std::hash::Hash, - I: Display, + I: Clone + Display, >( op_locks: &IdLockMap, key: T, operation: I, -) -> tokio::sync::OwnedRwLockReadGuard> { +) -> TracingSharedGuard { let start = Instant::now(); - let guard = op_locks.shared(key.clone()).await; + let guard = op_locks.shared(key.clone(), operation.clone()).await; let duration = start.elapsed(); if duration > LOCK_TIMEOUT_ALERT_THRESHOLD { @@ -159,11 +193,11 @@ mod tests { async fn multiple_shared_locks() { let id_lock_map: IdLockMap = IdLockMap::default(); - let shared_lock_1 = id_lock_map.shared(1).await; - 
let shared_lock_2 = id_lock_map.shared(1).await; + let shared_lock_1 = id_lock_map.shared(1, Operations::Op1).await; + let shared_lock_2 = id_lock_map.shared(1, Operations::Op2).await; - assert!(shared_lock_1.is_none()); - assert!(shared_lock_2.is_none()); + assert_eq!(shared_lock_1.operation, Operations::Op1); + assert_eq!(shared_lock_2.operation, Operations::Op2); } #[tokio::test] @@ -183,7 +217,7 @@ mod tests { assert!(_ex_lock_2.is_err()); } - let shared_lock_1 = id_lock_map.shared(resource_id).await; - assert!(shared_lock_1.is_none()); + let shared_lock_1 = id_lock_map.shared(resource_id, Operations::Op1).await; + assert_eq!(shared_lock_1.operation, Operations::Op1); } } diff --git a/storage_controller/src/leadership.rs b/storage_controller/src/leadership.rs new file mode 100644 index 0000000000..5fae8991ec --- /dev/null +++ b/storage_controller/src/leadership.rs @@ -0,0 +1,135 @@ +use std::sync::Arc; + +use hyper::Uri; +use tokio_util::sync::CancellationToken; + +use crate::{ + peer_client::{GlobalObservedState, PeerClient}, + persistence::{ControllerPersistence, DatabaseError, DatabaseResult, Persistence}, + service::Config, +}; + +/// Helper for storage controller leadership acquisition +pub(crate) struct Leadership { + persistence: Arc, + config: Config, + cancel: CancellationToken, +} + +#[derive(thiserror::Error, Debug)] +pub(crate) enum Error { + #[error(transparent)] + Database(#[from] DatabaseError), +} + +pub(crate) type Result = std::result::Result; + +impl Leadership { + pub(crate) fn new( + persistence: Arc, + config: Config, + cancel: CancellationToken, + ) -> Self { + Self { + persistence, + config, + cancel, + } + } + + /// Find the current leader in the database and request it to step down if required. + /// Should be called early on in within the start-up sequence. 
+ /// + /// Returns a tuple of two optionals: the current leader and its observed state + pub(crate) async fn step_down_current_leader( + &self, + ) -> Result<(Option, Option)> { + let leader = self.current_leader().await?; + let leader_step_down_state = if let Some(ref leader) = leader { + if self.config.start_as_candidate { + self.request_step_down(leader).await + } else { + None + } + } else { + tracing::info!("No leader found to request step down from. Will build observed state."); + None + }; + + Ok((leader, leader_step_down_state)) + } + + /// Mark the current storage controller instance as the leader in the database + pub(crate) async fn become_leader( + &self, + current_leader: Option, + ) -> Result<()> { + if let Some(address_for_peers) = &self.config.address_for_peers { + // TODO: `address-for-peers` can become a mandatory cli arg + // after we update the k8s setup + let proposed_leader = ControllerPersistence { + address: address_for_peers.to_string(), + started_at: chrono::Utc::now(), + }; + + self.persistence + .update_leader(current_leader, proposed_leader) + .await + .map_err(Error::Database) + } else { + tracing::info!("No address-for-peers provided. Skipping leader persistence."); + Ok(()) + } + } + + async fn current_leader(&self) -> DatabaseResult> { + let res = self.persistence.get_leader().await; + if let Err(DatabaseError::Query(diesel::result::Error::DatabaseError(_kind, ref err))) = res + { + const REL_NOT_FOUND_MSG: &str = "relation \"controllers\" does not exist"; + if err.message().trim() == REL_NOT_FOUND_MSG { + // Special case: if this is a brand new storage controller, migrations will not + // have run at this point yet, and, hence, the controllers table does not exist. + // Detect this case via the error string (diesel doesn't type it) and allow it. + tracing::info!("Detected first storage controller start-up. 
Allowing missing controllers table ..."); + return Ok(None); + } + } + + res + } + + /// Request step down from the currently registered leader in the database + /// + /// If such an entry is persisted, the success path returns the observed + /// state and details of the leader. Otherwise, None is returned indicating + /// there is no leader currently. + async fn request_step_down( + &self, + leader: &ControllerPersistence, + ) -> Option { + tracing::info!("Sending step down request to {leader:?}"); + + let client = PeerClient::new( + Uri::try_from(leader.address.as_str()).expect("Failed to build leader URI"), + self.config.peer_jwt_token.clone(), + ); + let state = client.step_down(&self.cancel).await; + match state { + Ok(state) => Some(state), + Err(err) => { + // TODO: Make leaders periodically update a timestamp field in the + // database and, if the leader is not reachable from the current instance, + // but inferred as alive from the timestamp, abort start-up. This avoids + // a potential scenario in which we have two controllers acting as leaders. 
+ tracing::error!( + "Leader ({}) did not respond to step-down request: {}", + leader.address, + err + ); + + None + } + } + } +} diff --git a/storage_controller/src/lib.rs b/storage_controller/src/lib.rs index 8caf638904..60e613bb5c 100644 --- a/storage_controller/src/lib.rs +++ b/storage_controller/src/lib.rs @@ -4,12 +4,15 @@ use utils::seqwait::MonotonicCounter; mod auth; mod background_node_operations; mod compute_hook; +mod drain_utils; mod heartbeater; pub mod http; mod id_lock_map; +mod leadership; pub mod metrics; mod node; mod pageserver_client; +mod peer_client; pub mod persistence; mod reconciler; mod scheduler; diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index f1eb0b30fc..e3f29b84e7 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -1,19 +1,22 @@ use anyhow::{anyhow, Context}; -use camino::Utf8PathBuf; use clap::Parser; -use diesel::Connection; +use hyper::Uri; use metrics::launch_timestamp::LaunchTimestamp; use metrics::BuildInfo; use std::path::PathBuf; use std::sync::Arc; +use std::time::Duration; use storage_controller::http::make_router; use storage_controller::metrics::preinitialize_metrics; use storage_controller::persistence::Persistence; +use storage_controller::service::chaos_injector::ChaosInjector; use storage_controller::service::{ - Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, + Config, Service, MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT, + RECONCILER_CONCURRENCY_DEFAULT, }; use tokio::signal::unix::SignalKind; use tokio_util::sync::CancellationToken; +use tracing::Instrument; use utils::auth::{JwtAuth, SwappableJwtAuth}; use utils::logging::{self, LogFormat}; @@ -23,9 +26,6 @@ use utils::{project_build_tag, project_git_version, tcp_listener}; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); -use diesel_migrations::{embed_migrations, EmbeddedMigrations}; -pub const MIGRATIONS: EmbeddedMigrations = 
embed_migrations!("./migrations"); - #[derive(Parser)] #[command(author, version, about, long_about = None)] #[command(arg_required_else_help(true))] @@ -47,14 +47,13 @@ struct Cli { #[arg(long)] control_plane_jwt_token: Option, + #[arg(long)] + peer_jwt_token: Option, + /// URL to control plane compute notification endpoint #[arg(long)] compute_hook_url: Option, - /// Path to the .json file to store state (will be created if it doesn't exist) - #[arg(short, long)] - path: Option, - /// URL to connect to postgres, like postgresql://localhost:1234/storage_controller #[arg(long)] database_url: Option, @@ -65,7 +64,12 @@ struct Cli { /// Grace period before marking unresponsive pageserver offline #[arg(long)] - max_unavailable_interval: Option, + max_offline_interval: Option, + + /// More tolerant grace period before marking unresponsive pagserver offline used + /// around pageserver restarts + #[arg(long)] + max_warming_up_interval: Option, /// Size threshold for automatically splitting shards (disabled by default) #[arg(long)] @@ -79,11 +83,27 @@ struct Cli { #[arg(long, default_value = "5s")] db_connect_timeout: humantime::Duration, + #[arg(long, default_value = "false")] + start_as_candidate: bool, + + // TODO: make this mandatory once the helm chart gets updated + #[arg(long)] + address_for_peers: Option, + /// `neon_local` sets this to the path of the neon_local repo dir. /// Only relevant for testing. 
// TODO: make `cfg(feature = "testing")` #[arg(long)] neon_local_repo_dir: Option, + + /// Chaos testing + #[arg(long)] + chaos_interval: Option, + + // Maximum acceptable lag for the secondary location while draining + // a pageserver + #[arg(long)] + max_secondary_lag_bytes: Option, } enum StrictMode { @@ -109,28 +129,28 @@ struct Secrets { public_key: Option, jwt_token: Option, control_plane_jwt_token: Option, + peer_jwt_token: Option, } impl Secrets { const DATABASE_URL_ENV: &'static str = "DATABASE_URL"; const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN"; const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN"; + const PEER_JWT_TOKEN_ENV: &'static str = "PEER_JWT_TOKEN"; const PUBLIC_KEY_ENV: &'static str = "PUBLIC_KEY"; /// Load secrets from, in order of preference: /// - CLI args if database URL is provided on the CLI /// - Environment variables if DATABASE_URL is set. - /// - AWS Secrets Manager secrets async fn load(args: &Cli) -> anyhow::Result { - let Some(database_url) = - Self::load_secret(&args.database_url, Self::DATABASE_URL_ENV).await + let Some(database_url) = Self::load_secret(&args.database_url, Self::DATABASE_URL_ENV) else { anyhow::bail!( "Database URL is not set (set `--database-url`, or `DATABASE_URL` environment)" ) }; - let public_key = match Self::load_secret(&args.public_key, Self::PUBLIC_KEY_ENV).await { + let public_key = match Self::load_secret(&args.public_key, Self::PUBLIC_KEY_ENV) { Some(v) => Some(JwtAuth::from_key(v).context("Loading public key")?), None => None, }; @@ -138,18 +158,18 @@ impl Secrets { let this = Self { database_url, public_key, - jwt_token: Self::load_secret(&args.jwt_token, Self::PAGESERVER_JWT_TOKEN_ENV).await, + jwt_token: Self::load_secret(&args.jwt_token, Self::PAGESERVER_JWT_TOKEN_ENV), control_plane_jwt_token: Self::load_secret( &args.control_plane_jwt_token, Self::CONTROL_PLANE_JWT_TOKEN_ENV, - ) - .await, + ), + peer_jwt_token: 
Self::load_secret(&args.peer_jwt_token, Self::PEER_JWT_TOKEN_ENV), }; Ok(this) } - async fn load_secret(cli: &Option, env_name: &str) -> Option { + fn load_secret(cli: &Option, env_name: &str) -> Option { if let Some(v) = cli { Some(v.clone()) } else if let Ok(v) = std::env::var(env_name) { @@ -160,29 +180,27 @@ impl Secrets { } } -/// Execute the diesel migrations that are built into this binary -async fn migration_run(database_url: &str) -> anyhow::Result<()> { - use diesel::PgConnection; - use diesel_migrations::{HarnessWithOutput, MigrationHarness}; - let mut conn = PgConnection::establish(database_url)?; - - HarnessWithOutput::write_to_stdout(&mut conn) - .run_pending_migrations(MIGRATIONS) - .map(|_| ()) - .map_err(|e| anyhow::anyhow!(e))?; - - Ok(()) -} - fn main() -> anyhow::Result<()> { - let default_panic = std::panic::take_hook(); - std::panic::set_hook(Box::new(move |info| { - default_panic(info); - std::process::exit(1); - })); + logging::init( + LogFormat::Plain, + logging::TracingErrorLayerEnablement::Disabled, + logging::Output::Stdout, + )?; + + // log using tracing so we don't get confused output by default hook writing to stderr + utils::logging::replace_panic_hook_with_tracing_panic_hook().forget(); let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); + let hook = std::panic::take_hook(); + std::panic::set_hook(Box::new(move |info| { + // let sentry send a message (and flush) + // and trace the error + hook(info); + + std::process::exit(1); + })); + tokio::runtime::Builder::new_current_thread() // We use spawn_blocking for database operations, so require approximately // as many blocking threads as we will open database connections. 
@@ -196,21 +214,14 @@ fn main() -> anyhow::Result<()> { async fn async_main() -> anyhow::Result<()> { let launch_ts = Box::leak(Box::new(LaunchTimestamp::generate())); - logging::init( - LogFormat::Plain, - logging::TracingErrorLayerEnablement::Disabled, - logging::Output::Stdout, - )?; - preinitialize_metrics(); let args = Cli::parse(); tracing::info!( - "version: {}, launch_timestamp: {}, build_tag {}, state at {}, listening on {}", + "version: {}, launch_timestamp: {}, build_tag {}, listening on {}", GIT_VERSION, launch_ts.to_string(), BUILD_TAG, - args.path.as_ref().unwrap_or(&Utf8PathBuf::from("")), args.listen ); @@ -258,27 +269,31 @@ async fn async_main() -> anyhow::Result<()> { let config = Config { jwt_token: secrets.jwt_token, control_plane_jwt_token: secrets.control_plane_jwt_token, + peer_jwt_token: secrets.peer_jwt_token, compute_hook_url: args.compute_hook_url, - max_unavailable_interval: args - .max_unavailable_interval + max_offline_interval: args + .max_offline_interval .map(humantime::Duration::into) - .unwrap_or(MAX_UNAVAILABLE_INTERVAL_DEFAULT), + .unwrap_or(MAX_OFFLINE_INTERVAL_DEFAULT), + max_warming_up_interval: args + .max_warming_up_interval + .map(humantime::Duration::into) + .unwrap_or(MAX_WARMING_UP_INTERVAL_DEFAULT), reconciler_concurrency: args .reconciler_concurrency .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT), split_threshold: args.split_threshold, neon_local_repo_dir: args.neon_local_repo_dir, + max_secondary_lag_bytes: args.max_secondary_lag_bytes, + address_for_peers: args.address_for_peers, + start_as_candidate: args.start_as_candidate, + http_service_port: args.listen.port() as i32, }; - // After loading secrets & config, but before starting anything else, apply database migrations + // Validate that we can connect to the database Persistence::await_connection(&secrets.database_url, args.db_connect_timeout.into()).await?; - migration_run(&secrets.database_url) - .await - .context("Running database migrations")?; - - let json_path 
= args.path; - let persistence = Arc::new(Persistence::new(secrets.database_url, json_path.clone())); + let persistence = Arc::new(Persistence::new(secrets.database_url)); let service = Service::spawn(config, persistence.clone()).await?; @@ -305,6 +320,22 @@ async fn async_main() -> anyhow::Result<()> { tracing::info!("Serving on {0}", args.listen); let server_task = tokio::task::spawn(server); + let chaos_task = args.chaos_interval.map(|interval| { + let service = service.clone(); + let cancel = CancellationToken::new(); + let cancel_bg = cancel.clone(); + ( + tokio::task::spawn( + async move { + let mut chaos_injector = ChaosInjector::new(service, interval.into()); + chaos_injector.run(cancel_bg).await + } + .instrument(tracing::info_span!("chaos_injector")), + ), + cancel, + ) + }); + // Wait until we receive a signal let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?; let mut sigquit = tokio::signal::unix::signal(SignalKind::quit())?; @@ -316,21 +347,28 @@ async fn async_main() -> anyhow::Result<()> { } tracing::info!("Terminating on signal"); - if json_path.is_some() { - // Write out a JSON dump on shutdown: this is used in compat tests to avoid passing - // full postgres dumps around. - if let Err(e) = persistence.write_tenants_json().await { - tracing::error!("Failed to write JSON on shutdown: {e}") + // Stop HTTP server first, so that we don't have to service requests + // while shutting down Service. + server_shutdown.cancel(); + match tokio::time::timeout(Duration::from_secs(5), server_task).await { + Ok(Ok(_)) => { + tracing::info!("Joined HTTP server task"); + } + Ok(Err(e)) => { + tracing::error!("Error joining HTTP server task: {e}") + } + Err(_) => { + tracing::warn!("Timed out joining HTTP server task"); + // We will fall through and shut down the service anyway, any request handlers + // in flight will experience cancellation & their clients will see a torn connection. 
} } - // Stop HTTP server first, so that we don't have to service requests - // while shutting down Service - server_shutdown.cancel(); - if let Err(e) = server_task.await { - tracing::error!("Error joining HTTP server task: {e}") + // If we were injecting chaos, stop that so that we're not calling into Service while it shuts down + if let Some((chaos_jh, chaos_cancel)) = chaos_task { + chaos_cancel.cancel(); + chaos_jh.await.ok(); } - tracing::info!("Joined HTTP server task"); service.shutdown().await; tracing::info!("Service shutdown complete"); diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index ac9f22c739..5cfcfb4b1f 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -12,8 +12,12 @@ use measured::{label::LabelValue, metric::histogram, FixedCardinalityLabel, Metr use metrics::NeonMetrics; use once_cell::sync::Lazy; use std::sync::Mutex; +use strum::IntoEnumIterator; -use crate::persistence::{DatabaseError, DatabaseOperation}; +use crate::{ + persistence::{DatabaseError, DatabaseOperation}, + service::LeadershipStatus, +}; pub(crate) static METRICS_REGISTRY: Lazy = Lazy::new(StorageControllerMetrics::default); @@ -81,6 +85,8 @@ pub(crate) struct StorageControllerMetricGroup { #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] pub(crate) storage_controller_database_query_latency: measured::HistogramVec, + + pub(crate) storage_controller_leadership_status: measured::GaugeVec, } impl StorageControllerMetrics { @@ -156,6 +162,12 @@ pub(crate) struct DatabaseQueryLatencyLabelGroup { pub(crate) operation: DatabaseOperation, } +#[derive(measured::LabelGroup)] +#[label(set = LeadershipStatusGroupSet)] +pub(crate) struct LeadershipStatusGroup { + pub(crate) status: LeadershipStatus, +} + #[derive(FixedCardinalityLabel, Clone, Copy)] pub(crate) enum ReconcileOutcome { #[label(rename = "ok")] @@ -218,6 +230,7 @@ pub(crate) enum DatabaseErrorLabel { Connection, ConnectionPool, 
Logical, + Migration, } impl DatabaseError { @@ -227,6 +240,22 @@ impl DatabaseError { Self::Connection(_) => DatabaseErrorLabel::Connection, Self::ConnectionPool(_) => DatabaseErrorLabel::ConnectionPool, Self::Logical(_) => DatabaseErrorLabel::Logical, + Self::Migration(_) => DatabaseErrorLabel::Migration, + } + } +} + +/// Update the leadership status metric gauges to reflect the requested status +pub(crate) fn update_leadership_status(status: LeadershipStatus) { + let status_metric = &METRICS_REGISTRY + .metrics_group + .storage_controller_leadership_status; + + for s in LeadershipStatus::iter() { + if s == status { + status_metric.set(LeadershipStatusGroup { status: s }, 1); + } else { + status_metric.set(LeadershipStatusGroup { status: s }, 0); } } } diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index 4d17dff9fe..ea765ca123 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -3,7 +3,7 @@ use std::{str::FromStr, time::Duration}; use pageserver_api::{ controller_api::{ NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy, - TenantLocateResponseShard, UtilizationScore, + TenantLocateResponseShard, }, shard::TenantShardId, }; @@ -46,6 +46,8 @@ pub(crate) struct Node { /// whether/how they changed it. pub(crate) enum AvailabilityTransition { ToActive, + ToWarmingUpFromActive, + ToWarmingUpFromOffline, ToOffline, Unchanged, } @@ -90,22 +92,34 @@ impl Node { } } + pub(crate) fn get_availability(&self) -> NodeAvailability { + self.availability + } + pub(crate) fn set_availability(&mut self, availability: NodeAvailability) { + use AvailabilityTransition::*; + use NodeAvailability::WarmingUp; + match self.get_availability_transition(availability) { - AvailabilityTransition::ToActive => { + ToActive => { // Give the node a new cancellation token, effectively resetting it to un-cancelled. 
Any // users of previously-cloned copies of the node will still see the old cancellation // state. For example, Reconcilers in flight will have to complete and be spawned // again to realize that the node has become available. self.cancel = CancellationToken::new(); } - AvailabilityTransition::ToOffline => { + ToOffline | ToWarmingUpFromActive => { // Fire the node's cancellation token to cancel any in-flight API requests to it self.cancel.cancel(); } - AvailabilityTransition::Unchanged => {} + Unchanged | ToWarmingUpFromOffline => {} + } + + if let (WarmingUp(crnt), WarmingUp(proposed)) = (self.availability, availability) { + self.availability = WarmingUp(std::cmp::max(crnt, proposed)); + } else { + self.availability = availability; } - self.availability = availability; } /// Without modifying the availability of the node, convert the intended availability @@ -120,16 +134,10 @@ impl Node { match (self.availability, availability) { (Offline, Active(_)) => ToActive, (Active(_), Offline) => ToOffline, - // Consider the case when the storage controller handles the re-attach of a node - // before the heartbeats detect that the node is back online. We still need - // [`Service::node_configure`] to attempt reconciliations for shards with an - // unknown observed location. - // The unsavoury match arm below handles this situation. 
- (Active(lhs), Active(rhs)) - if lhs == UtilizationScore::worst() && rhs < UtilizationScore::worst() => - { - ToActive - } + (Active(_), WarmingUp(_)) => ToWarmingUpFromActive, + (WarmingUp(_), Offline) => ToOffline, + (WarmingUp(_), Active(_)) => ToActive, + (Offline, WarmingUp(_)) => ToWarmingUpFromOffline, _ => Unchanged, } } @@ -147,7 +155,7 @@ impl Node { pub(crate) fn may_schedule(&self) -> MaySchedule { let score = match self.availability { NodeAvailability::Active(score) => score, - NodeAvailability::Offline => return MaySchedule::No, + NodeAvailability::Offline | NodeAvailability::WarmingUp(_) => return MaySchedule::No, }; match self.scheduling { @@ -226,7 +234,7 @@ impl Node { fn is_fatal(e: &mgmt_api::Error) -> bool { use mgmt_api::Error::*; match e { - ReceiveBody(_) | ReceiveErrorBody(_) => false, + SendRequest(_) | ReceiveBody(_) | ReceiveErrorBody(_) => false, ApiError(StatusCode::SERVICE_UNAVAILABLE, _) | ApiError(StatusCode::GATEWAY_TIMEOUT, _) | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false, diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index 769aba80ca..8d64201cd9 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -1,8 +1,9 @@ use pageserver_api::{ models::{ - LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress, - TenantScanRemoteStorageResponse, TenantShardSplitRequest, TenantShardSplitResponse, - TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, TopTenantShardsResponse, + detach_ancestor::AncestorDetached, LocationConfig, LocationConfigListResponse, + PageserverUtilization, SecondaryProgress, TenantScanRemoteStorageResponse, + TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo, + TopTenantShardsRequest, TopTenantShardsResponse, }, shard::TenantShardId, }; @@ -226,6 +227,21 @@ impl PageserverClient { ) } + pub(crate) async fn 
timeline_detach_ancestor( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + ) -> Result { + measured_request!( + "timeline_detach_ancestor", + crate::metrics::Method::Put, + &self.node_id_label, + self.inner + .timeline_detach_ancestor(tenant_shard_id, timeline_id) + .await + ) + } + pub(crate) async fn get_utilization(&self) -> Result { measured_request!( "utilization", diff --git a/storage_controller/src/peer_client.rs b/storage_controller/src/peer_client.rs new file mode 100644 index 0000000000..3f8520fe55 --- /dev/null +++ b/storage_controller/src/peer_client.rs @@ -0,0 +1,108 @@ +use crate::tenant_shard::ObservedState; +use pageserver_api::shard::TenantShardId; +use serde::{Deserialize, Serialize}; +use std::{collections::HashMap, time::Duration}; +use tokio_util::sync::CancellationToken; + +use hyper::Uri; +use reqwest::{StatusCode, Url}; +use utils::{backoff, http::error::HttpErrorBody}; + +#[derive(Debug, Clone)] +pub(crate) struct PeerClient { + uri: Uri, + jwt: Option, + client: reqwest::Client, +} + +#[derive(thiserror::Error, Debug)] +pub(crate) enum StorageControllerPeerError { + #[error("failed to deserialize error response with status code {0} at {1}: {2}")] + DeserializationError(StatusCode, Url, reqwest::Error), + #[error("storage controller peer API error ({0}): {1}")] + ApiError(StatusCode, String), + #[error("failed to send HTTP request: {0}")] + SendError(reqwest::Error), + #[error("Cancelled")] + Cancelled, +} + +pub(crate) type Result = std::result::Result; + +pub(crate) trait ResponseErrorMessageExt: Sized { + fn error_from_body(self) -> impl std::future::Future> + Send; +} + +impl ResponseErrorMessageExt for reqwest::Response { + async fn error_from_body(self) -> Result { + let status = self.status(); + if !(status.is_client_error() || status.is_server_error()) { + return Ok(self); + } + + let url = self.url().to_owned(); + Err(match self.json::().await { + Ok(HttpErrorBody { msg }) => 
StorageControllerPeerError::ApiError(status, msg), + Err(err) => StorageControllerPeerError::DeserializationError(status, url, err), + }) + } +} + +#[derive(Serialize, Deserialize, Debug, Default)] +pub(crate) struct GlobalObservedState(pub(crate) HashMap); + +impl PeerClient { + pub(crate) fn new(uri: Uri, jwt: Option) -> Self { + Self { + uri, + jwt, + client: reqwest::Client::new(), + } + } + + async fn request_step_down(&self) -> Result { + let step_down_path = format!("{}control/v1/step_down", self.uri); + let req = self.client.put(step_down_path); + let req = if let Some(jwt) = &self.jwt { + req.header(reqwest::header::AUTHORIZATION, format!("Bearer {jwt}")) + } else { + req + }; + + let req = req.timeout(Duration::from_secs(2)); + + let res = req + .send() + .await + .map_err(StorageControllerPeerError::SendError)?; + let response = res.error_from_body().await?; + + let status = response.status(); + let url = response.url().to_owned(); + + response + .json() + .await + .map_err(|err| StorageControllerPeerError::DeserializationError(status, url, err)) + } + + /// Request the peer to step down and return its current observed state + /// All errors are retried with exponential backoff for a maximum of 4 attempts. + /// Assuming all retries are performed, the function times out after roughly 4 seconds. 
+ pub(crate) async fn step_down( + &self, + cancel: &CancellationToken, + ) -> Result { + backoff::retry( + || self.request_step_down(), + |_e| false, + 2, + 4, + "Send step down request", + cancel, + ) + .await + .ok_or_else(|| StorageControllerPeerError::Cancelled) + .and_then(|x| x) + } +} diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 47caf7ae81..16df19026c 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -5,11 +5,10 @@ use std::time::Duration; use std::time::Instant; use self::split_state::SplitState; -use camino::Utf8Path; -use camino::Utf8PathBuf; use diesel::pg::PgConnection; use diesel::prelude::*; use diesel::Connection; +use pageserver_api::controller_api::MetadataHealthRecord; use pageserver_api::controller_api::ShardSchedulingPolicy; use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy}; use pageserver_api::models::TenantConfig; @@ -26,6 +25,9 @@ use crate::metrics::{ }; use crate::node::Node; +use diesel_migrations::{embed_migrations, EmbeddedMigrations}; +const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations"); + /// ## What do we store? /// /// The storage controller service does not store most of its state durably. @@ -55,11 +57,6 @@ use crate::node::Node; /// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline. pub struct Persistence { connection_pool: diesel::r2d2::Pool>, - - // In test environments, we support loading+saving a JSON file. This is temporary, for the benefit of - // test_compatibility.py, so that we don't have to commit to making the database contents fully backward/forward - // compatible just yet. 
- json_path: Option, } /// Legacy format, for use in JSON compat objects in test environment @@ -78,6 +75,8 @@ pub(crate) enum DatabaseError { ConnectionPool(#[from] r2d2::Error), #[error("Logical error: {0}")] Logical(String), + #[error("Migration error: {0}")] + Migration(String), } #[derive(measured::FixedCardinalityLabel, Copy, Clone)] @@ -97,6 +96,12 @@ pub(crate) enum DatabaseOperation { UpdateTenantShard, DeleteTenant, UpdateTenantConfig, + UpdateMetadataHealth, + ListMetadataHealth, + ListMetadataHealthUnhealthy, + ListMetadataHealthOutdated, + GetLeader, + UpdateLeader, } #[must_use] @@ -124,7 +129,7 @@ impl Persistence { const IDLE_CONNECTION_TIMEOUT: Duration = Duration::from_secs(10); const MAX_CONNECTION_LIFETIME: Duration = Duration::from_secs(60); - pub fn new(database_url: String, json_path: Option) -> Self { + pub fn new(database_url: String) -> Self { let manager = diesel::r2d2::ConnectionManager::::new(database_url); // We will use a connection pool: this is primarily to _limit_ our connection count, rather than to optimize time @@ -139,10 +144,7 @@ impl Persistence { .build(manager) .expect("Could not build connection pool"); - Self { - connection_pool, - json_path, - } + Self { connection_pool } } /// A helper for use during startup, where we would like to tolerate concurrent restarts of the @@ -170,6 +172,19 @@ impl Persistence { } } + /// Execute the diesel migrations that are built into this binary + pub(crate) async fn migration_run(&self) -> DatabaseResult<()> { + use diesel_migrations::{HarnessWithOutput, MigrationHarness}; + + self.with_conn(move |conn| -> DatabaseResult<()> { + HarnessWithOutput::write_to_stdout(conn) + .run_pending_migrations(MIGRATIONS) + .map(|_| ()) + .map_err(|e| DatabaseError::Migration(e.to_string())) + }) + .await + } + /// Wraps `with_conn` in order to collect latency and error metrics async fn with_measured_conn(&self, op: DatabaseOperation, func: F) -> DatabaseResult where @@ -302,85 +317,13 @@ impl 
Persistence { /// At startup, load the high level state for shards, such as their config + policy. This will /// be enriched at runtime with state discovered on pageservers. pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult> { - let loaded = self - .with_measured_conn( - DatabaseOperation::ListTenantShards, - move |conn| -> DatabaseResult<_> { - Ok(crate::schema::tenant_shards::table.load::(conn)?) - }, - ) - .await?; - - if loaded.is_empty() { - if let Some(path) = &self.json_path { - if tokio::fs::try_exists(path) - .await - .map_err(|e| DatabaseError::Logical(format!("Error stat'ing JSON file: {e}")))? - { - tracing::info!("Importing from legacy JSON format at {path}"); - return self.list_tenant_shards_json(path).await; - } - } - } - Ok(loaded) - } - - /// Shim for automated compatibility tests: load tenants from a JSON file instead of database - pub(crate) async fn list_tenant_shards_json( - &self, - path: &Utf8Path, - ) -> DatabaseResult> { - let bytes = tokio::fs::read(path) - .await - .map_err(|e| DatabaseError::Logical(format!("Failed to load JSON: {e}")))?; - - let mut decoded = serde_json::from_slice::(&bytes) - .map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?; - for shard in decoded.tenants.values_mut() { - if shard.placement_policy == "\"Single\"" { - // Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165 - shard.placement_policy = "{\"Attached\":0}".to_string(); - } - - if shard.scheduling_policy.is_empty() { - shard.scheduling_policy = - serde_json::to_string(&ShardSchedulingPolicy::default()).unwrap(); - } - } - - let tenants: Vec = decoded.tenants.into_values().collect(); - - // Synchronize database with what is in the JSON file - self.insert_tenant_shards(tenants.clone()).await?; - - Ok(tenants) - } - - /// For use in testing environments, where we dump out JSON on shutdown. 
- pub async fn write_tenants_json(&self) -> anyhow::Result<()> { - let Some(path) = &self.json_path else { - anyhow::bail!("Cannot write JSON if path isn't set (test environment bug)"); - }; - tracing::info!("Writing state to {path}..."); - let tenants = self.list_tenant_shards().await?; - let mut tenants_map = HashMap::new(); - for tsp in tenants { - let tenant_shard_id = TenantShardId { - tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?, - shard_number: ShardNumber(tsp.shard_number as u8), - shard_count: ShardCount::new(tsp.shard_count as u8), - }; - - tenants_map.insert(tenant_shard_id, tsp); - } - let json = serde_json::to_string(&JsonPersistence { - tenants: tenants_map, - })?; - - tokio::fs::write(path, &json).await?; - tracing::info!("Wrote {} bytes to {path}...", json.len()); - - Ok(()) + self.with_measured_conn( + DatabaseOperation::ListTenantShards, + move |conn| -> DatabaseResult<_> { + Ok(crate::schema::tenant_shards::table.load::(conn)?) + }, + ) + .await } /// Tenants must be persisted before we schedule them for the first time. 
This enables us @@ -389,15 +332,32 @@ impl Persistence { &self, shards: Vec, ) -> DatabaseResult<()> { - use crate::schema::tenant_shards::dsl::*; + use crate::schema::metadata_health; + use crate::schema::tenant_shards; + + let now = chrono::Utc::now(); + + let metadata_health_records = shards + .iter() + .map(|t| MetadataHealthPersistence { + tenant_id: t.tenant_id.clone(), + shard_number: t.shard_number, + shard_count: t.shard_count, + healthy: true, + last_scrubbed_at: now, + }) + .collect::>(); + self.with_measured_conn( DatabaseOperation::InsertTenantShards, move |conn| -> DatabaseResult<()> { - for tenant in &shards { - diesel::insert_into(tenant_shards) - .values(tenant) - .execute(conn)?; - } + diesel::insert_into(tenant_shards::table) + .values(&shards) + .execute(conn)?; + + diesel::insert_into(metadata_health::table) + .values(&metadata_health_records) + .execute(conn)?; Ok(()) }, ) @@ -411,10 +371,10 @@ impl Persistence { self.with_measured_conn( DatabaseOperation::DeleteTenant, move |conn| -> DatabaseResult<()> { + // `metadata_health` status (if exists) is also deleted based on the cascade behavior. diesel::delete(tenant_shards) .filter(tenant_id.eq(del_tenant_id.to_string())) .execute(conn)?; - Ok(()) }, ) @@ -542,6 +502,7 @@ impl Persistence { Ok(Generation::new(g as u32)) } + #[allow(non_local_definitions)] /// For use when updating a persistent property of a tenant, such as its config or placement_policy. /// /// Do not use this for settting generation, unless in the special onboarding code path (/location_config) @@ -756,6 +717,157 @@ impl Persistence { ) .await } + + /// Stores all the latest metadata health updates durably. Updates existing entry on conflict. + /// + /// **Correctness:** `metadata_health_updates` should all belong the tenant shards managed by the storage controller. 
+ #[allow(dead_code)] + pub(crate) async fn update_metadata_health_records( + &self, + healthy_records: Vec, + unhealthy_records: Vec, + now: chrono::DateTime, + ) -> DatabaseResult<()> { + use crate::schema::metadata_health::dsl::*; + + self.with_measured_conn( + DatabaseOperation::UpdateMetadataHealth, + move |conn| -> DatabaseResult<_> { + diesel::insert_into(metadata_health) + .values(&healthy_records) + .on_conflict((tenant_id, shard_number, shard_count)) + .do_update() + .set((healthy.eq(true), last_scrubbed_at.eq(now))) + .execute(conn)?; + + diesel::insert_into(metadata_health) + .values(&unhealthy_records) + .on_conflict((tenant_id, shard_number, shard_count)) + .do_update() + .set((healthy.eq(false), last_scrubbed_at.eq(now))) + .execute(conn)?; + Ok(()) + }, + ) + .await + } + + /// Lists all the metadata health records. + #[allow(dead_code)] + pub(crate) async fn list_metadata_health_records( + &self, + ) -> DatabaseResult> { + self.with_measured_conn( + DatabaseOperation::ListMetadataHealth, + move |conn| -> DatabaseResult<_> { + Ok( + crate::schema::metadata_health::table + .load::(conn)?, + ) + }, + ) + .await + } + + /// Lists all the metadata health records that is unhealthy. + #[allow(dead_code)] + pub(crate) async fn list_unhealthy_metadata_health_records( + &self, + ) -> DatabaseResult> { + use crate::schema::metadata_health::dsl::*; + self.with_measured_conn( + DatabaseOperation::ListMetadataHealthUnhealthy, + move |conn| -> DatabaseResult<_> { + Ok(crate::schema::metadata_health::table + .filter(healthy.eq(false)) + .load::(conn)?) + }, + ) + .await + } + + /// Lists all the metadata health records that have not been updated since an `earlier` time. 
+ #[allow(dead_code)] + pub(crate) async fn list_outdated_metadata_health_records( + &self, + earlier: chrono::DateTime, + ) -> DatabaseResult> { + use crate::schema::metadata_health::dsl::*; + + self.with_measured_conn( + DatabaseOperation::ListMetadataHealthOutdated, + move |conn| -> DatabaseResult<_> { + let query = metadata_health.filter(last_scrubbed_at.lt(earlier)); + let res = query.load::(conn)?; + + Ok(res) + }, + ) + .await + } + + /// Get the current entry from the `leader` table if one exists. + /// It is an error for the table to contain more than one entry. + pub(crate) async fn get_leader(&self) -> DatabaseResult> { + let mut leader: Vec = self + .with_measured_conn( + DatabaseOperation::GetLeader, + move |conn| -> DatabaseResult<_> { + Ok(crate::schema::controllers::table.load::(conn)?) + }, + ) + .await?; + + if leader.len() > 1 { + return Err(DatabaseError::Logical(format!( + "More than one entry present in the leader table: {leader:?}" + ))); + } + + Ok(leader.pop()) + } + + /// Update the new leader with compare-exchange semantics. If `prev` does not + /// match the current leader entry, then the update is treated as a failure. + /// When `prev` is not specified, the update is forced. 
+ pub(crate) async fn update_leader( + &self, + prev: Option, + new: ControllerPersistence, + ) -> DatabaseResult<()> { + use crate::schema::controllers::dsl::*; + + let updated = self + .with_measured_conn( + DatabaseOperation::UpdateLeader, + move |conn| -> DatabaseResult { + let updated = match &prev { + Some(prev) => diesel::update(controllers) + .filter(address.eq(prev.address.clone())) + .filter(started_at.eq(prev.started_at)) + .set(( + address.eq(new.address.clone()), + started_at.eq(new.started_at), + )) + .execute(conn)?, + None => diesel::insert_into(controllers) + .values(new.clone()) + .execute(conn)?, + }; + + Ok(updated) + }, + ) + .await?; + + if updated == 0 { + return Err(DatabaseError::Logical( + "Leader table update failed".to_string(), + )); + } + + Ok(()) + } } /// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably @@ -825,3 +937,68 @@ pub(crate) struct NodePersistence { pub(crate) listen_pg_addr: String, pub(crate) listen_pg_port: i32, } + +/// Tenant metadata health status that are stored durably. 
+#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)] +#[diesel(table_name = crate::schema::metadata_health)] +pub(crate) struct MetadataHealthPersistence { + #[serde(default)] + pub(crate) tenant_id: String, + #[serde(default)] + pub(crate) shard_number: i32, + #[serde(default)] + pub(crate) shard_count: i32, + + pub(crate) healthy: bool, + pub(crate) last_scrubbed_at: chrono::DateTime, +} + +impl MetadataHealthPersistence { + pub fn new( + tenant_shard_id: TenantShardId, + healthy: bool, + last_scrubbed_at: chrono::DateTime, + ) -> Self { + let tenant_id = tenant_shard_id.tenant_id.to_string(); + let shard_number = tenant_shard_id.shard_number.0 as i32; + let shard_count = tenant_shard_id.shard_count.literal() as i32; + + MetadataHealthPersistence { + tenant_id, + shard_number, + shard_count, + healthy, + last_scrubbed_at, + } + } + + #[allow(dead_code)] + pub(crate) fn get_tenant_shard_id(&self) -> Result { + Ok(TenantShardId { + tenant_id: TenantId::from_str(self.tenant_id.as_str())?, + shard_number: ShardNumber(self.shard_number as u8), + shard_count: ShardCount::new(self.shard_count as u8), + }) + } +} + +impl From for MetadataHealthRecord { + fn from(value: MetadataHealthPersistence) -> Self { + MetadataHealthRecord { + tenant_shard_id: value + .get_tenant_shard_id() + .expect("stored tenant id should be valid"), + healthy: value.healthy, + last_scrubbed_at: value.last_scrubbed_at, + } + } +} + +#[derive( + Serialize, Deserialize, Queryable, Selectable, Insertable, Eq, PartialEq, Debug, Clone, +)] +#[diesel(table_name = crate::schema::controllers)] +pub(crate) struct ControllerPersistence { + pub(crate) address: String, + pub(crate) started_at: chrono::DateTime, +} diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index fe97f724c1..94db879ade 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -1,6 +1,7 @@ use 
crate::pageserver_client::PageserverClient; use crate::persistence::Persistence; use crate::service; +use pageserver_api::controller_api::PlacementPolicy; use pageserver_api::models::{ LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, }; @@ -11,6 +12,7 @@ use std::collections::HashMap; use std::sync::Arc; use std::time::{Duration, Instant}; use tokio_util::sync::CancellationToken; +use utils::failpoint_support; use utils::generation::Generation; use utils::id::{NodeId, TimelineId}; use utils::lsn::Lsn; @@ -29,6 +31,7 @@ pub(super) struct Reconciler { /// of a tenant's state from when we spawned a reconcile task. pub(super) tenant_shard_id: TenantShardId, pub(crate) shard: ShardIdentity, + pub(crate) placement_policy: PlacementPolicy, pub(crate) generation: Option, pub(crate) intent: TargetState, @@ -36,6 +39,9 @@ pub(super) struct Reconciler { /// to detach this tenant shard. pub(crate) detach: Vec, + /// Configuration specific to this reconciler + pub(crate) reconciler_config: ReconcilerConfig, + pub(crate) config: TenantConfig, pub(crate) observed: ObservedState, @@ -70,6 +76,65 @@ pub(super) struct Reconciler { pub(crate) persistence: Arc, } +pub(crate) struct ReconcilerConfigBuilder { + config: ReconcilerConfig, +} + +impl ReconcilerConfigBuilder { + pub(crate) fn new() -> Self { + Self { + config: ReconcilerConfig::default(), + } + } + + pub(crate) fn secondary_warmup_timeout(self, value: Duration) -> Self { + Self { + config: ReconcilerConfig { + secondary_warmup_timeout: Some(value), + ..self.config + }, + } + } + + pub(crate) fn secondary_download_request_timeout(self, value: Duration) -> Self { + Self { + config: ReconcilerConfig { + secondary_download_request_timeout: Some(value), + ..self.config + }, + } + } + + pub(crate) fn build(self) -> ReconcilerConfig { + self.config + } +} + +#[derive(Default, Debug, Copy, Clone)] +pub(crate) struct ReconcilerConfig { + // During live migration give up on warming-up the secondary + // 
after this timeout. + secondary_warmup_timeout: Option, + + // During live migrations this is the amount of time that + // the pagserver will hold our poll. + secondary_download_request_timeout: Option, +} + +impl ReconcilerConfig { + pub(crate) fn get_secondary_warmup_timeout(&self) -> Duration { + const SECONDARY_WARMUP_TIMEOUT_DEFAULT: Duration = Duration::from_secs(300); + self.secondary_warmup_timeout + .unwrap_or(SECONDARY_WARMUP_TIMEOUT_DEFAULT) + } + + pub(crate) fn get_secondary_download_request_timeout(&self) -> Duration { + const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT_DEFAULT: Duration = Duration::from_secs(20); + self.secondary_download_request_timeout + .unwrap_or(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT_DEFAULT) + } +} + /// RAII resource units granted to a Reconciler, which it should keep alive until it finishes doing I/O pub(crate) struct ReconcileUnits { _sem_units: tokio::sync::OwnedSemaphorePermit, @@ -297,11 +362,13 @@ impl Reconciler { ) -> Result<(), ReconcileError> { // This is not the timeout for a request, but the total amount of time we're willing to wait // for a secondary location to get up to date before - const TOTAL_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(300); + let total_download_timeout = self.reconciler_config.get_secondary_warmup_timeout(); // This the long-polling interval for the secondary download requests we send to destination pageserver // during a migration. 
- const REQUEST_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(20); + let request_download_timeout = self + .reconciler_config + .get_secondary_download_request_timeout(); let started_at = Instant::now(); @@ -312,14 +379,14 @@ impl Reconciler { client .tenant_secondary_download( tenant_shard_id, - Some(REQUEST_DOWNLOAD_TIMEOUT), + Some(request_download_timeout), ) .await }, &self.service_config.jwt_token, 1, 3, - REQUEST_DOWNLOAD_TIMEOUT * 2, + request_download_timeout * 2, &self.cancel, ) .await @@ -347,7 +414,7 @@ impl Reconciler { return Ok(()); } else if status == StatusCode::ACCEPTED { let total_runtime = started_at.elapsed(); - if total_runtime > TOTAL_DOWNLOAD_TIMEOUT { + if total_runtime > total_download_timeout { tracing::warn!("Timed out after {}ms downloading layers to {node}. Progress so far: {}/{} layers, {}/{} bytes", total_runtime.as_millis(), progress.layers_downloaded, @@ -641,7 +708,7 @@ impl Reconciler { generation, &self.shard, &self.config, - !self.intent.secondary.is_empty(), + &self.placement_policy, ); match self.observed.locations.get(&node.get_id()) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => { @@ -653,11 +720,8 @@ impl Reconciler { // reconcile this location. This includes locations with different configurations, as well // as locations with unknown (None) observed state. - // The general case is to increment the generation. However, there are cases - // where this is not necessary: - // - if we are only updating the TenantConf part of the location - // - if we are only changing the attachment mode (e.g. going to attachedmulti or attachedstale) - // and the location was already in the correct generation + // Incrementing generation is the safe general case, but is inefficient for changes that only + // modify some details (e.g. the tenant's config). 
let increment_generation = match observed { None => true, Some(ObservedStateLocation { conf: None }) => true, @@ -666,18 +730,11 @@ impl Reconciler { }) => { let generations_match = observed.generation == wanted_conf.generation; - use LocationConfigMode::*; - let mode_transition_requires_gen_inc = - match (observed.mode, wanted_conf.mode) { - // Usually the short-lived attachment modes (multi and stale) are only used - // in the case of [`Self::live_migrate`], but it is simple to handle them correctly - // here too. Locations are allowed to go Single->Stale and Multi->Single within the same generation. - (AttachedSingle, AttachedStale) => false, - (AttachedMulti, AttachedSingle) => false, - (lhs, rhs) => lhs != rhs, - }; - - !generations_match || mode_transition_requires_gen_inc + // We may skip incrementing the generation if the location is already in the expected mode and + // generation. In principle it would also be safe to skip from certain other modes (e.g. AttachedStale), + // but such states are handled inside `live_migrate`, and if we see that state here we're cleaning up + // after a restart/crash, so fall back to the universally safe path of incrementing generation. 
+ !generations_match || (observed.mode != wanted_conf.mode) } }; @@ -747,6 +804,8 @@ impl Reconciler { self.location_config(&node, conf, None, false).await?; } + failpoint_support::sleep_millis_async!("sleep-on-reconcile-epilogue"); + Ok(()) } @@ -801,8 +860,15 @@ pub(crate) fn attached_location_conf( generation: Generation, shard: &ShardIdentity, config: &TenantConfig, - has_secondaries: bool, + policy: &PlacementPolicy, ) -> LocationConfig { + let has_secondaries = match policy { + PlacementPolicy::Attached(0) | PlacementPolicy::Detached | PlacementPolicy::Secondary => { + false + } + PlacementPolicy::Attached(_) => true, + }; + LocationConfig { mode: LocationConfigMode::AttachedSingle, generation: generation.into(), diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 0bd2eeac35..843159010d 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -391,7 +391,7 @@ impl Scheduler { return Err(ScheduleError::NoPageservers); } - let mut scores: Vec<(NodeId, AffinityScore, usize)> = self + let mut scores: Vec<(NodeId, AffinityScore, usize, usize)> = self .nodes .iter() .filter_map(|(k, v)| { @@ -402,6 +402,7 @@ impl Scheduler { *k, context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE), v.shard_count, + v.attached_shard_count, )) } }) @@ -409,9 +410,12 @@ impl Scheduler { // Sort by, in order of precedence: // 1st: Affinity score. We should never pick a higher-score node if a lower-score node is available - // 2nd: Utilization. Within nodes with the same affinity, use the least loaded nodes. - // 3rd: Node ID. This is a convenience to make selection deterministic in tests and empty systems. - scores.sort_by_key(|i| (i.1, i.2, i.0)); + // 2nd: Attached shard count. Within nodes with the same affinity, we always pick the node with + // the least number of attached shards. + // 3rd: Total shard count. 
Within nodes with the same affinity and attached shard count, use nodes + // with the lower total shard count. + // 4th: Node ID. This is a convenience to make selection deterministic in tests and empty systems. + scores.sort_by_key(|i| (i.1, i.3, i.2, i.0)); if scores.is_empty() { // After applying constraints, no pageservers were left. diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs index ff37d0fe77..77ba47e114 100644 --- a/storage_controller/src/schema.rs +++ b/storage_controller/src/schema.rs @@ -1,5 +1,22 @@ // @generated automatically by Diesel CLI. +diesel::table! { + controllers (address, started_at) { + address -> Varchar, + started_at -> Timestamptz, + } +} + +diesel::table! { + metadata_health (tenant_id, shard_number, shard_count) { + tenant_id -> Varchar, + shard_number -> Int4, + shard_count -> Int4, + healthy -> Bool, + last_scrubbed_at -> Timestamptz, + } +} + diesel::table! { nodes (node_id) { node_id -> Int8, @@ -26,4 +43,4 @@ diesel::table! 
{ } } -diesel::allow_tables_to_appear_in_same_query!(nodes, tenant_shards,); +diesel::allow_tables_to_appear_in_same_query!(controllers, metadata_health, nodes, tenant_shards,); diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 8475bf46d2..453e96bad3 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1,7 +1,9 @@ +use hyper::Uri; use std::{ borrow::Cow, cmp::Ordering, collections::{BTreeMap, HashMap, HashSet}, + ops::Deref, path::PathBuf, str::FromStr, sync::Arc, @@ -13,9 +15,16 @@ use crate::{ Drain, Fill, Operation, OperationError, OperationHandler, MAX_RECONCILES_PER_OPERATION, }, compute_hook::NotifyError, - id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, WrappedWriteGuard}, - persistence::{AbortShardSplitStatus, TenantFilter}, - reconciler::{ReconcileError, ReconcileUnits}, + drain_utils::{self, TenantShardDrain, TenantShardIterator}, + id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard}, + leadership::Leadership, + metrics, + peer_client::GlobalObservedState, + persistence::{ + AbortShardSplitStatus, ControllerPersistence, DatabaseResult, MetadataHealthPersistence, + TenantFilter, + }, + reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder}, scheduler::{MaySchedule, ScheduleContext, ScheduleMode}, tenant_shard::{ MigrateAttachment, ReconcileNeeded, ReconcilerStatus, ScheduleOptimization, @@ -31,11 +40,11 @@ use futures::{stream::FuturesUnordered, StreamExt}; use itertools::Itertools; use pageserver_api::{ controller_api::{ - NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, - ShardSchedulingPolicy, TenantCreateResponse, TenantCreateResponseShard, - TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse, - TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse, - UtilizationScore, + MetadataHealthRecord, 
MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest, + NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, TenantCreateRequest, + TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse, + TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest, + TenantShardMigrateRequest, TenantShardMigrateResponse, UtilizationScore, }, models::{SecondaryProgress, TenantConfigRequest, TopTenantShardsRequest}, }; @@ -46,10 +55,9 @@ use crate::pageserver_client::PageserverClient; use pageserver_api::{ models::{ self, LocationConfig, LocationConfigListResponse, LocationConfigMode, - PageserverUtilization, ShardParameters, TenantConfig, TenantCreateRequest, - TenantLocationConfigRequest, TenantLocationConfigResponse, TenantShardLocation, - TenantShardSplitRequest, TenantShardSplitResponse, TenantTimeTravelRequest, - TimelineCreateRequest, TimelineInfo, + PageserverUtilization, ShardParameters, TenantConfig, TenantLocationConfigRequest, + TenantLocationConfigResponse, TenantShardLocation, TenantShardSplitRequest, + TenantShardSplitResponse, TenantTimeTravelRequest, TimelineCreateRequest, TimelineInfo, }, shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId}, upcall_api::{ @@ -82,6 +90,8 @@ use crate::{ }, }; +pub mod chaos_injector; + // For operations that should be quick, like attaching a new tenant const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5); @@ -100,9 +110,13 @@ pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); /// How long a node may be unresponsive to heartbeats before we declare it offline. /// This must be long enough to cover node restarts as well as normal operations: in future -/// it should be separated into distinct timeouts for startup vs. 
normal operation -/// (``) -pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(300); +pub const MAX_OFFLINE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30); + +/// How long a node may be unresponsive to heartbeats during start up before we declare it +/// offline. This is much more lenient than [`MAX_OFFLINE_INTERVAL_DEFAULT`] since the pageserver's +/// handling of the re-attach response may take a long time and blocks heartbeats from +/// being handled on the pageserver side. +pub const MAX_WARMING_UP_INTERVAL_DEFAULT: Duration = Duration::from_secs(300); #[derive(Clone, strum_macros::Display)] enum TenantOperations { @@ -116,12 +130,41 @@ enum TenantOperations { SecondaryDownload, TimelineCreate, TimelineDelete, + AttachHook, + TimelineDetachAncestor, } #[derive(Clone, strum_macros::Display)] enum NodeOperations { Register, Configure, + Delete, +} + +/// The leadership status for the storage controller process. +/// Allowed transitions are: +/// 1. Leader -> SteppedDown +/// 2. Candidate -> Leader +#[derive( + Eq, + PartialEq, + Copy, + Clone, + strum_macros::Display, + strum_macros::EnumIter, + measured::FixedCardinalityLabel, +)] +#[strum(serialize_all = "snake_case")] +pub(crate) enum LeadershipStatus { + /// This is the steady state where the storage controller can produce + /// side effects in the cluster. + Leader, + /// We've been notified to step down by another candidate. No reconciliations + /// take place in this state. + SteppedDown, + /// Initial state for a new storage controller instance. Will attempt to assume leadership. + #[allow(unused)] + Candidate, } pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128; @@ -133,6 +176,8 @@ const MAX_DELAYED_RECONCILES: usize = 10000; // Top level state available to all HTTP handlers struct ServiceState { + leadership_status: LeadershipStatus, + tenants: BTreeMap, nodes: Arc>, @@ -152,6 +197,10 @@ struct ServiceState { /// controller API. 
fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError { match e { + mgmt_api::Error::SendRequest(e) => { + // Presume errors sending requests are connectivity/availability issues + ApiError::ResourceUnavailable(format!("{node} error sending request: {e}").into()) + } mgmt_api::Error::ReceiveErrorBody(str) => { // Presume errors receiving body are connectivity/availability issues ApiError::ResourceUnavailable( @@ -190,8 +239,12 @@ impl ServiceState { tenants: BTreeMap, scheduler: Scheduler, delayed_reconcile_rx: tokio::sync::mpsc::Receiver, + initial_leadership_status: LeadershipStatus, ) -> Self { + metrics::update_leadership_status(initial_leadership_status); + Self { + leadership_status: initial_leadership_status, tenants, nodes: Arc::new(nodes), scheduler, @@ -209,6 +262,20 @@ impl ServiceState { ) { (&mut self.nodes, &mut self.tenants, &mut self.scheduler) } + + fn get_leadership_status(&self) -> LeadershipStatus { + self.leadership_status + } + + fn step_down(&mut self) { + self.leadership_status = LeadershipStatus::SteppedDown; + metrics::update_leadership_status(self.leadership_status); + } + + fn become_leader(&mut self) { + self.leadership_status = LeadershipStatus::Leader; + metrics::update_leadership_status(self.leadership_status); + } } #[derive(Clone)] @@ -221,6 +288,9 @@ pub struct Config { // This JWT token will be used to authenticate this service to the control plane. pub control_plane_jwt_token: Option, + // This JWT token will be used to authenticate with other storage controller instances + pub peer_jwt_token: Option, + /// Where the compute hook should send notifications of pageserver attachment locations /// (this URL points to the control plane in prod). If this is None, the compute hook will /// assume it is running in a test environment and try to update neon_local. @@ -229,7 +299,12 @@ pub struct Config { /// Grace period within which a pageserver does not respond to heartbeats, but is still /// considered active. 
Once the grace period elapses, the next heartbeat failure will /// mark the pagseserver offline. - pub max_unavailable_interval: Duration, + pub max_offline_interval: Duration, + + /// Extended grace period within which pageserver may not respond to heartbeats. + /// This extended grace period kicks in after the node has been drained for restart + /// and/or upon handling the re-attach request from a node. + pub max_warming_up_interval: Duration, /// How many Reconcilers may be spawned concurrently pub reconciler_concurrency: usize, @@ -240,6 +315,18 @@ pub struct Config { // TODO: make this cfg(feature = "testing") pub neon_local_repo_dir: Option, + + // Maximum acceptable download lag for the secondary location + // while draining a node. If the secondary location is lagging + // by more than the configured amount, then the secondary is not + // upgraded to primary. + pub max_secondary_lag_bytes: Option, + + pub address_for_peers: Option, + + pub start_as_candidate: bool, + + pub http_service_port: i32, } impl From for ApiError { @@ -250,7 +337,7 @@ impl From for ApiError { DatabaseError::Connection(_) | DatabaseError::ConnectionPool(_) => { ApiError::ShuttingDown } - DatabaseError::Logical(reason) => { + DatabaseError::Logical(reason) | DatabaseError::Migration(reason) => { ApiError::InternalServerError(anyhow::anyhow!(reason)) } } @@ -262,7 +349,7 @@ pub struct Service { config: Config, persistence: Arc, compute_hook: Arc, - result_tx: tokio::sync::mpsc::UnboundedSender, + result_tx: tokio::sync::mpsc::UnboundedSender, heartbeater: Heartbeater, @@ -292,9 +379,15 @@ pub struct Service { // Process shutdown will fire this token cancel: CancellationToken, + // Child token of [`Service::cancel`] used by reconcilers + reconcilers_cancel: CancellationToken, + // Background tasks will hold this gate gate: Gate, + // Reconcilers background tasks will hold this gate + reconcilers_gate: Gate, + /// This waits for initial reconciliation with pageservers to complete. 
Until this barrier /// passes, it isn't safe to do any actions that mutate tenants. pub(crate) startup_complete: Barrier, @@ -359,7 +452,7 @@ struct TenantShardSplitAbort { new_shard_count: ShardCount, new_stripe_size: Option, /// Until this abort op is complete, no other operations may be done on the tenant - _tenant_lock: WrappedWriteGuard, + _tenant_lock: TracingExclusiveGuard, } #[derive(thiserror::Error, Debug)] @@ -381,6 +474,26 @@ struct ShardUpdate { generation: Option, } +enum StopReconciliationsReason { + ShuttingDown, + SteppingDown, +} + +impl std::fmt::Display for StopReconciliationsReason { + fn fmt(&self, writer: &mut std::fmt::Formatter) -> std::fmt::Result { + let s = match self { + Self::ShuttingDown => "Shutting down", + Self::SteppingDown => "Stepping down", + }; + write!(writer, "{}", s) + } +} + +pub(crate) enum ReconcileResultRequest { + ReconcileResult(ReconcileResult), + Stop, +} + impl Service { pub fn get_config(&self) -> &Config { &self.config @@ -391,15 +504,12 @@ impl Service { #[instrument(skip_all)] async fn startup_reconcile( self: &Arc, + current_leader: Option, + leader_step_down_state: Option, bg_compute_notify_result_tx: tokio::sync::mpsc::Sender< Result<(), (TenantShardId, NotifyError)>, >, ) { - // For all tenant shards, a vector of observed states on nodes (where None means - // indeterminate, same as in [`ObservedStateLocation`]) - let mut observed: HashMap)>> = - HashMap::new(); - // Startup reconciliation does I/O to other services: whether they // are responsive or not, we should aim to finish within our deadline, because: // - If we don't, a k8s readiness hook watching /ready will kill us. 
@@ -413,26 +523,26 @@ impl Service { .checked_add(STARTUP_RECONCILE_TIMEOUT / 2) .expect("Reconcile timeout is a modest constant"); + let observed = if let Some(state) = leader_step_down_state { + tracing::info!( + "Using observed state received from leader at {}", + current_leader.as_ref().unwrap().address + ); + + state + } else { + self.build_global_observed_state(node_scan_deadline).await + }; + // Accumulate a list of any tenant locations that ought to be detached let mut cleanup = Vec::new(); - let node_listings = self.scan_node_locations(node_scan_deadline).await; - // Send initial heartbeat requests to nodes that replied to the location listing above. - let nodes_online = self.initial_heartbeat_round(node_listings.keys()).await; - - for (node_id, list_response) in node_listings { - let tenant_shards = list_response.tenant_shards; - tracing::info!( - "Received {} shard statuses from pageserver {}, setting it to Active", - tenant_shards.len(), - node_id - ); - - for (tenant_shard_id, conf_opt) in tenant_shards { - let shard_observations = observed.entry(tenant_shard_id).or_default(); - shard_observations.push((node_id, conf_opt)); - } - } + // Send initial heartbeat requests to all nodes loaded from the database + let all_nodes = { + let locked = self.inner.read().unwrap(); + locked.nodes.clone() + }; + let nodes_online = self.initial_heartbeat_round(all_nodes.keys()).await; // List of tenants for which we will attempt to notify compute of their location at startup let mut compute_notifications = Vec::new(); @@ -455,17 +565,16 @@ impl Service { } *nodes = Arc::new(new_nodes); - for (tenant_shard_id, shard_observations) in observed { - for (node_id, observed_loc) in shard_observations { - let Some(tenant_shard) = tenants.get_mut(&tenant_shard_id) else { - cleanup.push((tenant_shard_id, node_id)); - continue; - }; - tenant_shard - .observed - .locations - .insert(node_id, ObservedStateLocation { conf: observed_loc }); - } + for (tenant_shard_id, observed_state) 
in observed.0 { + let Some(tenant_shard) = tenants.get_mut(&tenant_shard_id) else { + for node_id in observed_state.locations.keys() { + cleanup.push((tenant_shard_id, *node_id)); + } + + continue; + }; + + tenant_shard.observed = observed_state; } // Populate each tenant's intent state @@ -499,6 +608,21 @@ impl Service { tenants.len() }; + // Before making any obeservable changes to the cluster, persist self + // as leader in database and memory. + let leadership = Leadership::new( + self.persistence.clone(), + self.config.clone(), + self.cancel.child_token(), + ); + + if let Err(e) = leadership.become_leader(current_leader).await { + tracing::error!("Failed to persist self as leader: {e}. Aborting start-up ..."); + std::process::exit(1); + } + + self.inner.write().unwrap().become_leader(); + // TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that // generation_pageserver in the database. @@ -580,6 +704,9 @@ impl Service { online_nodes.insert(node_id, utilization); } PageserverState::Offline => {} + PageserverState::WarmingUp { .. } => { + unreachable!("Nodes are never marked warming-up during startup reconcile") + } } } } @@ -661,6 +788,31 @@ impl Service { node_results } + async fn build_global_observed_state(&self, deadline: Instant) -> GlobalObservedState { + let node_listings = self.scan_node_locations(deadline).await; + let mut observed = GlobalObservedState::default(); + + for (node_id, location_confs) in node_listings { + tracing::info!( + "Received {} shard statuses from pageserver {}", + location_confs.tenant_shards.len(), + node_id + ); + + for (tid, location_conf) in location_confs.tenant_shards { + let entry = observed.0.entry(tid).or_default(); + entry.locations.insert( + node_id, + ObservedStateLocation { + conf: location_conf, + }, + ); + } + } + + observed + } + /// Used during [`Self::startup_reconcile`]: detach a list of unknown-to-us tenants from pageservers. 
/// /// This is safe to run in the background, because if we don't have this TenantShardId in our map of @@ -734,7 +886,7 @@ impl Service { const BACKGROUND_RECONCILE_PERIOD: Duration = Duration::from_secs(20); let mut interval = tokio::time::interval(BACKGROUND_RECONCILE_PERIOD); - while !self.cancel.is_cancelled() { + while !self.reconcilers_cancel.is_cancelled() { tokio::select! { _ = interval.tick() => { let reconciles_spawned = self.reconcile_all(); @@ -747,7 +899,7 @@ impl Service { } } } - _ = self.cancel.cancelled() => return + _ = self.reconcilers_cancel.cancelled() => return } } } @@ -772,61 +924,54 @@ impl Service { let res = self.heartbeater.heartbeat(nodes).await; if let Ok(deltas) = res { for (node_id, state) in deltas.0 { - let (new_node, new_availability) = match state { - PageserverState::Available { - utilization, new, .. - } => ( - new, - NodeAvailability::Active(UtilizationScore( - utilization.utilization_score, - )), + let new_availability = match state { + PageserverState::Available { utilization, .. } => NodeAvailability::Active( + UtilizationScore(utilization.utilization_score), ), - PageserverState::Offline => (false, NodeAvailability::Offline), + PageserverState::WarmingUp { started_at } => { + NodeAvailability::WarmingUp(started_at) + } + PageserverState::Offline => { + // The node might have been placed in the WarmingUp state + // while the heartbeat round was on-going. Hence, filter out + // offline transitions for WarmingUp nodes that are still within + // their grace period. 
+ if let Ok(NodeAvailability::WarmingUp(started_at)) = + self.get_node(node_id).await.map(|n| n.get_availability()) + { + let now = Instant::now(); + if now - started_at >= self.config.max_warming_up_interval { + NodeAvailability::Offline + } else { + NodeAvailability::WarmingUp(started_at) + } + } else { + NodeAvailability::Offline + } + } }; - if new_node { - // When the heartbeats detect a newly added node, we don't wish - // to attempt to reconcile the shards assigned to it. The node - // is likely handling it's re-attach response, so reconciling now - // would be counterproductive. - // - // Instead, update the in-memory state with the details learned about the - // node. - let mut locked = self.inner.write().unwrap(); - let (nodes, _tenants, scheduler) = locked.parts_mut(); + // This is the code path for geniune availability transitions (i.e node + // goes unavailable and/or comes back online). + let res = self + .node_configure(node_id, Some(new_availability), None) + .await; - let mut new_nodes = (**nodes).clone(); - - if let Some(node) = new_nodes.get_mut(&node_id) { - node.set_availability(new_availability); - scheduler.node_upsert(node); + match res { + Ok(()) => {} + Err(ApiError::NotFound(_)) => { + // This should be rare, but legitimate since the heartbeats are done + // on a snapshot of the nodes. + tracing::info!("Node {} was not found after heartbeat round", node_id); } - - locked.nodes = Arc::new(new_nodes); - } else { - // This is the code path for geniune availability transitions (i.e node - // goes unavailable and/or comes back online). - let res = self - .node_configure(node_id, Some(new_availability), None) - .await; - - match res { - Ok(()) => {} - Err(ApiError::NotFound(_)) => { - // This should be rare, but legitimate since the heartbeats are done - // on a snapshot of the nodes. 
- tracing::info!( - "Node {} was not found after heartbeat round", - node_id - ); - } - Err(err) => { - tracing::error!( - "Failed to update node {} after heartbeat round: {}", - node_id, - err - ); - } + Err(err) => { + // Transition to active involves reconciling: if a node responds to a heartbeat then + // becomes unavailable again, we may get an error here. + tracing::error!( + "Failed to update node {} after heartbeat round: {}", + node_id, + err + ); } } } @@ -842,9 +987,10 @@ impl Service { tenant_id=%result.tenant_shard_id.tenant_id, shard_id=%result.tenant_shard_id.shard_slug(), sequence=%result.sequence ))] - fn process_result(&self, result: ReconcileResult) { + fn process_result(&self, mut result: ReconcileResult) { let mut locked = self.inner.write().unwrap(); - let Some(tenant) = locked.tenants.get_mut(&result.tenant_shard_id) else { + let (nodes, tenants, _scheduler) = locked.parts_mut(); + let Some(tenant) = tenants.get_mut(&result.tenant_shard_id) else { // A reconciliation result might race with removing a tenant: drop results for // tenants that aren't in our map. return; @@ -861,6 +1007,13 @@ impl Service { // Let the TenantShard know it is idle. 
tenant.reconcile_complete(result.sequence); + // In case a node was deleted while this reconcile is in flight, filter it out of the update we will + // make to the tenant + result + .observed + .locations + .retain(|node_id, _loc| nodes.contains_key(node_id)); + match result.result { Ok(()) => { for (node_id, loc) in &result.observed.locations { @@ -870,6 +1023,7 @@ impl Service { tracing::info!("Setting observed location {} to None", node_id,) } } + tenant.observed = result.observed; tenant.waiter.advance(result.sequence); } @@ -916,7 +1070,7 @@ impl Service { async fn process_results( &self, - mut result_rx: tokio::sync::mpsc::UnboundedReceiver, + mut result_rx: tokio::sync::mpsc::UnboundedReceiver, mut bg_compute_hook_result_rx: tokio::sync::mpsc::Receiver< Result<(), (TenantShardId, NotifyError)>, >, @@ -926,8 +1080,8 @@ impl Service { tokio::select! { r = result_rx.recv() => { match r { - Some(result) => {self.process_result(result);}, - None => {break;} + Some(ReconcileResultRequest::ReconcileResult(result)) => {self.process_result(result);}, + None | Some(ReconcileResultRequest::Stop) => {break;} } } _ = async{ @@ -953,9 +1107,6 @@ impl Service { } }; } - - // We should only fall through on shutdown - assert!(self.cancel.is_cancelled()); } async fn process_aborts( @@ -1005,6 +1156,16 @@ impl Service { let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel(); let (abort_tx, abort_rx) = tokio::sync::mpsc::unbounded_channel(); + let leadership_cancel = CancellationToken::new(); + let leadership = Leadership::new(persistence.clone(), config.clone(), leadership_cancel); + let (leader, leader_step_down_state) = leadership.step_down_current_leader().await?; + + // Apply the migrations **after** the current leader has stepped down + // (or we've given up waiting for it), but **before** reading from the + // database. The only exception is reading the current leader before + // migrating. 
+ persistence.migration_run().await?; + tracing::info!("Loading nodes from database..."); let nodes = persistence .list_nodes() @@ -1106,8 +1267,16 @@ impl Service { // We will populate intent properly later in [`Self::startup_reconcile`], initially populate // it with what we can infer: the node for which a generation was most recently issued. let mut intent = IntentState::new(); - if let Some(generation_pageserver) = tsp.generation_pageserver { - intent.set_attached(&mut scheduler, Some(NodeId(generation_pageserver as u64))); + if let Some(generation_pageserver) = tsp.generation_pageserver.map(|n| NodeId(n as u64)) + { + if nodes.contains_key(&generation_pageserver) { + intent.set_attached(&mut scheduler, Some(generation_pageserver)); + } else { + // If a node was removed before being completely drained, it is legal for it to leave behind a `generation_pageserver` referring + // to a non-existent node, because node deletion doesn't block on completing the reconciliations that will issue new generations + // on different pageservers. 
+ tracing::warn!("Tenant shard {tenant_shard_id} references non-existent node {generation_pageserver} in database, will be rescheduled"); + } } let new_tenant = TenantShard::from_persistent(tsp, intent)?; @@ -1124,17 +1293,28 @@ impl Service { tokio::sync::mpsc::channel(MAX_DELAYED_RECONCILES); let cancel = CancellationToken::new(); + let reconcilers_cancel = cancel.child_token(); + let heartbeater = Heartbeater::new( config.jwt_token.clone(), - config.max_unavailable_interval, + config.max_offline_interval, + config.max_warming_up_interval, cancel.clone(), ); + + let initial_leadership_status = if config.start_as_candidate { + LeadershipStatus::Candidate + } else { + LeadershipStatus::Leader + }; + let this = Arc::new(Self { inner: Arc::new(std::sync::RwLock::new(ServiceState::new( nodes, tenants, scheduler, delayed_reconcile_rx, + initial_leadership_status, ))), config: config.clone(), persistence, @@ -1148,7 +1328,9 @@ impl Service { abort_tx, startup_complete: startup_complete.clone(), cancel, + reconcilers_cancel, gate: Gate::default(), + reconcilers_gate: Gate::default(), tenant_op_locks: Default::default(), node_op_locks: Default::default(), }); @@ -1201,7 +1383,9 @@ impl Service { return; }; - this.startup_reconcile(bg_compute_notify_result_tx).await; + this.startup_reconcile(leader, leader_step_down_state, bg_compute_notify_result_tx) + .await; + drop(startup_completion); } }); @@ -1231,6 +1415,13 @@ impl Service { &self, attach_req: AttachHookRequest, ) -> anyhow::Result { + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + attach_req.tenant_shard_id.tenant_id, + TenantOperations::AttachHook, + ) + .await; + // This is a test hook. To enable using it on tenants that were created directly with // the pageserver API (not via this service), we will auto-create any missing tenant // shards with default state. 
@@ -1384,7 +1575,7 @@ impl Service { tenant_shard.generation.unwrap(), &tenant_shard.shard, &tenant_shard.config, - false, + &PlacementPolicy::Attached(0), )), }, )]); @@ -1429,7 +1620,7 @@ impl Service { async fn node_activate_reconcile( &self, mut node: Node, - _lock: &WrappedWriteGuard, + _lock: &TracingExclusiveGuard, ) -> Result<(), ApiError> { // This Node is a mutable local copy: we will set it active so that we can use its // API client to reconcile with the node. The Node in [`Self::nodes`] will get updated @@ -1631,21 +1822,23 @@ impl Service { | NodeSchedulingPolicy::Filling ); - if !node.is_available() || reset_scheduling { - let mut new_nodes = (**nodes).clone(); - if let Some(node) = new_nodes.get_mut(&reattach_req.node_id) { - if !node.is_available() { - node.set_availability(NodeAvailability::Active(UtilizationScore::worst())); - } - - if reset_scheduling { - node.set_scheduling(NodeSchedulingPolicy::Active); - } - - scheduler.node_upsert(node); - let new_nodes = Arc::new(new_nodes); - *nodes = new_nodes; + let mut new_nodes = (**nodes).clone(); + if let Some(node) = new_nodes.get_mut(&reattach_req.node_id) { + if reset_scheduling { + node.set_scheduling(NodeSchedulingPolicy::Active); } + + tracing::info!("Marking {} warming-up on reattach", reattach_req.node_id); + node.set_availability(NodeAvailability::WarmingUp(std::time::Instant::now())); + + scheduler.node_upsert(node); + let new_nodes = Arc::new(new_nodes); + *nodes = new_nodes; + } else { + tracing::error!( + "Reattaching node {} was removed while processing the request", + reattach_req.node_id + ); } } @@ -2346,18 +2539,18 @@ impl Service { tracing::info!("Doing time travel recovery for shard {tenant_shard_id}",); client - .tenant_time_travel_remote_storage( - tenant_shard_id, - &timestamp, - &done_if_after, - ) - .await - .map_err(|e| { - ApiError::InternalServerError(anyhow::anyhow!( - "Error doing time travel recovery for shard {tenant_shard_id} on node {}: {e}", - node - )) - })?; +
.tenant_time_travel_remote_storage( + tenant_shard_id, + &timestamp, + &done_if_after, + ) + .await + .map_err(|e| { + ApiError::InternalServerError(anyhow::anyhow!( + "Error doing time travel recovery for shard {tenant_shard_id} on node {}: {e}", + node + )) + })?; } } Ok(()) @@ -2658,6 +2851,7 @@ impl Service { TenantOperations::TimelineCreate, ) .await; + failpoint_support::sleep_millis_async!("tenant-create-timeline-shared-lock"); self.ensure_attached_wait(tenant_id).await?; @@ -2726,7 +2920,7 @@ impl Service { // Create timeline on remaining shards with number >0 if !targets.is_empty() { // If we had multiple shards, issue requests for the remainder now. - let jwt = self.config.jwt_token.clone(); + let jwt = &self.config.jwt_token; self.tenant_for_shards(targets, |tenant_shard_id: TenantShardId, node: Node| { let create_req = create_req.clone(); Box::pin(create_one(tenant_shard_id, node, jwt.clone(), create_req)) @@ -2737,6 +2931,123 @@ impl Service { Ok(timeline_info) } + pub(crate) async fn tenant_timeline_detach_ancestor( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result { + tracing::info!("Detaching timeline {tenant_id}/{timeline_id}",); + + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimelineDetachAncestor, + ) + .await; + + self.ensure_attached_wait(tenant_id).await?; + + let targets = { + let locked = self.inner.read().unwrap(); + let mut targets = Vec::new(); + + for (tenant_shard_id, shard) in + locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + { + let node_id = shard.intent.get_attached().ok_or_else(|| { + ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled")) + })?; + let node = locked + .nodes + .get(&node_id) + .expect("Pageservers may not be deleted while referenced"); + + targets.push((*tenant_shard_id, node.clone())); + } + targets + }; + + if targets.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant not found").into(), + ));
+ } + + async fn detach_one( + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + node: Node, + jwt: Option, + ) -> Result<(ShardNumber, models::detach_ancestor::AncestorDetached), ApiError> { + tracing::info!( + "Detaching timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}", + ); + + let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); + + client + .timeline_detach_ancestor(tenant_shard_id, timeline_id) + .await + .map_err(|e| { + use mgmt_api::Error; + + match e { + // no ancestor (ever) + Error::ApiError(StatusCode::CONFLICT, msg) => ApiError::Conflict(format!( + "{node}: {}", + msg.strip_prefix("Conflict: ").unwrap_or(&msg) + )), + // too many ancestors + Error::ApiError(StatusCode::BAD_REQUEST, msg) => { + ApiError::BadRequest(anyhow::anyhow!("{node}: {msg}")) + } + Error::ApiError(StatusCode::INTERNAL_SERVER_ERROR, msg) => { + // avoid turning these into conflicts to remain compatible with + // pageservers, 500 errors are sadly retryable with timeline ancestor + // detach + ApiError::InternalServerError(anyhow::anyhow!("{node}: {msg}")) + } + // rest can be mapped as usual + other => passthrough_api_error(&node, other), + } + }) + .map(|res| (tenant_shard_id.shard_number, res)) + } + + // no shard needs to go first/last; the operation should be idempotent + let mut results = self + .tenant_for_shards(targets, |tenant_shard_id, node| { + futures::FutureExt::boxed(detach_one( + tenant_shard_id, + timeline_id, + node, + self.config.jwt_token.clone(), + )) + }) + .await?; + + let any = results.pop().expect("we must have at least one response"); + + let mismatching = results + .iter() + .filter(|(_, res)| res != &any.1) + .collect::>(); + if !mismatching.is_empty() { + // this can be hit by races which should not happen because operation lock on cplane + let matching = results.len() - mismatching.len(); + tracing::error!( + matching, + compared_against=?any, + ?mismatching, + "shards returned 
different results" + ); + + return Err(ApiError::InternalServerError(anyhow::anyhow!("pageservers returned mixed results for ancestor detach; manual intervention is required."))); + } + + Ok(any.1) + } + /// Helper for concurrently calling a pageserver API on a number of shards, such as timeline creation. /// /// On success, the returned vector contains exactly the same number of elements as the input `locations`. @@ -2863,8 +3174,8 @@ impl Service { .await .map_err(|e| { ApiError::InternalServerError(anyhow::anyhow!( - "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}", - )) + "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}", + )) }) } @@ -3314,7 +3625,7 @@ impl Service { generation, &child_shard, &config, - matches!(policy, PlacementPolicy::Attached(n) if n > 0), + &policy, )), }, ); @@ -3816,6 +4127,8 @@ impl Service { "failpoint".to_string() ))); + failpoint_support::sleep_millis_async!("shard-split-post-remote-sleep", &self.cancel); + tracing::info!( "Split {} into {}", parent_id, @@ -4055,7 +4368,14 @@ impl Service { placement_policy: Some(PlacementPolicy::Attached(0)), // No secondaries, for convenient debug/hacking // There is no way to know what the tenant's config was: revert to defaults - config: TenantConfig::default(), + // + // TODO: remove `switch_aux_file_policy` once we finish auxv2 migration + // + // we write to both v1+v2 storage, so that the test case can use either storage format for testing + config: TenantConfig { + switch_aux_file_policy: Some(models::AuxFilePolicy::CrossValidation), + ..TenantConfig::default() + }, }) .await?; @@ -4192,8 +4512,6 @@ impl Service { /// This is for debug/support only: we simply drop all state for a tenant, without /// detaching or deleting it on pageservers. We do not try and re-schedule any /// tenants that were on this node. 
- /// - /// TODO: proper node deletion API that unhooks things more gracefully pub(crate) async fn node_drop(&self, node_id: NodeId) -> Result<(), ApiError> { self.persistence.delete_node(node_id).await?; @@ -4201,6 +4519,7 @@ impl Service { for shard in locked.tenants.values_mut() { shard.deref_node(node_id); + shard.observed.locations.remove(&node_id); } let mut nodes = (*locked.nodes).clone(); @@ -4212,6 +4531,94 @@ impl Service { Ok(()) } + /// If a node has any work on it, it will be rescheduled: this is "clean" in the sense + /// that we don't leave any bad state behind in the storage controller, but unclean + /// in the sense that we are not carefully draining the node. + pub(crate) async fn node_delete(&self, node_id: NodeId) -> Result<(), ApiError> { + let _node_lock = + trace_exclusive_lock(&self.node_op_locks, node_id, NodeOperations::Delete).await; + + // 1. Atomically update in-memory state: + // - set the scheduling state to Pause to make subsequent scheduling ops skip it + // - update shards' intents to exclude the node, and reschedule any shards whose intents we modified. + // - drop the node from the main nodes map, so that when running reconciles complete they do not + // re-insert references to this node into the ObservedState of shards + // - drop the node from the scheduler + { + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + + { + let mut nodes_mut = (*nodes).deref().clone(); + match nodes_mut.get_mut(&node_id) { + Some(node) => { + // We do not bother setting this in the database, because we're about to delete the row anyway, and + // if we crash it would not be desirable to leave the node paused after a restart. 
+ node.set_scheduling(NodeSchedulingPolicy::Pause); + } + None => { + tracing::info!( + "Node not found: presuming this is a retry and returning success" + ); + return Ok(()); + } + } + + *nodes = Arc::new(nodes_mut); + } + + for (tenant_shard_id, shard) in tenants { + if shard.deref_node(node_id) { + // FIXME: we need to build a ScheduleContext that reflects this shard's peers, otherwise + // it won't properly do anti-affinity. + let mut schedule_context = ScheduleContext::default(); + + if let Err(e) = shard.schedule(scheduler, &mut schedule_context) { + // TODO: implement force flag to remove a node even if we can't reschedule + // a tenant + tracing::error!("Refusing to delete node, shard {tenant_shard_id} can't be rescheduled: {e}"); + return Err(e.into()); + } else { + tracing::info!( + "Rescheduled shard {tenant_shard_id} away from node during deletion" + ) + } + + self.maybe_reconcile_shard(shard, nodes); + } + + // Here we remove an existing observed location for the node we're removing, and it will + // not be re-added by a reconciler's completion because we filter out removed nodes in + // process_result. + // + // Note that we update the shard's observed state _after_ calling maybe_reconcile_shard: that + // means any reconciles we spawned will know about the node we're deleting, enabling them + // to do live migrations if it's still online. + shard.observed.locations.remove(&node_id); + } + + scheduler.node_remove(node_id); + + { + let mut nodes_mut = (**nodes).clone(); + nodes_mut.remove(&node_id); + *nodes = Arc::new(nodes_mut); + } + } + + // Note: some `generation_pageserver` columns on tenant shards in the database may still refer to + // the removed node, as this column means "The pageserver to which this generation was issued", and + // their generations won't get updated until the reconcilers moving them away from this node complete. 
+ // That is safe because in Service::spawn we only use generation_pageserver if it refers to a node + // that exists. + + // 2. Actually delete the node from the database and from in-memory state + tracing::info!("Deleting node from database"); + self.persistence.delete_node(node_id).await?; + + Ok(()) + } + pub(crate) async fn node_list(&self) -> Result, ApiError> { let nodes = { self.inner @@ -4238,6 +4645,10 @@ impl Service { )) } + pub(crate) async fn get_leader(&self) -> DatabaseResult> { + self.persistence.get_leader().await + } + pub(crate) async fn node_register( &self, register_req: NodeRegisterRequest, @@ -4481,6 +4892,15 @@ impl Service { // TODO: in the background, we should balance work back onto this pageserver } + // No action required for the intermediate unavailable state. + // When we transition into active or offline from the unavailable state, + // the correct handling above will kick in. + AvailabilityTransition::ToWarmingUpFromActive => { + tracing::info!("Node {} transition to unavailable from active", node_id); + } + AvailabilityTransition::ToWarmingUpFromOffline => { + tracing::info!("Node {} transition to unavailable from offline", node_id); + } AvailabilityTransition::Unchanged => { tracing::debug!("Node {} no availability change during config", node_id); } @@ -4491,6 +4911,26 @@ impl Service { Ok(()) } + /// Wrapper around [`Self::node_configure`] which only allows changes while there is no ongoing + /// operation for HTTP api. 
+ pub(crate) async fn external_node_configure( + &self, + node_id: NodeId, + availability: Option, + scheduling: Option, + ) -> Result<(), ApiError> { + { + let locked = self.inner.read().unwrap(); + if let Some(op) = locked.ongoing_operation.as_ref().map(|op| op.operation) { + return Err(ApiError::PreconditionFailed( + format!("Ongoing background operation forbids configuring: {op}").into(), + )); + } + } + + self.node_configure(node_id, availability, scheduling).await + } + pub(crate) async fn start_node_drain( self: &Arc, node_id: NodeId, @@ -4540,17 +4980,22 @@ impl Service { self.node_configure(node_id, None, Some(NodeSchedulingPolicy::Draining)) .await?; - let cancel = CancellationToken::new(); + let cancel = self.cancel.child_token(); + let gate_guard = self.gate.enter().map_err(|_| ApiError::ShuttingDown)?; self.inner.write().unwrap().ongoing_operation = Some(OperationHandler { operation: Operation::Drain(Drain { node_id }), cancel: cancel.clone(), }); + let span = tracing::info_span!(parent: None, "drain_node", %node_id); + tokio::task::spawn({ let service = self.clone(); let cancel = cancel.clone(); async move { + let _gate_guard = gate_guard; + scopeguard::defer! 
{ let prev = service.inner.write().unwrap().ongoing_operation.take(); @@ -4561,21 +5006,21 @@ impl Service { } } - tracing::info!(%node_id, "Drain background operation starting"); + tracing::info!("Drain background operation starting"); let res = service.drain_node(node_id, cancel).await; match res { Ok(()) => { - tracing::info!(%node_id, "Drain background operation completed successfully"); + tracing::info!("Drain background operation completed successfully"); } Err(OperationError::Cancelled) => { - tracing::info!(%node_id, "Drain background operation was cancelled"); + tracing::info!("Drain background operation was cancelled"); } Err(err) => { - tracing::error!(%node_id, "Drain background operation encountered: {err}") + tracing::error!("Drain background operation encountered: {err}") } } } - }); + }.instrument(span)); } NodeSchedulingPolicy::Draining => { return Err(ApiError::Conflict(format!( @@ -4592,6 +5037,38 @@ impl Service { Ok(()) } + pub(crate) async fn cancel_node_drain(&self, node_id: NodeId) -> Result<(), ApiError> { + let node_available = { + let locked = self.inner.read().unwrap(); + let nodes = &locked.nodes; + let node = nodes.get(&node_id).ok_or(ApiError::NotFound( + anyhow::anyhow!("Node {} not registered", node_id).into(), + ))?; + + node.is_available() + }; + + if !node_available { + return Err(ApiError::ResourceUnavailable( + format!("Node {node_id} is currently unavailable").into(), + )); + } + + if let Some(op_handler) = self.inner.read().unwrap().ongoing_operation.as_ref() { + if let Operation::Drain(drain) = op_handler.operation { + if drain.node_id == node_id { + tracing::info!("Cancelling background drain operation for node {node_id}"); + op_handler.cancel.cancel(); + return Ok(()); + } + } + } + + Err(ApiError::PreconditionFailed( + format!("Node {node_id} has no drain in progress").into(), + )) + } + pub(crate) async fn start_node_fill(self: &Arc, node_id: NodeId) -> Result<(), ApiError> { let (ongoing_op, node_available, node_policy, 
total_nodes_count) = { let locked = self.inner.read().unwrap(); @@ -4634,17 +5111,22 @@ impl Service { self.node_configure(node_id, None, Some(NodeSchedulingPolicy::Filling)) .await?; - let cancel = CancellationToken::new(); + let cancel = self.cancel.child_token(); + let gate_guard = self.gate.enter().map_err(|_| ApiError::ShuttingDown)?; self.inner.write().unwrap().ongoing_operation = Some(OperationHandler { operation: Operation::Fill(Fill { node_id }), cancel: cancel.clone(), }); + let span = tracing::info_span!(parent: None, "fill_node", %node_id); + tokio::task::spawn({ let service = self.clone(); let cancel = cancel.clone(); async move { + let _gate_guard = gate_guard; + scopeguard::defer! { let prev = service.inner.write().unwrap().ongoing_operation.take(); @@ -4655,21 +5137,21 @@ impl Service { } } - tracing::info!(%node_id, "Fill background operation starting"); + tracing::info!("Fill background operation starting"); let res = service.fill_node(node_id, cancel).await; match res { Ok(()) => { - tracing::info!(%node_id, "Fill background operation completed successfully"); + tracing::info!("Fill background operation completed successfully"); } Err(OperationError::Cancelled) => { - tracing::info!(%node_id, "Fill background operation was cancelled"); + tracing::info!("Fill background operation was cancelled"); } Err(err) => { - tracing::error!(%node_id, "Fill background operation encountered: {err}") + tracing::error!("Fill background operation encountered: {err}") } } } - }); + }.instrument(span)); } NodeSchedulingPolicy::Filling => { return Err(ApiError::Conflict(format!( @@ -4686,6 +5168,38 @@ impl Service { Ok(()) } + pub(crate) async fn cancel_node_fill(&self, node_id: NodeId) -> Result<(), ApiError> { + let node_available = { + let locked = self.inner.read().unwrap(); + let nodes = &locked.nodes; + let node = nodes.get(&node_id).ok_or(ApiError::NotFound( + anyhow::anyhow!("Node {} not registered", node_id).into(), + ))?; + + node.is_available() + }; + + 
if !node_available { + return Err(ApiError::ResourceUnavailable( + format!("Node {node_id} is currently unavailable").into(), + )); + } + + if let Some(op_handler) = self.inner.read().unwrap().ongoing_operation.as_ref() { + if let Operation::Fill(fill) = op_handler.operation { + if fill.node_id == node_id { + tracing::info!("Cancelling background fill operation for node {node_id}"); + op_handler.cancel.cancel(); + return Ok(()); + } + } + } + + Err(ApiError::PreconditionFailed( + format!("Node {node_id} has no fill in progress").into(), + )) + } + /// Helper for methods that will try and call pageserver APIs for /// a tenant, such as timeline CRUD: they cannot proceed unless the tenant /// is attached somewhere. @@ -4752,11 +5266,22 @@ impl Service { Ok(()) } - /// Wrap [`TenantShard`] reconciliation methods with acquisition of [`Gate`] and [`ReconcileUnits`], + /// Like [`Self::maybe_configured_reconcile_shard`], but uses the default reconciler + /// configuration fn maybe_reconcile_shard( &self, shard: &mut TenantShard, nodes: &Arc>, + ) -> Option { + self.maybe_configured_reconcile_shard(shard, nodes, ReconcilerConfig::default()) + } + + /// Wrap [`TenantShard`] reconciliation methods with acquisition of [`Gate`] and [`ReconcileUnits`], + fn maybe_configured_reconcile_shard( + &self, + shard: &mut TenantShard, + nodes: &Arc>, + reconciler_config: ReconcilerConfig, ) -> Option { let reconcile_needed = shard.get_reconcile_needed(nodes); @@ -4797,7 +5322,7 @@ impl Service { } }; - let Ok(gate_guard) = self.gate.enter() else { + let Ok(gate_guard) = self.reconcilers_gate.enter() else { // Gate closed: we're shutting down, drop out. return None; }; @@ -4806,11 +5331,12 @@ impl Service { &self.result_tx, nodes, &self.compute_hook, + reconciler_config, &self.config, &self.persistence, units, gate_guard, - &self.cancel, + &self.reconcilers_cancel, ) } @@ -4863,7 +5389,7 @@ impl Service { /// we did the split, but are probably better placed elsewhere.
/// - Creating new secondary locations if it improves the spreading of a sharded tenant /// * e.g. after a shard split, some locations will be on the same node (where the split - /// happened), and will probably be better placed elsewhere. + /// happened), and will probably be better placed elsewhere. /// /// To put it more briefly: whereas the scheduler respects soft constraints in a ScheduleContext at /// the time of scheduling, this function looks for cases where a better-scoring location is available @@ -5256,108 +5782,239 @@ impl Service { Ok(std::cmp::max(waiter_count, reconciles_spawned)) } + async fn stop_reconciliations(&self, reason: StopReconciliationsReason) { + // Cancel all on-going reconciles and wait for them to exit the gate. + tracing::info!("{reason}: cancelling and waiting for in-flight reconciles"); + self.reconcilers_cancel.cancel(); + self.reconcilers_gate.close().await; + + // Signal the background loop in [`Service::process_results`] to exit once + // it has processed the results from all the reconciles we cancelled earlier. + tracing::info!("{reason}: processing results from previously in-flight reconciles"); + self.result_tx.send(ReconcileResultRequest::Stop).ok(); + self.result_tx.closed().await; + } + pub async fn shutdown(&self) { - // Note that this already stops processing any results from reconciles: so - // we do not expect that our [`TenantShard`] objects will reach a neat - // final state. + self.stop_reconciliations(StopReconciliationsReason::ShuttingDown) + .await; + + // Background tasks hold gate guards: this notifies them of the cancellation and + // waits for them all to complete. + tracing::info!("Shutting down: cancelling and waiting for background tasks to exit"); self.cancel.cancel(); - - // The cancellation tokens in [`crate::reconciler::Reconciler`] are children - // of our cancellation token, so we do not need to explicitly cancel each of - // them.
- - // Background tasks and reconcilers hold gate guards: this waits for them all - // to complete. self.gate.close().await; } + /// Spot check the download lag for a secondary location of a shard. + /// Should be used as a heuristic, since it's not always precise: the + /// secondary might have not downloaded the new heat map yet and, hence, + /// is not aware of the lag. + /// + /// Returns: + /// * Ok(None) if the lag could not be determined from the status, + /// * Ok(Some(_)) if the lag could be determined, + /// * Err on failures to query the pageserver. + async fn secondary_lag( + &self, + secondary: &NodeId, + tenant_shard_id: TenantShardId, + ) -> Result, mgmt_api::Error> { + let nodes = self.inner.read().unwrap().nodes.clone(); + let node = nodes.get(secondary).ok_or(mgmt_api::Error::ApiError( + StatusCode::NOT_FOUND, + format!("Node with id {} not found", secondary), + ))?; + + match node + .with_client_retries( + |client| async move { client.tenant_secondary_status(tenant_shard_id).await }, + &self.config.jwt_token, + 1, + 3, + Duration::from_millis(250), + &self.cancel, + ) + .await + { + Some(Ok(status)) => match status.heatmap_mtime { + Some(_) => Ok(Some(status.bytes_total - status.bytes_downloaded)), + None => Ok(None), + }, + Some(Err(e)) => Err(e), + None => Err(mgmt_api::Error::Cancelled), + } + } + /// Drain a node by moving the shards attached to it as primaries. /// This is a long running operation and it should run as a separate Tokio task.
pub(crate) async fn drain_node( - &self, + self: &Arc, node_id: NodeId, cancel: CancellationToken, ) -> Result<(), OperationError> { - let mut last_inspected_shard: Option = None; - let mut inspected_all_shards = false; - let mut waiters = Vec::new(); - let mut schedule_context = ScheduleContext::default(); + const MAX_SECONDARY_LAG_BYTES_DEFAULT: u64 = 256 * 1024 * 1024; + let max_secondary_lag_bytes = self + .config + .max_secondary_lag_bytes + .unwrap_or(MAX_SECONDARY_LAG_BYTES_DEFAULT); - while !inspected_all_shards { + // By default, live migrations are generous about the wait time for getting + // the secondary location up to speed. When draining, give up earlier in order + // to not stall the operation when a cold secondary is encountered. + const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20); + const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5); + let reconciler_config = ReconcilerConfigBuilder::new() + .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT) + .secondary_download_request_timeout(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT) + .build(); + + let mut waiters = Vec::new(); + + let mut tid_iter = TenantShardIterator::new({ + let service = self.clone(); + move |last_inspected_shard: Option| { + let locked = &service.inner.read().unwrap(); + let tenants = &locked.tenants; + let entry = match last_inspected_shard { + Some(skip_past) => { + // Skip to the last seen tenant shard id + let mut cursor = tenants.iter().skip_while(|(tid, _)| **tid != skip_past); + + // Skip past the last seen + cursor.nth(1) + } + None => tenants.first_key_value(), + }; + + entry.map(|(tid, _)| tid).copied() + } + }); + + while !tid_iter.finished() { if cancel.is_cancelled() { - return Err(OperationError::Cancelled); + match self + .node_configure(node_id, None, Some(NodeSchedulingPolicy::Active)) + .await + { + Ok(()) => return Err(OperationError::Cancelled), + Err(err) => { + return Err(OperationError::FinalizeError( + format!( + "Failed to 
finalise drain cancel of {} by setting scheduling policy to Active: {}", + node_id, err + ) + .into(), + )); + } + } } - { - let mut locked = self.inner.write().unwrap(); - let (nodes, tenants, scheduler) = locked.parts_mut(); + drain_utils::validate_node_state(&node_id, self.inner.read().unwrap().nodes.clone())?; - let node = nodes.get(&node_id).ok_or(OperationError::NodeStateChanged( - format!("node {node_id} was removed").into(), - ))?; + while waiters.len() < MAX_RECONCILES_PER_OPERATION { + let tid = match tid_iter.next() { + Some(tid) => tid, + None => { + break; + } + }; - let current_policy = node.get_scheduling(); - if !matches!(current_policy, NodeSchedulingPolicy::Draining) { - // TODO(vlad): maybe cancel pending reconciles before erroring out. need to think - // about it - return Err(OperationError::NodeStateChanged( - format!("node {node_id} changed state to {current_policy:?}").into(), - )); + let tid_drain = TenantShardDrain { + drained_node: node_id, + tenant_shard_id: tid, + }; + + let dest_node_id = { + let locked = self.inner.read().unwrap(); + + match tid_drain + .tenant_shard_eligible_for_drain(&locked.tenants, &locked.scheduler) + { + Some(node_id) => node_id, + None => { + continue; + } + } + }; + + match self.secondary_lag(&dest_node_id, tid).await { + Ok(Some(lag)) if lag <= max_secondary_lag_bytes => { + // The secondary is reasonably up to date. + // Migrate to it + } + Ok(Some(lag)) => { + tracing::info!( + tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), + "Secondary on node {dest_node_id} is lagging by {lag}. Skipping reconcile." + ); + continue; + } + Ok(None) => { + tracing::info!( + tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), + "Could not determine lag for secondary on node {dest_node_id}. Skipping reconcile." + ); + continue; + } + Err(err) => { + tracing::warn!( + tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), + "Failed to get secondary lag from node {dest_node_id}. 
Skipping reconcile: {err}" + ); + continue; + } } - let mut cursor = tenants.iter_mut().skip_while({ - let skip_past = last_inspected_shard; - move |(tid, _)| match skip_past { - Some(last) => **tid != last, - None => false, - } - }); + { + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + let rescheduled = tid_drain.reschedule_to_secondary( + dest_node_id, + tenants, + scheduler, + nodes, + )?; - while waiters.len() < MAX_RECONCILES_PER_OPERATION { - let (tid, tenant_shard) = match cursor.next() { - Some(some) => some, - None => { - inspected_all_shards = true; - break; - } - }; - - if tenant_shard.intent.demote_attached(scheduler, node_id) { - match tenant_shard.schedule(scheduler, &mut schedule_context) { - Err(e) => { - tracing::warn!( - tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), - "Scheduling error when draining pageserver {} : {e}", node_id - ); - } - Ok(()) => { - let scheduled_to = tenant_shard.intent.get_attached(); - tracing::info!( - tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), - "Rescheduled shard while draining node {}: {} -> {:?}", - node_id, - node_id, - scheduled_to - ); - - let waiter = self.maybe_reconcile_shard(tenant_shard, nodes); - if let Some(some) = waiter { - waiters.push(some); - } - } + if let Some(tenant_shard) = rescheduled { + let waiter = self.maybe_configured_reconcile_shard( + tenant_shard, + nodes, + reconciler_config, + ); + if let Some(some) = waiter { + waiters.push(some); } } - - last_inspected_shard = Some(*tid); } } waiters = self .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT) .await; + + failpoint_support::sleep_millis_async!("sleepy-drain-loop", &cancel); } while !waiters.is_empty() { + if cancel.is_cancelled() { + match self + .node_configure(node_id, None, Some(NodeSchedulingPolicy::Active)) + .await + { + Ok(()) => return Err(OperationError::Cancelled), + Err(err) => { + return Err(OperationError::FinalizeError( + format!( + "Failed to 
finalise drain cancel of {} by setting scheduling policy to Active: {}", + node_id, err + ) + .into(), + )); + } + } + } + tracing::info!("Awaiting {} pending drain reconciliations", waiters.len()); waiters = self @@ -5389,11 +6046,14 @@ impl Service { /// Create a node fill plan (pick secondaries to promote) that meets the following requirements: /// 1. The node should be filled until it reaches the expected cluster average of - /// attached shards. If there are not enough secondaries on the node, the plan stops early. + /// attached shards. If there are not enough secondaries on the node, the plan stops early. /// 2. Select tenant shards to promote such that the number of attached shards is balanced - /// throughout the cluster. We achieve this by picking tenant shards from each node, - /// starting from the ones with the largest number of attached shards, until the node - /// reaches the expected cluster average. + /// throughout the cluster. We achieve this by picking tenant shards from each node, + /// starting from the ones with the largest number of attached shards, until the node + /// reaches the expected cluster average. + /// 3. Avoid promoting more shards of the same tenant than required. The upper bound + /// for the number of shards from the same tenant promoted to the node being filled is: + /// shard count for the tenant divided by the number of nodes in the cluster.
fn fill_node_plan(&self, node_id: NodeId) -> Vec { let mut locked = self.inner.write().unwrap(); let fill_requirement = locked.scheduler.compute_fill_requirement(node_id); @@ -5415,8 +6075,18 @@ impl Service { let expected_attached = locked.scheduler.expected_attached_shard_count(); let nodes_by_load = locked.scheduler.nodes_by_attached_shard_count(); + let mut promoted_per_tenant: HashMap = HashMap::new(); let mut plan = Vec::new(); + for (node_id, attached) in nodes_by_load { + let available = locked + .nodes + .get(&node_id) + .map_or(false, |n| n.is_available()); + if !available { + continue; + } + if plan.len() >= fill_requirement || tids_by_node.is_empty() || attached <= expected_attached @@ -5425,12 +6095,24 @@ impl Service { } let can_take = attached - expected_attached; + let needed = fill_requirement - plan.len(); + let mut take = std::cmp::min(can_take, needed); + let mut remove_node = false; - for _ in 0..can_take { + while take > 0 { match tids_by_node.get_mut(&node_id) { Some(tids) => match tids.pop() { Some(tid) => { - plan.push(tid); + let max_promote_for_tenant = std::cmp::max( + tid.shard_count.count() as usize / locked.nodes.len(), + 1, + ); + let promoted = promoted_per_tenant.entry(tid.tenant_id).or_default(); + if *promoted < max_promote_for_tenant { + plan.push(tid); + *promoted += 1; + take -= 1; + } } None => { remove_node = true; @@ -5464,15 +6146,27 @@ impl Service { // secondaries are warm. This is not always true (e.g. we just migrated the // tenant). Take that into consideration by checking the secondary status. let mut tids_to_promote = self.fill_node_plan(node_id); - let mut waiters = Vec::new(); - let mut schedule_context = ScheduleContext::default(); // Execute the plan we've composed above. Before aplying each move from the plan, // we validate to ensure that it has not gone stale in the meantime. 
while !tids_to_promote.is_empty() { if cancel.is_cancelled() { - return Err(OperationError::Cancelled); + match self + .node_configure(node_id, None, Some(NodeSchedulingPolicy::Active)) + .await + { + Ok(()) => return Err(OperationError::Cancelled), + Err(err) => { + return Err(OperationError::FinalizeError( + format!( + "Failed to finalise fill cancel of {} by setting scheduling policy to Active: {}", + node_id, err + ) + .into(), + )); + } + } } { @@ -5502,9 +6196,7 @@ impl Service { } let previously_attached_to = *tenant_shard.intent.get_attached(); - - tenant_shard.intent.promote_attached(scheduler, node_id); - match tenant_shard.schedule(scheduler, &mut schedule_context) { + match tenant_shard.reschedule_to_secondary(Some(node_id), scheduler) { Err(e) => { tracing::warn!( tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), @@ -5540,6 +6232,24 @@ impl Service { } while !waiters.is_empty() { + if cancel.is_cancelled() { + match self + .node_configure(node_id, None, Some(NodeSchedulingPolicy::Active)) + .await + { + Ok(()) => return Err(OperationError::Cancelled), + Err(err) => { + return Err(OperationError::FinalizeError( + format!( + "Failed to finalise fill cancel of {} by setting scheduling policy to Active: {}", + node_id, err + ) + .into(), + )); + } + } + } + tracing::info!("Awaiting {} pending fill reconciliations", waiters.len()); waiters = self @@ -5562,4 +6272,90 @@ impl Service { Ok(()) } + + /// Updates scrubber metadata health check results. + pub(crate) async fn metadata_health_update( + &self, + update_req: MetadataHealthUpdateRequest, + ) -> Result<(), ApiError> { + let now = chrono::offset::Utc::now(); + let (healthy_records, unhealthy_records) = { + let locked = self.inner.read().unwrap(); + let healthy_records = update_req + .healthy_tenant_shards + .into_iter() + // Retain only health records associated with tenant shards managed by storage controller.
+ .filter(|tenant_shard_id| locked.tenants.contains_key(tenant_shard_id)) + .map(|tenant_shard_id| MetadataHealthPersistence::new(tenant_shard_id, true, now)) + .collect(); + let unhealthy_records = update_req + .unhealthy_tenant_shards + .into_iter() + .filter(|tenant_shard_id| locked.tenants.contains_key(tenant_shard_id)) + .map(|tenant_shard_id| MetadataHealthPersistence::new(tenant_shard_id, false, now)) + .collect(); + + (healthy_records, unhealthy_records) + }; + + self.persistence + .update_metadata_health_records(healthy_records, unhealthy_records, now) + .await?; + Ok(()) + } + + /// Lists the tenant shards that have unhealthy metadata status. + pub(crate) async fn metadata_health_list_unhealthy( + &self, + ) -> Result, ApiError> { + let result = self + .persistence + .list_unhealthy_metadata_health_records() + .await? + .iter() + .map(|p| p.get_tenant_shard_id().unwrap()) + .collect(); + + Ok(result) + } + + /// Lists the tenant shards that have not been scrubbed for some duration. + pub(crate) async fn metadata_health_list_outdated( + &self, + not_scrubbed_for: Duration, + ) -> Result, ApiError> { + let earlier = chrono::offset::Utc::now() - not_scrubbed_for; + let result = self + .persistence + .list_outdated_metadata_health_records(earlier) + .await? + .into_iter() + .map(|record| record.into()) + .collect(); + Ok(result) + } + + pub(crate) fn get_leadership_status(&self) -> LeadershipStatus { + self.inner.read().unwrap().get_leadership_status() + } + + pub(crate) async fn step_down(&self) -> GlobalObservedState { + tracing::info!("Received step down request from peer"); + failpoint_support::sleep_millis_async!("sleep-on-step-down-handling"); + + self.inner.write().unwrap().step_down(); + // TODO: would it make sense to have a time-out for this?
+ self.stop_reconciliations(StopReconciliationsReason::SteppingDown) + .await; + + let mut global_observed = GlobalObservedState::default(); + let locked = self.inner.read().unwrap(); + for (tid, tenant_shard) in locked.tenants.iter() { + global_observed + .0 + .insert(*tid, tenant_shard.observed.clone()); + } + + global_observed + } } diff --git a/storage_controller/src/service/chaos_injector.rs b/storage_controller/src/service/chaos_injector.rs new file mode 100644 index 0000000000..99961d691c --- /dev/null +++ b/storage_controller/src/service/chaos_injector.rs @@ -0,0 +1,71 @@ +use std::{sync::Arc, time::Duration}; + +use rand::seq::SliceRandom; +use rand::thread_rng; +use tokio_util::sync::CancellationToken; + +use super::Service; + +pub struct ChaosInjector { + service: Arc, + interval: Duration, +} + +impl ChaosInjector { + pub fn new(service: Arc, interval: Duration) -> Self { + Self { service, interval } + } + + pub async fn run(&mut self, cancel: CancellationToken) { + let mut interval = tokio::time::interval(self.interval); + + loop { + tokio::select! 
{ + _ = interval.tick() => {} + _ = cancel.cancelled() => { + tracing::info!("Shutting down"); + return; + } + } + + self.inject_chaos().await; + + tracing::info!("Chaos iteration..."); + } + } + + async fn inject_chaos(&mut self) { + // Pick some shards to interfere with + let batch_size = 128; + let mut inner = self.service.inner.write().unwrap(); + let (nodes, tenants, scheduler) = inner.parts_mut(); + let tenant_ids = tenants.keys().cloned().collect::>(); + let victims = tenant_ids.choose_multiple(&mut thread_rng(), batch_size); + + for victim in victims { + let shard = tenants + .get_mut(victim) + .expect("Held lock between choosing ID and this get"); + + // Pick a secondary to promote + let Some(new_location) = shard + .intent + .get_secondary() + .choose(&mut thread_rng()) + .cloned() + else { + tracing::info!("Skipping shard {victim}: no secondary location, can't migrate"); + continue; + }; + + let Some(old_location) = *shard.intent.get_attached() else { + tracing::info!("Skipping shard {victim}: currently has no attached location"); + continue; + }; + + shard.intent.demote_attached(scheduler, old_location); + shard.intent.promote_attached(scheduler, new_location); + self.service.maybe_reconcile_shard(shard, nodes); + } + } +} diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index d1b632755f..1fcc3c8547 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -7,8 +7,9 @@ use std::{ use crate::{ metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome}, persistence::TenantShardPersistence, - reconciler::ReconcileUnits, + reconciler::{ReconcileUnits, ReconcilerConfig}, scheduler::{AffinityScore, MaySchedule, RefCountUpdate, ScheduleContext}, + service::ReconcileResultRequest, }; use pageserver_api::controller_api::{ NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, @@ -17,7 +18,7 @@ use pageserver_api::{ models::{LocationConfig, LocationConfigMode, 
TenantConfig}, shard::{ShardIdentity, TenantShardId}, }; -use serde::Serialize; +use serde::{Deserialize, Serialize}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::{instrument, Instrument}; @@ -124,6 +125,7 @@ pub(crate) struct TenantShard { /// - ReconcileWaiters need to Arc-clone the overall object to read it later /// - ReconcileWaitError needs to use an `Arc` because we can construct /// many waiters for one shard, and the underlying error types are not Clone. + /// /// TODO: generalize to an array of recent events /// TOOD: use a ArcSwap instead of mutex for faster reads? #[serde(serialize_with = "read_last_error")] @@ -282,7 +284,7 @@ impl Drop for IntentState { } } -#[derive(Default, Clone, Serialize)] +#[derive(Default, Clone, Serialize, Deserialize, Debug)] pub(crate) struct ObservedState { pub(crate) locations: HashMap, } @@ -296,7 +298,7 @@ pub(crate) struct ObservedState { /// what it is (e.g. we failed partway through configuring it) /// * Instance exists with conf==Some: this tells us what we last successfully configured on this node, /// and that configuration will still be present unless something external interfered. -#[derive(Clone, Serialize)] +#[derive(Clone, Serialize, Deserialize, Debug)] pub(crate) struct ObservedStateLocation { /// If None, it means we do not know the status of this shard's location on this node, but /// we know that we might have some state on this node. 
@@ -383,9 +385,9 @@ impl ReconcilerWaiter { } pub(crate) fn get_status(&self) -> ReconcilerStatus { - if self.seq_wait.would_wait_for(self.seq).is_err() { + if self.seq_wait.would_wait_for(self.seq).is_ok() { ReconcilerStatus::Done - } else if self.error_seq_wait.would_wait_for(self.seq).is_err() { + } else if self.error_seq_wait.would_wait_for(self.seq).is_ok() { ReconcilerStatus::Failed } else { ReconcilerStatus::InProgress @@ -646,6 +648,48 @@ impl TenantShard { Ok(()) } + /// Reschedule this tenant shard to one of its secondary locations. Returns a scheduling error + /// if the swap is not possible and leaves the intent state in its original state. + /// + /// Arguments: + /// `attached_to`: the currently attached location matching the intent state (may be None if the + /// shard is not attached) + /// `promote_to`: an optional secondary location of this tenant shard. If set to None, we ask + /// the scheduler to recommend a node + pub(crate) fn reschedule_to_secondary( + &mut self, + promote_to: Option, + scheduler: &mut Scheduler, + ) -> Result<(), ScheduleError> { + let promote_to = match promote_to { + Some(node) => node, + None => match scheduler.node_preferred(self.intent.get_secondary()) { + Some(node) => node, + None => { + return Err(ScheduleError::ImpossibleConstraint); + } + }, + }; + + assert!(self.intent.get_secondary().contains(&promote_to)); + + if let Some(node) = self.intent.get_attached() { + let demoted = self.intent.demote_attached(scheduler, *node); + if !demoted { + return Err(ScheduleError::ImpossibleConstraint); + } + } + + self.intent.promote_attached(scheduler, promote_to); + + // Increment the sequence number for the edge case where a + // reconciler is already running to avoid waiting on the + // current reconcile instead of spawning a new one. 
+ self.sequence = self.sequence.next(); + + Ok(()) + } + /// Optimize attachments: if a shard has a secondary location that is preferable to /// its primary location based on soft constraints, switch that secondary location /// to be attached. @@ -866,12 +910,8 @@ impl TenantShard { .generation .expect("Attempted to enter attached state without a generation"); - let wanted_conf = attached_location_conf( - generation, - &self.shard, - &self.config, - !self.intent.secondary.is_empty(), - ); + let wanted_conf = + attached_location_conf(generation, &self.shard, &self.config, &self.policy); match self.observed.locations.get(&node_id) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {} Some(_) | None => { @@ -1020,9 +1060,10 @@ impl TenantShard { #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] pub(crate) fn spawn_reconciler( &mut self, - result_tx: &tokio::sync::mpsc::UnboundedSender, + result_tx: &tokio::sync::mpsc::UnboundedSender, pageservers: &Arc>, compute_hook: &Arc, + reconciler_config: ReconcilerConfig, service_config: &service::Config, persistence: &Arc, units: ReconcileUnits, @@ -1057,9 +1098,11 @@ impl TenantShard { let mut reconciler = Reconciler { tenant_shard_id: self.tenant_shard_id, shard: self.shard, + placement_policy: self.policy.clone(), generation: self.generation, intent: reconciler_intent, detach, + reconciler_config, config: self.config.clone(), observed: self.observed.clone(), compute_hook: compute_hook.clone(), @@ -1143,7 +1186,9 @@ impl TenantShard { pending_compute_notification: reconciler.compute_notify_failure, }; - result_tx.send(result).ok(); + result_tx + .send(ReconcileResultRequest::ReconcileResult(result)) + .ok(); } .instrument(reconciler_span), ); @@ -1190,18 +1235,27 @@ impl TenantShard { } } - // If we had any state at all referring to this node ID, drop it. Does not - // attempt to reschedule. 
- pub(crate) fn deref_node(&mut self, node_id: NodeId) { + /// If we had any state at all referring to this node ID, drop it. Does not + /// attempt to reschedule. + /// + /// Returns true if we modified the node's intent state. + pub(crate) fn deref_node(&mut self, node_id: NodeId) -> bool { + let mut intent_modified = false; + + // Drop if this node was our attached intent if self.intent.attached == Some(node_id) { self.intent.attached = None; + intent_modified = true; } + // Drop from the list of secondaries, and check if we modified it + let had_secondaries = self.intent.secondary.len(); self.intent.secondary.retain(|n| n != &node_id); - - self.observed.locations.remove(&node_id); + intent_modified |= self.intent.secondary.len() != had_secondaries; debug_assert!(!self.intent.all_pageservers().contains(&node_id)); + + intent_modified } pub(crate) fn set_scheduling_policy(&mut self, p: ShardSchedulingPolicy) { @@ -1632,14 +1686,10 @@ pub(crate) mod tests { // We should see equal number of locations on the two nodes. assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4); - // Scheduling does not consider the number of attachments picking the initial - // pageserver to attach to (hence the assertion that all primaries are on the - // same node) - // TODO: Tweak the scheduling to evenly distribute attachments for new shards. 
- assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 4); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 2); assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4); - assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 0); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 2); // Add another two nodes: we should see the shards spread out when their optimize // methods are called diff --git a/storage_scrubber/Cargo.toml b/storage_scrubber/Cargo.toml index 050be66483..d19119990b 100644 --- a/storage_scrubber/Cargo.toml +++ b/storage_scrubber/Cargo.toml @@ -10,6 +10,7 @@ aws-smithy-async.workspace = true either.workspace = true tokio-rustls.workspace = true anyhow.workspace = true +git-version.workspace = true hex.workspace = true humantime.workspace = true thiserror.workspace = true @@ -34,6 +35,7 @@ camino.workspace = true rustls.workspace = true rustls-native-certs.workspace = true once_cell.workspace = true +storage_controller_client.workspace = true tokio = { workspace = true, features = ["macros", "rt-multi-thread"] } chrono = { workspace = true, default-features = false, features = ["clock", "serde"] } @@ -48,6 +50,5 @@ tracing.workspace = true tracing-subscriber.workspace = true clap.workspace = true tracing-appender = "0.2" -histogram = "0.7" futures.workspace = true diff --git a/storage_scrubber/README.md b/storage_scrubber/README.md index 0930f343ec..9fbd92feef 100644 --- a/storage_scrubber/README.md +++ b/storage_scrubber/README.md @@ -45,7 +45,11 @@ processing by the `purge-garbage` subcommand. 
Example: -`env AWS_PROFILE=dev REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- find-garbage --node-kind=pageserver --depth=tenant --output-path=eu-west-1-garbage.json` +`env AWS_PROFILE=dev REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=[client_key] CLOUD_ADMIN_API_URL=[url] cargo run --release -- find-garbage --node-kind=pageserver --depth=tenant --output-path=eu-west-1-garbage.json` + +Note that `CLOUD_ADMIN_API_TOKEN` can be obtained from https://console-stage.neon.build/app/settings/api-keys (for staging) or https://console.neon.tech/app/settings/api-keys for production. This is not the control plane admin JWT key. The env var name is confusing. Though anyone can generate that API key, you still need admin permission in order to access all projects in the region. + +And note that `CLOUD_ADMIN_API_URL` should include the region in the admin URL due to the control plane / console split. For example, `https://console-stage.neon.build/regions/aws-us-east-2/api/v1/admin` for the staging us-east-2 region. #### `purge-garbage` @@ -61,7 +65,7 @@ to pass them on the command line Example: -`env AWS_PROFILE=dev cargo run --release -- purge-garbage --node-kind=pageserver --depth=tenant --input-path=eu-west-1-garbage.json` +`env AWS_PROFILE=dev cargo run --release -- purge-garbage --input-path=eu-west-1-garbage.json` Add the `--delete` argument before `purge-garbage` to enable deletion. This is intentionally not provided inline in the example above to avoid accidents. 
Without the `--delete` flag diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index 4eb8580e32..b35838bcf7 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -1,9 +1,10 @@ use std::collections::{HashMap, HashSet}; use anyhow::Context; -use aws_sdk_s3::Client; +use pageserver::tenant::layer_map::LayerMap; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver_api::shard::ShardIndex; +use tokio_util::sync::CancellationToken; use tracing::{error, info, warn}; use utils::generation::Generation; use utils::id::TimelineId; @@ -12,10 +13,10 @@ use crate::cloud_admin_api::BranchData; use crate::metadata_stream::stream_listing; use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId}; use futures_util::StreamExt; -use pageserver::tenant::remote_timeline_client::parse_remote_index_path; +use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path}; use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::IndexPart; -use remote_storage::RemotePath; +use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath}; pub(crate) struct TimelineAnalysis { /// Anomalies detected @@ -39,14 +40,20 @@ impl TimelineAnalysis { garbage_keys: Vec::new(), } } + + /// Whether a timeline is healthy. 
+ pub(crate) fn is_healthy(&self) -> bool { + self.errors.is_empty() && self.warnings.is_empty() + } } -pub(crate) fn branch_cleanup_and_check_errors( +pub(crate) async fn branch_cleanup_and_check_errors( + remote_client: &GenericRemoteStorage, id: &TenantShardTimelineId, tenant_objects: &mut TenantObjectListing, s3_active_branch: Option<&BranchData>, console_branch: Option, - s3_data: Option, + s3_data: Option, ) -> TimelineAnalysis { let mut result = TimelineAnalysis::new(); @@ -70,7 +77,9 @@ pub(crate) fn branch_cleanup_and_check_errors( match s3_data { Some(s3_data) => { - result.garbage_keys.extend(s3_data.unknown_keys); + result + .garbage_keys + .extend(s3_data.unknown_keys.into_iter().map(|k| k.key.to_string())); match s3_data.blob_data { BlobDataParseResult::Parsed { @@ -84,16 +93,19 @@ pub(crate) fn branch_cleanup_and_check_errors( .push(format!("index_part.json version: {}", index_part.version())) } - if &index_part.version() != IndexPart::KNOWN_VERSIONS.last().unwrap() { - result.warnings.push(format!( + let mut newest_versions = IndexPart::KNOWN_VERSIONS.iter().rev().take(3); + if !newest_versions.any(|ip| ip == &index_part.version()) { + info!( "index_part.json version is not latest: {}", index_part.version() - )) + ); } if index_part.metadata.disk_consistent_lsn() != index_part.duplicated_disk_consistent_lsn() { + // Tech debt: let's get rid of one of these, they are redundant + // https://github.com/neondatabase/neon/issues/8343 result.errors.push(format!( "Mismatching disk_consistent_lsn in TimelineMetadata ({}) and in the index_part ({})", index_part.metadata.disk_consistent_lsn(), @@ -102,8 +114,16 @@ pub(crate) fn branch_cleanup_and_check_errors( } if index_part.layer_metadata.is_empty() { - // not an error, can happen for branches with zero writes, but notice that - info!("index_part.json has no layers"); + if index_part.metadata.ancestor_timeline().is_none() { + // The initial timeline with no ancestor should ALWAYS have layers. 
+ result.errors.push( + "index_part.json has no layers (ancestor_timeline=None)" + .to_string(), + ); + } else { + // Not an error, can happen for branches with zero writes, but notice that + info!("index_part.json has no layers (ancestor_timeline exists)"); + } } for (layer, metadata) in index_part.layer_metadata { @@ -114,22 +134,47 @@ pub(crate) fn branch_cleanup_and_check_errors( } if !tenant_objects.check_ref(id.timeline_id, &layer, &metadata) { - // FIXME: this will emit false positives if an index was - // uploaded concurrently with our scan. To make this check - // correct, we need to try sending a HEAD request for the - // layer we think is missing. - result.errors.push(format!( - "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage", - layer, - metadata.generation.get_suffix(), - metadata.shard - )) + let path = remote_layer_path( + &id.tenant_shard_id.tenant_id, + &id.timeline_id, + metadata.shard, + &layer, + metadata.generation, + ); + + // HEAD request used here to address a race condition when an index was uploaded concurrently + // with our scan. We check if the object is uploaded to S3 after taking the listing snapshot. + let response = remote_client + .head_object(&path, &CancellationToken::new()) + .await; + + if response.is_err() { + // Object is not present. 
+ let is_l0 = LayerMap::is_l0(layer.key_range()); + + let msg = format!( + "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {})", + layer, + metadata.generation.get_suffix(), + metadata.shard, + is_l0, + ); + + if is_l0 { + result.warnings.push(msg); + } else { + result.errors.push(msg); + } + } } } } BlobDataParseResult::Relic => {} - BlobDataParseResult::Incorrect(parse_errors) => result.errors.extend( - parse_errors + BlobDataParseResult::Incorrect { + errors, + s3_layers: _, + } => result.errors.extend( + errors .into_iter() .map(|error| format!("parse error: {error}")), ), @@ -237,14 +282,14 @@ impl TenantObjectListing { } #[derive(Debug)] -pub(crate) struct S3TimelineBlobData { +pub(crate) struct RemoteTimelineBlobData { pub(crate) blob_data: BlobDataParseResult, // Index objects that were not used when loading `blob_data`, e.g. those from old generations - pub(crate) unused_index_keys: Vec, + pub(crate) unused_index_keys: Vec, // Objects whose keys were not recognized at all, i.e. not layer files, not indices - pub(crate) unknown_keys: Vec, + pub(crate) unknown_keys: Vec, } #[derive(Debug)] @@ -256,10 +301,13 @@ pub(crate) enum BlobDataParseResult { }, /// The remains of a deleted Timeline (i.e. an initdb archive only) Relic, - Incorrect(Vec), + Incorrect { + errors: Vec, + s3_layers: HashSet<(LayerName, Generation)>, + }, } -fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String> { +pub(crate) fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String> { match name.rsplit_once('-') { // FIXME: this is gross, just use a regex? 
Some((layer_filename, gen)) if gen.len() == 8 => { @@ -273,39 +321,48 @@ fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String } pub(crate) async fn list_timeline_blobs( - s3_client: &Client, + remote_client: &GenericRemoteStorage, id: TenantShardTimelineId, - s3_root: &RootTarget, -) -> anyhow::Result { + root_target: &RootTarget, +) -> anyhow::Result { let mut s3_layers = HashSet::new(); let mut errors = Vec::new(); let mut unknown_keys = Vec::new(); - let mut timeline_dir_target = s3_root.timeline_root(&id); + let mut timeline_dir_target = root_target.timeline_root(&id); timeline_dir_target.delimiter = String::new(); - let mut index_part_keys: Vec = Vec::new(); + let mut index_part_keys: Vec = Vec::new(); let mut initdb_archive: bool = false; - let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target)); - while let Some(obj) = stream.next().await { - let obj = obj?; - let key = obj.key(); + let prefix_str = &timeline_dir_target + .prefix_in_bucket + .strip_prefix("/") + .unwrap_or(&timeline_dir_target.prefix_in_bucket); - let blob_name = key.strip_prefix(&timeline_dir_target.prefix_in_bucket); + let mut stream = std::pin::pin!(stream_listing(remote_client, &timeline_dir_target)); + while let Some(obj) = stream.next().await { + let (key, Some(obj)) = obj? 
else { + panic!("ListingObject not specified"); + }; + + let blob_name = key.get_path().as_str().strip_prefix(prefix_str); match blob_name { Some(name) if name.starts_with("index_part.json") => { tracing::debug!("Index key {key}"); - index_part_keys.push(key.to_owned()) + index_part_keys.push(obj) } Some("initdb.tar.zst") => { tracing::debug!("initdb archive {key}"); initdb_archive = true; } + Some("initdb-preserved.tar.zst") => { + tracing::info!("initdb archive preserved {key}"); + } Some(maybe_layer_name) => match parse_layer_object_name(maybe_layer_name) { Ok((new_layer, gen)) => { - tracing::debug!("Parsed layer key: {} {:?}", new_layer, gen); + tracing::debug!("Parsed layer key: {new_layer} {gen:?}"); s3_layers.insert((new_layer, gen)); } Err(e) => { @@ -313,13 +370,13 @@ pub(crate) async fn list_timeline_blobs( errors.push( format!("S3 list response got an object with key {key} that is not a layer name: {e}"), ); - unknown_keys.push(key.to_string()); + unknown_keys.push(obj); } }, None => { - tracing::warn!("Unknown key {}", key); + tracing::warn!("Unknown key {key}"); errors.push(format!("S3 list response got an object with odd key {key}")); - unknown_keys.push(key.to_string()); + unknown_keys.push(obj); } } } @@ -328,7 +385,7 @@ pub(crate) async fn list_timeline_blobs( tracing::debug!( "Timeline is empty apart from initdb archive: expected post-deletion state." ); - return Ok(S3TimelineBlobData { + return Ok(RemoteTimelineBlobData { blob_data: BlobDataParseResult::Relic, unused_index_keys: index_part_keys, unknown_keys: Vec::new(), @@ -342,13 +399,13 @@ pub(crate) async fn list_timeline_blobs( // Stripping the index key to the last part, because RemotePath doesn't // like absolute paths, and depending on prefix_in_bucket it's possible // for the keys we read back to start with a slash. 
- let basename = key.rsplit_once('/').unwrap().1; + let basename = key.key.get_path().as_str().rsplit_once('/').unwrap().1; parse_remote_index_path(RemotePath::from_string(basename).unwrap()).map(|g| (key, g)) }) .max_by_key(|i| i.1) .map(|(k, g)| (k.clone(), g)) { - Some((key, gen)) => (Some(key), gen), + Some((key, gen)) => (Some::(key.to_owned()), gen), None => { // Legacy/missing case: one or zero index parts, which did not have a generation (index_part_keys.pop(), Generation::none()) @@ -363,17 +420,14 @@ pub(crate) async fn list_timeline_blobs( } if let Some(index_part_object_key) = index_part_object.as_ref() { - let index_part_bytes = download_object_with_retries( - s3_client, - &timeline_dir_target.bucket_name, - index_part_object_key, - ) - .await - .context("index_part.json download")?; + let index_part_bytes = + download_object_with_retries(remote_client, &index_part_object_key.key) + .await + .context("index_part.json download")?; match serde_json::from_slice(&index_part_bytes) { Ok(index_part) => { - return Ok(S3TimelineBlobData { + return Ok(RemoteTimelineBlobData { blob_data: BlobDataParseResult::Parsed { index_part: Box::new(index_part), index_part_generation, @@ -395,8 +449,8 @@ pub(crate) async fn list_timeline_blobs( ); } - Ok(S3TimelineBlobData { - blob_data: BlobDataParseResult::Incorrect(errors), + Ok(RemoteTimelineBlobData { + blob_data: BlobDataParseResult::Incorrect { errors, s3_layers }, unused_index_keys: index_part_keys, unknown_keys, }) diff --git a/storage_scrubber/src/find_large_objects.rs b/storage_scrubber/src/find_large_objects.rs new file mode 100644 index 0000000000..88e36af560 --- /dev/null +++ b/storage_scrubber/src/find_large_objects.rs @@ -0,0 +1,115 @@ +use std::pin::pin; + +use futures::{StreamExt, TryStreamExt}; +use pageserver::tenant::storage_layer::LayerName; +use remote_storage::ListingMode; +use serde::{Deserialize, Serialize}; + +use crate::{ + checks::parse_layer_object_name, init_remote, 
metadata_stream::stream_tenants, + stream_objects_with_retries, BucketConfig, NodeKind, +}; + +#[derive(Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] +enum LargeObjectKind { + DeltaLayer, + ImageLayer, + Other, +} + +impl LargeObjectKind { + fn from_key(key: &str) -> Self { + let fname = key.split('/').last().unwrap(); + + let Ok((layer_name, _generation)) = parse_layer_object_name(fname) else { + return LargeObjectKind::Other; + }; + + match layer_name { + LayerName::Image(_) => LargeObjectKind::ImageLayer, + LayerName::Delta(_) => LargeObjectKind::DeltaLayer, + } + } +} + +#[derive(Serialize, Deserialize, Clone)] +pub struct LargeObject { + pub key: String, + pub size: u64, + kind: LargeObjectKind, +} + +#[derive(Serialize, Deserialize)] +pub struct LargeObjectListing { + pub objects: Vec, +} + +pub async fn find_large_objects( + bucket_config: BucketConfig, + min_size: u64, + ignore_deltas: bool, + concurrency: usize, +) -> anyhow::Result { + let (remote_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?; + let tenants = pin!(stream_tenants(&remote_client, &target)); + + let objects_stream = tenants.map_ok(|tenant_shard_id| { + let mut tenant_root = target.tenant_root(&tenant_shard_id); + let remote_client = remote_client.clone(); + async move { + let mut objects = Vec::new(); + let mut total_objects_ctr = 0u64; + // We want the objects and not just common prefixes + tenant_root.delimiter.clear(); + let mut objects_stream = pin!(stream_objects_with_retries( + &remote_client, + ListingMode::NoDelimiter, + &tenant_root + )); + while let Some(listing) = objects_stream.next().await { + let listing = listing?; + for obj in listing.keys.iter().filter(|obj| min_size <= obj.size) { + let key = obj.key.to_string(); + let kind = LargeObjectKind::from_key(&key); + if ignore_deltas && kind == LargeObjectKind::DeltaLayer { + continue; + } + objects.push(LargeObject { + key, + size: obj.size, + kind, + }) + } + total_objects_ctr += 
listing.keys.len() as u64; + } + + Ok((tenant_shard_id, objects, total_objects_ctr)) + } + }); + let mut objects_stream = std::pin::pin!(objects_stream.try_buffer_unordered(concurrency)); + + let mut objects = Vec::new(); + + let mut tenant_ctr = 0u64; + let mut object_ctr = 0u64; + while let Some(res) = objects_stream.next().await { + let (tenant_shard_id, objects_slice, total_objects_ctr) = res?; + objects.extend_from_slice(&objects_slice); + + object_ctr += total_objects_ctr; + tenant_ctr += 1; + if tenant_ctr % 100 == 0 { + tracing::info!( + "Scanned {tenant_ctr} shards. objects={object_ctr}, found={}, current={tenant_shard_id}.", + objects.len() + ); + } + } + + let bucket_name = target.bucket_name(); + tracing::info!( + "Scan of {bucket_name} finished. Scanned {tenant_ctr} shards. objects={object_ctr}, found={}.", + objects.len() + ); + Ok(LargeObjectListing { objects }) +} diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs index ce0ff10ec6..3e22960f8d 100644 --- a/storage_scrubber/src/garbage.rs +++ b/storage_scrubber/src/garbage.rs @@ -5,30 +5,34 @@ use std::{ collections::{HashMap, HashSet}, sync::Arc, + time::Duration, }; use anyhow::Context; -use aws_sdk_s3::{ - types::{Delete, ObjectIdentifier}, - Client, -}; use futures_util::TryStreamExt; use pageserver_api::shard::TenantShardId; +use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePath}; use serde::{Deserialize, Serialize}; use tokio_stream::StreamExt; +use tokio_util::sync::CancellationToken; use utils::id::TenantId; use crate::{ cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData}, - init_remote, - metadata_stream::{stream_listing, stream_tenant_timelines, stream_tenants}, - BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId, TraversingDepth, + init_remote, list_objects_with_retries, + metadata_stream::{stream_tenant_timelines, stream_tenants}, + BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, 
TraversingDepth, }; #[derive(Serialize, Deserialize, Debug)] enum GarbageReason { DeletedInConsole, MissingInConsole, + + // The remaining data relates to a known deletion issue, and we're sure that purging this + // will not delete any real data, for example https://github.com/neondatabase/neon/pull/7928 where + // there is nothing in a tenant path apart from a heatmap file. + KnownBug, } #[derive(Serialize, Deserialize, Debug)] @@ -74,6 +78,15 @@ impl GarbageList { } } + /// If an entity has been identified as requiring purge due to a known bug, e.g. + /// a particular type of object left behind after an incomplete deletion. + fn append_buggy(&mut self, entity: GarbageEntity) { + self.items.push(GarbageItem { + entity, + reason: GarbageReason::KnownBug, + }); + } + /// Return true if appended, false if not. False means the result was not garbage. fn maybe_append(&mut self, entity: GarbageEntity, result: Option) -> bool where @@ -140,7 +153,7 @@ async fn find_garbage_inner( node_kind: NodeKind, ) -> anyhow::Result { // Construct clients for S3 and for Console API - let (s3_client, target) = init_remote(bucket_config.clone(), node_kind)?; + let (remote_client, target) = init_remote(bucket_config.clone(), node_kind).await?; let cloud_admin_api_client = Arc::new(CloudAdminApiClient::new(console_config)); // Build a set of console-known tenants, for quickly eliminating known-active tenants without having @@ -166,7 +179,7 @@ async fn find_garbage_inner( // Enumerate Tenants in S3, and check if each one exists in Console tracing::info!("Finding all tenants in bucket {}...", bucket_config.bucket); - let tenants = stream_tenants(&s3_client, &target); + let tenants = stream_tenants(&remote_client, &target); let tenants_checked = tenants.map_ok(|t| { let api_client = cloud_admin_api_client.clone(); let console_cache = console_cache.clone(); @@ -221,6 +234,65 @@ async fn find_garbage_inner( assert!(project.tenant == tenant_shard_id.tenant_id); } + // Special case: If it's 
missing in console, check for known bugs that would enable us to conclusively + // identify it as purge-able anyway + if console_result.is_none() { + let timelines = stream_tenant_timelines(&remote_client, &target, tenant_shard_id) + .await? + .collect::>() + .await; + if timelines.is_empty() { + // No timelines, but a heatmap: the deletion bug where we deleted everything but heatmaps + let tenant_objects = list_objects_with_retries( + &remote_client, + ListingMode::WithDelimiter, + &target.tenant_root(&tenant_shard_id), + ) + .await?; + let object = tenant_objects.keys.first().unwrap(); + if object.key.get_path().as_str().ends_with("heatmap-v1.json") { + tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)"); + garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id)); + continue; + } else { + tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key); + } + } else { + // A console-unknown tenant with timelines: check if these timelines only contain initdb.tar.zst, from the initial + // rollout of WAL DR in which we never deleted these. + let mut any_non_initdb = false; + + for timeline_r in timelines { + let timeline = timeline_r?; + let timeline_objects = list_objects_with_retries( + &remote_client, + ListingMode::WithDelimiter, + &target.timeline_root(&timeline), + ) + .await?; + if !timeline_objects.prefixes.is_empty() { + // Sub-paths? 
Unexpected + any_non_initdb = true; + } else { + let object = timeline_objects.keys.first().unwrap(); + if object.key.get_path().as_str().ends_with("initdb.tar.zst") { + tracing::info!("Timeline {timeline} contains only initdb.tar.zst"); + } else { + any_non_initdb = true; + } + } + } + + if any_non_initdb { + tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains timelines, one or more of which are more than just initdb"); + } else { + tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains only timelines that only contain initdb"); + garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id)); + continue; + } + } + } + if garbage.maybe_append(GarbageEntity::Tenant(tenant_shard_id), console_result) { tracing::debug!("Tenant {tenant_shard_id} is garbage"); } else { @@ -258,7 +330,7 @@ async fn find_garbage_inner( // Construct a stream of all timelines within active tenants let active_tenants = tokio_stream::iter(active_tenants.iter().map(Ok)); - let timelines = active_tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, *t)); + let timelines = active_tenants.map_ok(|t| stream_tenant_timelines(&remote_client, &target, *t)); let timelines = timelines.try_buffer_unordered(S3_CONCURRENCY); let timelines = timelines.try_flatten(); @@ -324,41 +396,42 @@ impl std::fmt::Display for PurgeMode { } pub async fn get_tenant_objects( - s3_client: &Arc, - target: RootTarget, + s3_client: &GenericRemoteStorage, tenant_shard_id: TenantShardId, -) -> anyhow::Result> { +) -> anyhow::Result> { tracing::debug!("Listing objects in tenant {tenant_shard_id}"); + let tenant_root = super::remote_tenant_path(&tenant_shard_id); + // TODO: apply extra validation based on object modification time. Don't purge // tenants where any timeline's index_part.json has been touched recently. 
- let mut tenant_root = target.tenant_root(&tenant_shard_id); - - // Remove delimiter, so that object listing lists all keys in the prefix and not just - // common prefixes. - tenant_root.delimiter = String::new(); - - let key_stream = stream_listing(s3_client, &tenant_root); - key_stream.try_collect().await + let list = s3_client + .list( + Some(&tenant_root), + ListingMode::NoDelimiter, + None, + &CancellationToken::new(), + ) + .await?; + Ok(list.keys) } pub async fn get_timeline_objects( - s3_client: &Arc, - target: RootTarget, + s3_client: &GenericRemoteStorage, ttid: TenantShardTimelineId, -) -> anyhow::Result> { +) -> anyhow::Result> { tracing::debug!("Listing objects in timeline {ttid}"); - let mut timeline_root = target.timeline_root(&ttid); + let timeline_root = super::remote_timeline_path_id(&ttid); - // TODO: apply extra validation based on object modification time. Don't purge - // timelines whose index_part.json has been touched recently. - - // Remove delimiter, so that object listing lists all keys in the prefix and not just - // common prefixes. - timeline_root.delimiter = String::new(); - let key_stream = stream_listing(s3_client, &timeline_root); - - key_stream.try_collect().await + let list = s3_client + .list( + Some(&timeline_root), + ListingMode::NoDelimiter, + None, + &CancellationToken::new(), + ) + .await?; + Ok(list.keys) } const MAX_KEYS_PER_DELETE: usize = 1000; @@ -369,16 +442,19 @@ const MAX_KEYS_PER_DELETE: usize = 1000; /// MAX_KEYS_PER_DELETE keys are left. /// `num_deleted` returns number of deleted keys. 
async fn do_delete( - s3_client: &Arc, - bucket_name: &str, - keys: &mut Vec, + remote_client: &GenericRemoteStorage, + keys: &mut Vec, dry_run: bool, drain: bool, progress_tracker: &mut DeletionProgressTracker, ) -> anyhow::Result<()> { + let cancel = CancellationToken::new(); while (!keys.is_empty() && drain) || (keys.len() >= MAX_KEYS_PER_DELETE) { let request_keys = keys.split_off(keys.len() - (std::cmp::min(MAX_KEYS_PER_DELETE, keys.len()))); + + let request_keys: Vec = request_keys.into_iter().map(|o| o.key).collect(); + let num_deleted = request_keys.len(); if dry_run { tracing::info!("Dry-run deletion of objects: "); @@ -386,14 +462,10 @@ async fn do_delete( tracing::info!(" {k:?}"); } } else { - let delete_request = s3_client - .delete_objects() - .bucket(bucket_name) - .delete(Delete::builder().set_objects(Some(request_keys)).build()?); - delete_request - .send() + remote_client + .delete_objects(&request_keys, &cancel) .await - .context("DeleteObjects request")?; + .context("deletetion request")?; progress_tracker.register(num_deleted); } } @@ -421,6 +493,7 @@ impl DeletionProgressTracker { pub async fn purge_garbage( input_path: String, mode: PurgeMode, + min_age: Duration, dry_run: bool, ) -> anyhow::Result<()> { let list_bytes = tokio::fs::read(&input_path).await?; @@ -431,8 +504,13 @@ pub async fn purge_garbage( input_path ); - let (s3_client, target) = - init_remote(garbage_list.bucket_config.clone(), garbage_list.node_kind)?; + let (remote_client, _target) = + init_remote(garbage_list.bucket_config.clone(), garbage_list.node_kind).await?; + + assert_eq!( + &garbage_list.bucket_config.bucket, + remote_client.bucket_name().unwrap() + ); // Sanity checks on the incoming list if garbage_list.active_tenant_count == 0 { @@ -453,6 +531,7 @@ pub async fn purge_garbage( .filter(|i| match (&mode, &i.reason) { (PurgeMode::DeletedAndMissing, _) => true, (PurgeMode::DeletedOnly, GarbageReason::DeletedInConsole) => true, + (PurgeMode::DeletedOnly, 
GarbageReason::KnownBug) => true, (PurgeMode::DeletedOnly, GarbageReason::MissingInConsole) => false, }); @@ -464,16 +543,13 @@ pub async fn purge_garbage( let items = tokio_stream::iter(filtered_items.map(Ok)); let get_objects_results = items.map_ok(|i| { - let s3_client = s3_client.clone(); - let target = target.clone(); + let remote_client = remote_client.clone(); async move { match i.entity { GarbageEntity::Tenant(tenant_id) => { - get_tenant_objects(&s3_client, target, tenant_id).await - } - GarbageEntity::Timeline(ttid) => { - get_timeline_objects(&s3_client, target, ttid).await + get_tenant_objects(&remote_client, tenant_id).await } + GarbageEntity::Timeline(ttid) => get_timeline_objects(&remote_client, ttid).await, } } }); @@ -484,11 +560,41 @@ pub async fn purge_garbage( let mut progress_tracker = DeletionProgressTracker::default(); while let Some(result) = get_objects_results.next().await { let mut object_list = result?; + + // Extra safety check: even if a collection of objects is garbage, check max() of modification + // times before purging, so that if we incorrectly marked a live tenant as garbage then we would + // notice that its index has been written recently and would omit deleting it. + if object_list.is_empty() { + // Simplify subsequent code by ensuring list always has at least one item + // Usually, this only occurs if there is parallel deletions racing us, as there is no empty prefixes + continue; + } + let max_mtime = object_list.iter().map(|o| o.last_modified).max().unwrap(); + let age = max_mtime.elapsed(); + match age { + Err(_) => { + tracing::warn!("Bad last_modified time"); + continue; + } + Ok(a) if a < min_age => { + // Failed age check. This doesn't mean we did something wrong: a tenant might really be garbage and recently + // written, but out of an abundance of caution we still don't purge it. 
+ tracing::info!( + "Skipping tenant with young objects {}..{}", + object_list.first().as_ref().unwrap().key, + object_list.last().as_ref().unwrap().key + ); + continue; + } + Ok(_) => { + // Passed age check + } + } + objects_to_delete.append(&mut object_list); if objects_to_delete.len() >= MAX_KEYS_PER_DELETE { do_delete( - &s3_client, - &garbage_list.bucket_config.bucket, + &remote_client, &mut objects_to_delete, dry_run, false, @@ -499,8 +605,7 @@ pub async fn purge_garbage( } do_delete( - &s3_client, - &garbage_list.bucket_config.bucket, + &remote_client, &mut objects_to_delete, dry_run, true, diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index 64273432fc..112f052e07 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -2,6 +2,7 @@ #![deny(clippy::undocumented_unsafe_blocks)] pub mod checks; pub mod cloud_admin_api; +pub mod find_large_objects; pub mod garbage; pub mod metadata_stream; pub mod pageserver_physical_gc; @@ -15,29 +16,31 @@ use std::sync::Arc; use std::time::Duration; use anyhow::Context; -use aws_config::environment::EnvironmentVariableCredentialsProvider; -use aws_config::imds::credentials::ImdsCredentialsProvider; -use aws_config::meta::credentials::CredentialsProviderChain; -use aws_config::profile::ProfileFileCredentialsProvider; -use aws_config::retry::RetryConfig; -use aws_config::sso::SsoCredentialsProvider; -use aws_config::BehaviorVersion; -use aws_sdk_s3::config::{AsyncSleep, Region, SharedAsyncSleep}; -use aws_sdk_s3::{Client, Config}; -use aws_smithy_async::rt::sleep::TokioSleep; +use aws_config::retry::{RetryConfigBuilder, RetryMode}; +use aws_sdk_s3::config::Region; +use aws_sdk_s3::error::DisplayErrorContext; +use aws_sdk_s3::Client; use camino::{Utf8Path, Utf8PathBuf}; use clap::ValueEnum; +use futures::{Stream, StreamExt}; +use pageserver::tenant::remote_timeline_client::{remote_tenant_path, remote_timeline_path}; use pageserver::tenant::TENANTS_SEGMENT_NAME; use 
pageserver_api::shard::TenantShardId; +use remote_storage::{ + GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorageConfig, RemoteStorageKind, + S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, +}; use reqwest::Url; use serde::{Deserialize, Serialize}; +use storage_controller_client::control_api; use tokio::io::AsyncReadExt; +use tokio_util::sync::CancellationToken; use tracing::error; use tracing_appender::non_blocking::WorkerGuard; use tracing_subscriber::{fmt, prelude::*, EnvFilter}; use utils::fs_ext; -use utils::id::{TenantId, TimelineId}; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; const MAX_RETRIES: usize = 20; const CLOUD_ADMIN_API_TOKEN_ENV_VAR: &str = "CLOUD_ADMIN_API_TOKEN"; @@ -60,7 +63,7 @@ pub struct S3Target { /// in the pageserver, as all timeline objects existing in the scope of a particular /// tenant: the scrubber is different in that it handles collections of data referring to many /// TenantShardTimelineIds in on place. -#[derive(Serialize, Deserialize, Debug, Clone, Copy, Hash, PartialEq, Eq)] +#[derive(Serialize, Deserialize, Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)] pub struct TenantShardTimelineId { tenant_shard_id: TenantShardId, timeline_id: TimelineId, @@ -73,6 +76,10 @@ impl TenantShardTimelineId { timeline_id, } } + + fn as_tenant_timeline_id(&self) -> TenantTimelineId { + TenantTimelineId::new(self.tenant_shard_id.tenant_id, self.timeline_id) + } } impl Display for TenantShardTimelineId { @@ -185,6 +192,22 @@ impl RootTarget { .with_sub_segment(&id.timeline_id.to_string()) } + /// Given RemotePath "tenants/foo/timelines/bar/layerxyz", prefix it to a literal + /// key in the S3 bucket. 
+ pub fn absolute_key(&self, key: &RemotePath) -> String { + let root = match self { + Self::Pageserver(root) => root, + Self::Safekeeper(root) => root, + }; + + let prefix = &root.prefix_in_bucket; + if prefix.ends_with('/') { + format!("{prefix}{key}") + } else { + format!("{prefix}/{key}") + } + } + pub fn bucket_name(&self) -> &str { match self { Self::Pageserver(root) => &root.bucket_name, @@ -200,6 +223,10 @@ impl RootTarget { } } +pub fn remote_timeline_path_id(id: &TenantShardTimelineId) -> RemotePath { + remote_timeline_path(&id.tenant_shard_id, &id.timeline_id) +} + #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct BucketConfig { @@ -222,6 +249,20 @@ impl BucketConfig { } } +pub struct ControllerClientConfig { + /// URL to storage controller. e.g. http://127.0.0.1:1234 when using `neon_local` + pub controller_api: Url, + + /// JWT token for authenticating with storage controller. Requires scope 'scrubber' or 'admin'. + pub controller_jwt: String, +} + +impl ControllerClientConfig { + pub fn build_client(self) -> control_api::Client { + control_api::Client::new(self.controller_api, Some(self.controller_jwt)) + } +} + pub struct ConsoleConfig { pub token: String, pub base_url: Url, @@ -241,168 +282,234 @@ impl ConsoleConfig { } } -pub fn init_logging(file_name: &str) -> WorkerGuard { - let (file_writer, guard) = - tracing_appender::non_blocking(tracing_appender::rolling::never("./logs/", file_name)); - - let file_logs = fmt::Layer::new() - .with_target(false) - .with_ansi(false) - .with_writer(file_writer); +pub fn init_logging(file_name: &str) -> Option { let stderr_logs = fmt::Layer::new() .with_target(false) .with_writer(std::io::stderr); - tracing_subscriber::registry() - .with(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info"))) - .with(file_logs) - .with(stderr_logs) - .init(); - guard -} - -pub fn init_s3_client(bucket_region: Region) -> Client { - let credentials_provider = { - // uses 
"AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" - let chain = CredentialsProviderChain::first_try( - "env", - EnvironmentVariableCredentialsProvider::new(), - ) - // uses "AWS_PROFILE" / `aws sso login --profile ` - .or_else( - "profile-sso", - ProfileFileCredentialsProvider::builder().build(), - ); - - // Use SSO if we were given an account ID - match std::env::var("SSO_ACCOUNT_ID").ok() { - Some(sso_account) => chain.or_else( - "sso", - SsoCredentialsProvider::builder() - .account_id(sso_account) - .role_name("PowerUserAccess") - .start_url("https://neondb.awsapps.com/start") - .region(bucket_region.clone()) - .build(), - ), - None => chain, - } - .or_else( - // Finally try IMDS - "imds", - ImdsCredentialsProvider::builder().build(), - ) + let disable_file_logging = match std::env::var("PAGESERVER_DISABLE_FILE_LOGGING") { + Ok(s) => s == "1" || s.to_lowercase() == "true", + Err(_) => false, }; - let sleep_impl: Arc = Arc::new(TokioSleep::new()); - - let mut builder = Config::builder() - .behavior_version( - #[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */ - BehaviorVersion::v2023_11_09(), - ) - .region(bucket_region) - .retry_config(RetryConfig::adaptive().with_max_attempts(3)) - .sleep_impl(SharedAsyncSleep::from(sleep_impl)) - .credentials_provider(credentials_provider); - - if let Ok(endpoint) = env::var("AWS_ENDPOINT_URL") { - builder = builder.endpoint_url(endpoint) + if disable_file_logging { + tracing_subscriber::registry() + .with(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info"))) + .with(stderr_logs) + .init(); + None + } else { + let (file_writer, guard) = + tracing_appender::non_blocking(tracing_appender::rolling::never("./logs/", file_name)); + let file_logs = fmt::Layer::new() + .with_target(false) + .with_ansi(false) + .with_writer(file_writer); + tracing_subscriber::registry() + .with(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info"))) + .with(stderr_logs) + 
.with(file_logs) + .init(); + Some(guard) } - - Client::from_conf(builder.build()) } -fn init_remote( +async fn init_s3_client(bucket_region: Region) -> Client { + let mut retry_config_builder = RetryConfigBuilder::new(); + + retry_config_builder + .set_max_attempts(Some(3)) + .set_mode(Some(RetryMode::Adaptive)); + + let config = aws_config::defaults(aws_config::BehaviorVersion::v2024_03_28()) + .region(bucket_region) + .retry_config(retry_config_builder.build()) + .load() + .await; + Client::new(&config) +} + +fn default_prefix_in_bucket(node_kind: NodeKind) -> &'static str { + match node_kind { + NodeKind::Pageserver => "pageserver/v1/", + NodeKind::Safekeeper => "wal/", + } +} + +fn make_root_target( + bucket_name: String, + prefix_in_bucket: String, + node_kind: NodeKind, +) -> RootTarget { + let s3_target = S3Target { + bucket_name, + prefix_in_bucket, + delimiter: "/".to_string(), + }; + match node_kind { + NodeKind::Pageserver => RootTarget::Pageserver(s3_target), + NodeKind::Safekeeper => RootTarget::Safekeeper(s3_target), + } +} + +async fn init_remote_s3( bucket_config: BucketConfig, node_kind: NodeKind, ) -> anyhow::Result<(Arc, RootTarget)> { let bucket_region = Region::new(bucket_config.region); - let delimiter = "/".to_string(); - let s3_client = Arc::new(init_s3_client(bucket_region)); + let s3_client = Arc::new(init_s3_client(bucket_region).await); + let default_prefix = default_prefix_in_bucket(node_kind).to_string(); - let s3_root = match node_kind { - NodeKind::Pageserver => RootTarget::Pageserver(S3Target { - bucket_name: bucket_config.bucket, - prefix_in_bucket: bucket_config - .prefix_in_bucket - .unwrap_or("pageserver/v1".to_string()), - delimiter, - }), - NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target { - bucket_name: bucket_config.bucket, - prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or("wal/".to_string()), - delimiter, - }), - }; + let s3_root = make_root_target( + bucket_config.bucket, + 
bucket_config.prefix_in_bucket.unwrap_or(default_prefix), + node_kind, + ); Ok((s3_client, s3_root)) } +async fn init_remote( + bucket_config: BucketConfig, + node_kind: NodeKind, +) -> anyhow::Result<(GenericRemoteStorage, RootTarget)> { + let endpoint = env::var("AWS_ENDPOINT_URL").ok(); + let default_prefix = default_prefix_in_bucket(node_kind).to_string(); + let prefix_in_bucket = Some(bucket_config.prefix_in_bucket.unwrap_or(default_prefix)); + let storage = S3Config { + bucket_name: bucket_config.bucket.clone(), + bucket_region: bucket_config.region, + prefix_in_bucket, + endpoint, + concurrency_limit: DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT + .try_into() + .unwrap(), + max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, + upload_storage_class: None, + }; + let storage_config = RemoteStorageConfig { + storage: RemoteStorageKind::AwsS3(storage), + timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, + }; + + // We already pass the prefix to the remote client above + let prefix_in_root_target = String::new(); + let root_target = make_root_target(bucket_config.bucket, prefix_in_root_target, node_kind); + + let client = GenericRemoteStorage::from_config(&storage_config).await?; + Ok((client, root_target)) +} + +/// Listing possibly large amounts of keys in a streaming fashion. +fn stream_objects_with_retries<'a>( + storage_client: &'a GenericRemoteStorage, + listing_mode: ListingMode, + s3_target: &'a S3Target, +) -> impl Stream> + 'a { + async_stream::stream! 
{ + let mut trial = 0; + let cancel = CancellationToken::new(); + let prefix_str = &s3_target + .prefix_in_bucket + .strip_prefix("/") + .unwrap_or(&s3_target.prefix_in_bucket); + let prefix = RemotePath::from_string(prefix_str)?; + let mut list_stream = + storage_client.list_streaming(Some(&prefix), listing_mode, None, &cancel); + while let Some(res) = list_stream.next().await { + match res { + Err(err) => { + let yield_err = if err.is_permanent() { + true + } else { + let backoff_time = 1 << trial.max(5); + tokio::time::sleep(Duration::from_secs(backoff_time)).await; + trial += 1; + trial == MAX_RETRIES - 1 + }; + if yield_err { + yield Err(err) + .with_context(|| format!("Failed to list objects {MAX_RETRIES} times")); + break; + } + } + Ok(res) => { + trial = 0; + yield Ok(res); + } + } + } + } +} + +/// If you want to list a bounded amount of prefixes or keys. For larger numbers of keys/prefixes, +/// use [`stream_objects_with_retries`] instead. async fn list_objects_with_retries( - s3_client: &Client, + remote_client: &GenericRemoteStorage, + listing_mode: ListingMode, s3_target: &S3Target, - continuation_token: Option, -) -> anyhow::Result { - for _ in 0..MAX_RETRIES { - match s3_client - .list_objects_v2() - .bucket(&s3_target.bucket_name) - .prefix(&s3_target.prefix_in_bucket) - .delimiter(&s3_target.delimiter) - .set_continuation_token(continuation_token.clone()) - .send() +) -> anyhow::Result { + let cancel = CancellationToken::new(); + let prefix_str = &s3_target + .prefix_in_bucket + .strip_prefix("/") + .unwrap_or(&s3_target.prefix_in_bucket); + let prefix = RemotePath::from_string(prefix_str)?; + for trial in 0..MAX_RETRIES { + match remote_client + .list(Some(&prefix), listing_mode, None, &cancel) .await { Ok(response) => return Ok(response), Err(e) => { + if trial == MAX_RETRIES - 1 { + return Err(e) + .with_context(|| format!("Failed to list objects {MAX_RETRIES} times")); + } error!( - "list_objects_v2 query failed: {e}, bucket_name={}, prefix={}, 
delimiter={}", - s3_target.bucket_name, s3_target.prefix_in_bucket, s3_target.delimiter + "list_objects_v2 query failed: bucket_name={}, prefix={}, delimiter={}, error={}", + s3_target.bucket_name, + s3_target.prefix_in_bucket, + s3_target.delimiter, + DisplayErrorContext(e), ); - tokio::time::sleep(Duration::from_secs(1)).await; + let backoff_time = 1 << trial.max(5); + tokio::time::sleep(Duration::from_secs(backoff_time)).await; } } } - - anyhow::bail!("Failed to list objects {MAX_RETRIES} times") + panic!("MAX_RETRIES is not allowed to be 0"); } async fn download_object_with_retries( - s3_client: &Client, - bucket_name: &str, - key: &str, + remote_client: &GenericRemoteStorage, + key: &RemotePath, ) -> anyhow::Result> { - for _ in 0..MAX_RETRIES { - let mut body_buf = Vec::new(); - let response_stream = match s3_client - .get_object() - .bucket(bucket_name) - .key(key) - .send() - .await - { + let cancel = CancellationToken::new(); + for trial in 0..MAX_RETRIES { + let mut buf = Vec::new(); + let download = match remote_client.download(key, &cancel).await { Ok(response) => response, Err(e) => { error!("Failed to download object for key {key}: {e}"); - tokio::time::sleep(Duration::from_secs(1)).await; + let backoff_time = 1 << trial.max(5); + tokio::time::sleep(Duration::from_secs(backoff_time)).await; continue; } }; - match response_stream - .body - .into_async_read() - .read_to_end(&mut body_buf) + match tokio_util::io::StreamReader::new(download.download_stream) + .read_to_end(&mut buf) .await { Ok(bytes_read) => { tracing::debug!("Downloaded {bytes_read} bytes for object {key}"); - return Ok(body_buf); + return Ok(buf); } Err(e) => { error!("Failed to stream object body for key {key}: {e}"); - tokio::time::sleep(Duration::from_secs(1)).await; + let backoff_time = 1 << trial.max(5); + tokio::time::sleep(Duration::from_secs(backoff_time)).await; } } } @@ -410,7 +517,7 @@ async fn download_object_with_retries( anyhow::bail!("Failed to download objects with key 
{key} {MAX_RETRIES} times") } -async fn download_object_to_file( +async fn download_object_to_file_s3( s3_client: &Client, bucket_name: &str, key: &str, diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index 222bd10ed2..3935e513e3 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -1,10 +1,14 @@ -use anyhow::bail; +use anyhow::{anyhow, bail}; use camino::Utf8PathBuf; +use pageserver_api::controller_api::{MetadataHealthUpdateRequest, MetadataHealthUpdateResponse}; use pageserver_api::shard::TenantShardId; +use reqwest::{Method, Url}; +use storage_controller_client::control_api; use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; use storage_scrubber::pageserver_physical_gc::GcMode; -use storage_scrubber::scan_pageserver_metadata::scan_metadata; +use storage_scrubber::scan_pageserver_metadata::scan_pageserver_metadata; use storage_scrubber::tenant_snapshot::SnapshotDownloader; +use storage_scrubber::{find_large_objects, ControllerClientConfig}; use storage_scrubber::{ init_logging, pageserver_physical_gc::pageserver_physical_gc, scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig, NodeKind, @@ -14,6 +18,11 @@ use storage_scrubber::{ use clap::{Parser, Subcommand}; use utils::id::TenantId; +use utils::{project_build_tag, project_git_version}; + +project_git_version!(GIT_VERSION); +project_build_tag!(BUILD_TAG); + #[derive(Parser)] #[command(author, version, about, long_about = None)] #[command(arg_required_else_help(true))] @@ -23,6 +32,14 @@ struct Cli { #[arg(short, long, default_value_t = false)] delete: bool, + + #[arg(long)] + /// URL to storage controller. e.g. http://127.0.0.1:1234 when using `neon_local` + controller_api: Option, + + #[arg(long)] + /// JWT token for authenticating with storage controller. Requires scope 'scrubber' or 'admin'. 
+ controller_jwt: Option, } #[derive(Subcommand, Debug)] @@ -40,6 +57,8 @@ enum Command { input_path: String, #[arg(short, long, default_value_t = PurgeMode::DeletedOnly)] mode: PurgeMode, + #[arg(long = "min-age")] + min_age: humantime::Duration, }, #[command(verbatim_doc_comment)] ScanMetadata { @@ -49,6 +68,8 @@ enum Command { json: bool, #[arg(long = "tenant-id", num_args = 0..)] tenant_ids: Vec, + #[arg(long = "post", default_value_t = false)] + post_to_storcon: bool, #[arg(long, default_value = None)] /// For safekeeper node_kind only, points to db with debug dump dump_db_connstr: Option, @@ -72,12 +93,32 @@ enum Command { #[arg(short, long, default_value_t = GcMode::IndicesOnly)] mode: GcMode, }, + FindLargeObjects { + #[arg(long = "min-size")] + min_size: u64, + #[arg(short, long, default_value_t = false)] + ignore_deltas: bool, + #[arg(long = "concurrency", short = 'j', default_value_t = 64)] + concurrency: usize, + }, + CronJob { + // PageserverPhysicalGc + #[arg(long = "min-age")] + gc_min_age: humantime::Duration, + #[arg(short, long, default_value_t = GcMode::IndicesOnly)] + gc_mode: GcMode, + // ScanMetadata + #[arg(long = "post", default_value_t = false)] + post_to_storcon: bool, + }, } #[tokio::main] async fn main() -> anyhow::Result<()> { let cli = Cli::parse(); + tracing::info!("version: {}, build_tag {}", GIT_VERSION, BUILD_TAG); + let bucket_config = BucketConfig::from_env()?; let command_log_name = match &cli.command { @@ -86,6 +127,8 @@ async fn main() -> anyhow::Result<()> { Command::PurgeGarbage { .. } => "purge-garbage", Command::TenantSnapshot { .. } => "tenant-snapshot", Command::PageserverPhysicalGc { .. } => "pageserver-physical-gc", + Command::FindLargeObjects { .. } => "find-large-objects", + Command::CronJob { .. 
} => "cron-job", }; let _guard = init_logging(&format!( "{}_{}_{}_{}.log", @@ -95,11 +138,21 @@ async fn main() -> anyhow::Result<()> { chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S") )); + let controller_client = cli.controller_api.map(|controller_api| { + ControllerClientConfig { + controller_api, + // Default to no key: this is a convenience when working in a development environment + controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()), + } + .build_client() + }); + match cli.command { Command::ScanMetadata { json, tenant_ids, node_kind, + post_to_storcon, dump_db_connstr, dump_db_table, } => { @@ -138,35 +191,14 @@ async fn main() -> anyhow::Result<()> { } Ok(()) } else { - match scan_metadata(bucket_config.clone(), tenant_ids).await { - Err(e) => { - tracing::error!("Failed: {e}"); - Err(e) - } - Ok(summary) => { - if json { - println!("{}", serde_json::to_string(&summary).unwrap()) - } else { - println!("{}", summary.summary_string()); - } - if summary.is_fatal() { - Err(anyhow::anyhow!("Fatal scrub errors detected")) - } else if summary.is_empty() { - // Strictly speaking an empty bucket is a valid bucket, but if someone ran the - // scrubber they were likely expecting to scan something, and if we see no timelines - // at all then it's likely due to some configuration issues like a bad prefix - Err(anyhow::anyhow!( - "No timelines found in bucket {} prefix {}", - bucket_config.bucket, - bucket_config - .prefix_in_bucket - .unwrap_or("".to_string()) - )) - } else { - Ok(()) - } - } - } + scan_pageserver_metadata_cmd( + bucket_config, + controller_client.as_ref(), + tenant_ids, + json, + post_to_storcon, + ) + .await } } Command::FindGarbage { @@ -177,16 +209,18 @@ async fn main() -> anyhow::Result<()> { let console_config = ConsoleConfig::from_env()?; find_garbage(bucket_config, console_config, depth, node_kind, output_path).await } - Command::PurgeGarbage { input_path, mode } => { - purge_garbage(input_path, mode, !cli.delete).await - } + 
Command::PurgeGarbage { + input_path, + mode, + min_age, + } => purge_garbage(input_path, mode, min_age.into(), !cli.delete).await, Command::TenantSnapshot { tenant_id, output_path, concurrency, } => { let downloader = - SnapshotDownloader::new(bucket_config, tenant_id, output_path, concurrency)?; + SnapshotDownloader::new(bucket_config, tenant_id, output_path, concurrency).await?; downloader.download().await } Command::PageserverPhysicalGc { @@ -194,10 +228,166 @@ async fn main() -> anyhow::Result<()> { min_age, mode, } => { - let summary = - pageserver_physical_gc(bucket_config, tenant_ids, min_age.into(), mode).await?; + pageserver_physical_gc_cmd( + &bucket_config, + controller_client.as_ref(), + tenant_ids, + min_age, + mode, + ) + .await + } + Command::FindLargeObjects { + min_size, + ignore_deltas, + concurrency, + } => { + let summary = find_large_objects::find_large_objects( + bucket_config, + min_size, + ignore_deltas, + concurrency, + ) + .await?; println!("{}", serde_json::to_string(&summary).unwrap()); Ok(()) } + Command::CronJob { + gc_min_age, + gc_mode, + post_to_storcon, + } => { + run_cron_job( + bucket_config, + controller_client.as_ref(), + gc_min_age, + gc_mode, + post_to_storcon, + ) + .await + } + } +} + +/// Runs the scrubber cron job. +/// 1. Do pageserver physical gc +/// 2. 
Scan pageserver metadata +pub async fn run_cron_job( + bucket_config: BucketConfig, + controller_client: Option<&control_api::Client>, + gc_min_age: humantime::Duration, + gc_mode: GcMode, + post_to_storcon: bool, +) -> anyhow::Result<()> { + tracing::info!(%gc_min_age, %gc_mode, "Running pageserver-physical-gc"); + pageserver_physical_gc_cmd( + &bucket_config, + controller_client, + Vec::new(), + gc_min_age, + gc_mode, + ) + .await?; + tracing::info!(%post_to_storcon, node_kind = %NodeKind::Pageserver, "Running scan-metadata"); + scan_pageserver_metadata_cmd( + bucket_config, + controller_client, + Vec::new(), + true, + post_to_storcon, + ) + .await?; + + Ok(()) +} + +pub async fn pageserver_physical_gc_cmd( + bucket_config: &BucketConfig, + controller_client: Option<&control_api::Client>, + tenant_shard_ids: Vec, + min_age: humantime::Duration, + mode: GcMode, +) -> anyhow::Result<()> { + match (controller_client, mode) { + (Some(_), _) => { + // Any mode may run when controller API is set + } + (None, GcMode::Full) => { + // The part of physical GC where we erase ancestor layers cannot be done safely without + // confirming the most recent complete shard split with the controller. Refuse to run, rather + // than doing it unsafely. + return Err(anyhow!( + "Full physical GC requires `--controller-api` and `--controller-jwt` to run" + )); + } + (None, GcMode::DryRun | GcMode::IndicesOnly) => { + // These GcModes do not require the controller to run. 
+ } + } + + let summary = pageserver_physical_gc( + bucket_config, + controller_client, + tenant_shard_ids, + min_age.into(), + mode, + ) + .await?; + println!("{}", serde_json::to_string(&summary).unwrap()); + Ok(()) +} + +pub async fn scan_pageserver_metadata_cmd( + bucket_config: BucketConfig, + controller_client: Option<&control_api::Client>, + tenant_shard_ids: Vec, + json: bool, + post_to_storcon: bool, +) -> anyhow::Result<()> { + if controller_client.is_none() && post_to_storcon { + return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run")); + } + match scan_pageserver_metadata(bucket_config.clone(), tenant_shard_ids).await { + Err(e) => { + tracing::error!("Failed: {e}"); + Err(e) + } + Ok(summary) => { + if json { + println!("{}", serde_json::to_string(&summary).unwrap()) + } else { + println!("{}", summary.summary_string()); + } + + if post_to_storcon { + if let Some(client) = controller_client { + let body = summary.build_health_update_request(); + client + .dispatch::( + Method::POST, + "control/v1/metadata_health/update".to_string(), + Some(body), + ) + .await?; + } + } + + if summary.is_fatal() { + tracing::error!("Fatal scrub errors detected"); + } else if summary.is_empty() { + // Strictly speaking an empty bucket is a valid bucket, but if someone ran the + // scrubber they were likely expecting to scan something, and if we see no timelines + // at all then it's likely due to some configuration issues like a bad prefix + tracing::error!( + "No timelines found in bucket {} prefix {}", + bucket_config.bucket, + bucket_config + .prefix_in_bucket + .unwrap_or("".to_string()) + ); + } + + Ok(()) + } } } diff --git a/storage_scrubber/src/metadata_stream.rs b/storage_scrubber/src/metadata_stream.rs index c05874f556..10d77937f1 100644 --- a/storage_scrubber/src/metadata_stream.rs +++ b/storage_scrubber/src/metadata_stream.rs @@ -1,98 +1,70 @@ -use anyhow::Context; +use 
std::str::FromStr; + +use anyhow::{anyhow, Context}; use async_stream::{stream, try_stream}; -use aws_sdk_s3::{types::ObjectIdentifier, Client}; +use futures::StreamExt; +use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePath}; use tokio_stream::Stream; -use crate::{list_objects_with_retries, RootTarget, S3Target, TenantShardTimelineId}; +use crate::{ + list_objects_with_retries, stream_objects_with_retries, RootTarget, S3Target, + TenantShardTimelineId, +}; use pageserver_api::shard::TenantShardId; use utils::id::{TenantId, TimelineId}; -/// Given an S3 bucket, output a stream of TenantIds discovered via ListObjectsv2 +/// Given a remote storage and a target, output a stream of TenantIds discovered via listing prefixes pub fn stream_tenants<'a>( - s3_client: &'a Client, + remote_client: &'a GenericRemoteStorage, target: &'a RootTarget, ) -> impl Stream> + 'a { try_stream! { - let mut continuation_token = None; let tenants_target = target.tenants_root(); - loop { - let fetch_response = - list_objects_with_retries(s3_client, &tenants_target, continuation_token.clone()).await?; - - let new_entry_ids = fetch_response - .common_prefixes() - .iter() - .filter_map(|prefix| prefix.prefix()) - .filter_map(|prefix| -> Option<&str> { - prefix - .strip_prefix(&tenants_target.prefix_in_bucket)? 
- .strip_suffix('/') - }).map(|entry_id_str| { - entry_id_str - .parse() - .with_context(|| format!("Incorrect entry id str: {entry_id_str}")) - }); - - for i in new_entry_ids { - yield i?; - } - - match fetch_response.next_continuation_token { - Some(new_token) => continuation_token = Some(new_token), - None => break, + let mut tenants_stream = + std::pin::pin!(stream_objects_with_retries(remote_client, ListingMode::WithDelimiter, &tenants_target)); + while let Some(chunk) = tenants_stream.next().await { + let chunk = chunk?; + let entry_ids = chunk.prefixes.iter() + .map(|prefix| prefix.get_path().file_name().ok_or_else(|| anyhow!("no final component in path '{prefix}'"))); + for dir_name_res in entry_ids { + let dir_name = dir_name_res?; + let id = TenantShardId::from_str(dir_name)?; + yield id; } } } } pub async fn stream_tenant_shards<'a>( - s3_client: &'a Client, + remote_client: &'a GenericRemoteStorage, target: &'a RootTarget, tenant_id: TenantId, ) -> anyhow::Result> + 'a> { - let mut tenant_shard_ids: Vec> = Vec::new(); - let mut continuation_token = None; let shards_target = target.tenant_shards_prefix(&tenant_id); - loop { - tracing::info!("Listing in {}", shards_target.prefix_in_bucket); - let fetch_response = - list_objects_with_retries(s3_client, &shards_target, continuation_token.clone()).await; - let fetch_response = match fetch_response { - Err(e) => { - tenant_shard_ids.push(Err(e)); - break; - } - Ok(r) => r, - }; + let strip_prefix = target.tenants_root().prefix_in_bucket; + let prefix_str = &strip_prefix.strip_prefix("/").unwrap_or(&strip_prefix); - let new_entry_ids = fetch_response - .common_prefixes() - .iter() - .filter_map(|prefix| prefix.prefix()) - .filter_map(|prefix| -> Option<&str> { - prefix - .strip_prefix(&target.tenants_root().prefix_in_bucket)? 
- .strip_suffix('/') - }) - .map(|entry_id_str| { - let first_part = entry_id_str.split('/').next().unwrap(); + tracing::info!("Listing shards in {}", shards_target.prefix_in_bucket); + let listing = + list_objects_with_retries(remote_client, ListingMode::WithDelimiter, &shards_target) + .await?; - first_part - .parse::() - .with_context(|| format!("Incorrect entry id str: {first_part}")) - }); + let tenant_shard_ids = listing + .prefixes + .iter() + .map(|prefix| prefix.get_path().as_str()) + .filter_map(|prefix| -> Option<&str> { prefix.strip_prefix(prefix_str) }) + .map(|entry_id_str| { + let first_part = entry_id_str.split('/').next().unwrap(); - for i in new_entry_ids { - tenant_shard_ids.push(i); - } - - match fetch_response.next_continuation_token { - Some(new_token) => continuation_token = Some(new_token), - None => break, - } - } + first_part + .parse::() + .with_context(|| format!("Incorrect entry id str: {first_part}")) + }) + .collect::>(); + tracing::debug!("Yielding {} shards for {tenant_id}", tenant_shard_ids.len()); Ok(stream! { for i in tenant_shard_ids { let id = i?; @@ -101,39 +73,43 @@ pub async fn stream_tenant_shards<'a>( }) } -/// Given a TenantShardId, output a stream of the timelines within that tenant, discovered -/// using ListObjectsv2. The listing is done before the stream is built, so that this +/// Given a `TenantShardId`, output a stream of the timelines within that tenant, discovered +/// using a listing. The listing is done before the stream is built, so that this /// function can be used to generate concurrency on a stream using buffer_unordered. 
pub async fn stream_tenant_timelines<'a>( - s3_client: &'a Client, + remote_client: &'a GenericRemoteStorage, target: &'a RootTarget, tenant: TenantShardId, ) -> anyhow::Result> + 'a> { let mut timeline_ids: Vec> = Vec::new(); - let mut continuation_token = None; let timelines_target = target.timelines_root(&tenant); + let prefix_str = &timelines_target + .prefix_in_bucket + .strip_prefix("/") + .unwrap_or(&timelines_target.prefix_in_bucket); + + let mut objects_stream = std::pin::pin!(stream_objects_with_retries( + remote_client, + ListingMode::WithDelimiter, + &timelines_target + )); loop { - tracing::debug!("Listing in {}", tenant); - let fetch_response = - list_objects_with_retries(s3_client, &timelines_target, continuation_token.clone()) - .await; - let fetch_response = match fetch_response { - Err(e) => { + tracing::debug!("Listing in {tenant}"); + let fetch_response = match objects_stream.next().await { + None => break, + Some(Err(e)) => { timeline_ids.push(Err(e)); break; } - Ok(r) => r, + Some(Ok(r)) => r, }; let new_entry_ids = fetch_response - .common_prefixes() + .prefixes .iter() - .filter_map(|prefix| prefix.prefix()) .filter_map(|prefix| -> Option<&str> { - prefix - .strip_prefix(&timelines_target.prefix_in_bucket)? - .strip_suffix('/') + prefix.get_path().as_str().strip_prefix(prefix_str) }) .map(|entry_id_str| { entry_id_str @@ -144,14 +120,9 @@ pub async fn stream_tenant_timelines<'a>( for i in new_entry_ids { timeline_ids.push(i); } - - match fetch_response.next_continuation_token { - Some(new_token) => continuation_token = Some(new_token), - None => break, - } } - tracing::debug!("Yielding for {}", tenant); + tracing::debug!("Yielding {} timelines for {}", timeline_ids.len(), tenant); Ok(stream! 
{ for i in timeline_ids { let id = i?; @@ -161,32 +132,31 @@ pub async fn stream_tenant_timelines<'a>( } pub(crate) fn stream_listing<'a>( - s3_client: &'a Client, + remote_client: &'a GenericRemoteStorage, target: &'a S3Target, -) -> impl Stream> + 'a { +) -> impl Stream)>> + 'a { + let listing_mode = if target.delimiter.is_empty() { + ListingMode::NoDelimiter + } else { + ListingMode::WithDelimiter + }; try_stream! { - let mut continuation_token = None; - loop { - let fetch_response = - list_objects_with_retries(s3_client, target, continuation_token.clone()).await?; - + let mut objects_stream = std::pin::pin!(stream_objects_with_retries( + remote_client, + listing_mode, + target, + )); + while let Some(list) = objects_stream.next().await { + let list = list?; if target.delimiter.is_empty() { - for object_key in fetch_response.contents().iter().filter_map(|object| object.key()) - { - let object_id = ObjectIdentifier::builder().key(object_key).build()?; - yield object_id; + for key in list.keys { + yield (key.key.clone(), Some(key)); } } else { - for prefix in fetch_response.common_prefixes().iter().filter_map(|p| p.prefix()) { - let object_id = ObjectIdentifier::builder().key(prefix).build()?; - yield object_id; + for key in list.prefixes { + yield (key, None); } } - - match fetch_response.next_continuation_token { - Some(new_token) => continuation_token = Some(new_token), - None => break, - } } } } diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index 0146433128..88681e38c2 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -1,22 +1,48 @@ -use std::time::{Duration, UNIX_EPOCH}; +use std::collections::{BTreeMap, BTreeSet, HashMap}; +use std::sync::Arc; +use std::time::Duration; use crate::checks::{list_timeline_blobs, BlobDataParseResult}; use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; use crate::{init_remote, 
BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; -use aws_sdk_s3::Client; use futures_util::{StreamExt, TryStreamExt}; -use pageserver::tenant::remote_timeline_client::parse_remote_index_path; +use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; +use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path}; +use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::IndexPart; -use pageserver_api::shard::TenantShardId; -use remote_storage::RemotePath; +use pageserver_api::controller_api::TenantDescribeResponse; +use pageserver_api::shard::{ShardIndex, TenantShardId}; +use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath}; +use reqwest::Method; use serde::Serialize; +use storage_controller_client::control_api; +use tokio_util::sync::CancellationToken; use tracing::{info_span, Instrument}; use utils::generation::Generation; +use utils::id::{TenantId, TenantTimelineId}; #[derive(Serialize, Default)] pub struct GcSummary { indices_deleted: usize, remote_storage_errors: usize, + controller_api_errors: usize, + ancestor_layers_deleted: usize, +} + +impl GcSummary { + fn merge(&mut self, other: Self) { + let Self { + indices_deleted, + remote_storage_errors, + ancestor_layers_deleted, + controller_api_errors, + } = other; + + self.indices_deleted += indices_deleted; + self.remote_storage_errors += remote_storage_errors; + self.ancestor_layers_deleted += ancestor_layers_deleted; + self.controller_api_errors += controller_api_errors; + } } #[derive(clap::ValueEnum, Debug, Clone, Copy)] @@ -26,9 +52,9 @@ pub enum GcMode { // Enable only removing old-generation indices IndicesOnly, + // Enable all forms of GC - // TODO: this will be used when shard split ancestor layer deletion is added - // All, + Full, } impl std::fmt::Display for GcMode { @@ -36,21 +62,232 @@ impl std::fmt::Display for GcMode { match self { GcMode::DryRun => write!(f, "dry-run"), GcMode::IndicesOnly => write!(f, 
"indices-only"), + GcMode::Full => write!(f, "full"), } } } +mod refs { + use super::*; + // Map of cross-shard layer references, giving a refcount for each layer in each shard that is referenced by some other + // shard in the same tenant. This is sparse! The vast majority of timelines will have no cross-shard refs, and those that + // do have cross shard refs should eventually drop most of them via compaction. + // + // In our inner map type, the TTID in the key is shard-agnostic, and the ShardIndex in the value refers to the _ancestor + // which is is referenced_. + #[derive(Default)] + pub(super) struct AncestorRefs( + BTreeMap>, + ); + + impl AncestorRefs { + /// Insert references for layers discovered in a particular shard-timeline that refer to an ancestral shard-timeline. + pub(super) fn update( + &mut self, + ttid: TenantShardTimelineId, + layers: Vec<(LayerName, LayerFileMetadata)>, + ) { + let ttid_refs = self.0.entry(ttid.as_tenant_timeline_id()).or_default(); + for (layer_name, layer_metadata) in layers { + // Increment refcount of this layer in the ancestor shard + *(ttid_refs + .entry((layer_metadata.shard, layer_name)) + .or_default()) += 1; + } + } + + /// For a particular TTID, return the map of all ancestor layers referenced by a descendent to their refcount + /// + /// The `ShardIndex` in the result's key is the index of the _ancestor_, not the descendent. + pub(super) fn get_ttid_refcounts( + &self, + ttid: &TenantTimelineId, + ) -> Option<&HashMap<(ShardIndex, LayerName), usize>> { + self.0.get(ttid) + } + } +} + +use refs::AncestorRefs; + +// As we see shards for a tenant, acccumulate knowledge needed for cross-shard GC: +// - Are there any ancestor shards? +// - Are there any refs to ancestor shards' layers? 
+#[derive(Default)] +struct TenantRefAccumulator { + shards_seen: HashMap>, + + // For each shard that has refs to an ancestor's layers, the set of ancestor layers referred to + ancestor_ref_shards: AncestorRefs, +} + +impl TenantRefAccumulator { + fn update(&mut self, ttid: TenantShardTimelineId, index_part: &IndexPart) { + let this_shard_idx = ttid.tenant_shard_id.to_index(); + (*self + .shards_seen + .entry(ttid.tenant_shard_id.tenant_id) + .or_default()) + .insert(this_shard_idx); + + let mut ancestor_refs = Vec::new(); + for (layer_name, layer_metadata) in &index_part.layer_metadata { + if layer_metadata.shard != this_shard_idx { + // This is a reference from this shard to a layer in an ancestor shard: we must track this + // as a marker to not GC this layer from the parent. + ancestor_refs.push((layer_name.clone(), layer_metadata.clone())); + } + } + + if !ancestor_refs.is_empty() { + tracing::info!(%ttid, "Found {} ancestor refs", ancestor_refs.len()); + self.ancestor_ref_shards.update(ttid, ancestor_refs); + } + } + + /// Consume Self and return a vector of ancestor tenant shards that should be GC'd, and map of referenced ancestor layers to preserve + async fn into_gc_ancestors( + self, + controller_client: &control_api::Client, + summary: &mut GcSummary, + ) -> (Vec, AncestorRefs) { + let mut ancestors_to_gc = Vec::new(); + for (tenant_id, shard_indices) in self.shards_seen { + // Find the highest shard count + let latest_count = shard_indices + .iter() + .map(|i| i.shard_count) + .max() + .expect("Always at least one shard"); + + let mut shard_indices = shard_indices.iter().collect::>(); + let (mut latest_shards, ancestor_shards) = { + let at = + itertools::partition(&mut shard_indices, |i| i.shard_count == latest_count); + (shard_indices[0..at].to_owned(), &shard_indices[at..]) + }; + // Sort shards, as we will later compare them with a sorted list from the controller + latest_shards.sort(); + + // Check that we have a complete view of the latest shard 
count: this should always be the case unless we happened + // to scan the S3 bucket halfway through a shard split. + if latest_shards.len() != latest_count.count() as usize { + // This should be extremely rare, so we warn on it. + tracing::warn!(%tenant_id, "Missed some shards at count {:?}: {latest_shards:?}", latest_count); + continue; + } + + // Check if we have any non-latest-count shards + if ancestor_shards.is_empty() { + tracing::debug!(%tenant_id, "No ancestor shards to clean up"); + continue; + } + + // Based on S3 view, this tenant looks like it might have some ancestor shard work to do. We + // must only do this work if the tenant is not currently being split: otherwise, it is not safe + // to GC ancestors, because if the split fails then the controller will try to attach ancestor + // shards again. + match controller_client + .dispatch::<(), TenantDescribeResponse>( + Method::GET, + format!("control/v1/tenant/{tenant_id}"), + None, + ) + .await + { + Err(e) => { + // We were not able to learn the latest shard split state from the controller, so we will not + // do ancestor GC on this tenant. + tracing::warn!(%tenant_id, "Failed to query storage controller, will not do ancestor GC: {e}"); + summary.controller_api_errors += 1; + continue; + } + Ok(desc) => { + // We expect to see that the latest shard count matches the one we saw in S3, and that none + // of the shards indicate splitting in progress. 
+ + let controller_indices: Vec = desc + .shards + .iter() + .map(|s| s.tenant_shard_id.to_index()) + .collect(); + if !controller_indices.iter().eq(latest_shards.iter().copied()) { + tracing::info!(%tenant_id, "Latest shards seen in S3 ({latest_shards:?}) don't match controller state ({controller_indices:?})"); + continue; + } + + if desc.shards.iter().any(|s| s.is_splitting) { + tracing::info!(%tenant_id, "One or more shards is currently splitting"); + continue; + } + + // This shouldn't be too noisy, because we only log this for tenants that have some ancestral refs. + tracing::info!(%tenant_id, "Validated state with controller: {desc:?}"); + } + } + + // GC ancestor shards + for ancestor_shard in ancestor_shards.iter().map(|idx| TenantShardId { + tenant_id, + shard_count: idx.shard_count, + shard_number: idx.shard_number, + }) { + ancestors_to_gc.push(ancestor_shard); + } + } + + (ancestors_to_gc, self.ancestor_ref_shards) + } +} + +fn is_old_enough(min_age: &Duration, key: &ListingObject, summary: &mut GcSummary) -> bool { + // Validation: we will only GC indices & layers after a time threshold (e.g. one week) so that during an incident + // it is easier to read old data for analysis, and easier to roll back shard splits without having to un-delete any objects. + let age = match key.last_modified.elapsed() { + Ok(e) => e, + Err(_) => { + tracing::warn!("Bad last_modified time: {:?}", key.last_modified); + summary.remote_storage_errors += 1; + return false; + } + }; + let old_enough = &age > min_age; + + if !old_enough { + tracing::info!( + "Skipping young object {} < {}", + humantime::format_duration(age), + humantime::format_duration(*min_age) + ); + } + + old_enough +} + +/// Same as [`is_old_enough`], but doesn't require a [`ListingObject`] passed to it. 
+async fn check_is_old_enough( + remote_client: &GenericRemoteStorage, + key: &RemotePath, + min_age: &Duration, + summary: &mut GcSummary, +) -> Option { + let listing_object = remote_client + .head_object(key, &CancellationToken::new()) + .await + .ok()?; + Some(is_old_enough(min_age, &listing_object, summary)) +} + async fn maybe_delete_index( - s3_client: &Client, - bucket_config: &BucketConfig, + remote_client: &GenericRemoteStorage, min_age: &Duration, latest_gen: Generation, - key: &str, + obj: &ListingObject, mode: GcMode, summary: &mut GcSummary, ) { // Validation: we will only delete things that parse cleanly - let basename = key.rsplit_once('/').unwrap().1; + let basename = obj.key.get_path().file_name().unwrap(); let candidate_generation = match parse_remote_index_path(RemotePath::from_string(basename).unwrap()) { Some(g) => g, @@ -79,45 +316,7 @@ async fn maybe_delete_index( return; } - // Validation: we will only delete indices after one week, so that during incidents we will have - // easy access to recent indices. 
- let age: Duration = match s3_client - .head_object() - .bucket(&bucket_config.bucket) - .key(key) - .send() - .await - { - Ok(response) => match response.last_modified { - None => { - tracing::warn!("Missing last_modified"); - summary.remote_storage_errors += 1; - return; - } - Some(last_modified) => { - let last_modified = - UNIX_EPOCH + Duration::from_secs_f64(last_modified.as_secs_f64()); - match last_modified.elapsed() { - Ok(e) => e, - Err(_) => { - tracing::warn!("Bad last_modified time: {last_modified:?}"); - return; - } - } - } - }, - Err(e) => { - tracing::warn!("Failed to HEAD {key}: {e}"); - summary.remote_storage_errors += 1; - return; - } - }; - if &age < min_age { - tracing::info!( - "Skipping young object {} < {}", - age.as_secs_f64(), - min_age.as_secs_f64() - ); + if !is_old_enough(min_age, obj, summary) { return; } @@ -127,11 +326,8 @@ async fn maybe_delete_index( } // All validations passed: erase the object - match s3_client - .delete_object() - .bucket(&bucket_config.bucket) - .key(key) - .send() + match remote_client + .delete(&obj.key, &CancellationToken::new()) .await { Ok(_) => { @@ -145,6 +341,105 @@ async fn maybe_delete_index( } } +#[allow(clippy::too_many_arguments)] +async fn gc_ancestor( + remote_client: &GenericRemoteStorage, + root_target: &RootTarget, + min_age: &Duration, + ancestor: TenantShardId, + refs: &AncestorRefs, + mode: GcMode, + summary: &mut GcSummary, +) -> anyhow::Result<()> { + // Scan timelines in the ancestor + let timelines = stream_tenant_timelines(remote_client, root_target, ancestor).await?; + let mut timelines = std::pin::pin!(timelines); + + // Build a list of keys to retain + + while let Some(ttid) = timelines.next().await { + let ttid = ttid?; + + let data = list_timeline_blobs(remote_client, ttid, root_target).await?; + + let s3_layers = match data.blob_data { + BlobDataParseResult::Parsed { + index_part: _, + index_part_generation: _, + s3_layers, + } => s3_layers, + BlobDataParseResult::Relic => { + // 
Post-deletion tenant location: don't try and GC it. + continue; + } + BlobDataParseResult::Incorrect { + errors, + s3_layers: _, // TODO(yuchen): could still check references to these s3 layers? + } => { + // Our primary purpose isn't to report on bad data, but log this rather than skipping silently + tracing::warn!( + "Skipping ancestor GC for timeline {ttid}, bad metadata: {errors:?}" + ); + continue; + } + }; + + let ttid_refs = refs.get_ttid_refcounts(&ttid.as_tenant_timeline_id()); + let ancestor_shard_index = ttid.tenant_shard_id.to_index(); + + for (layer_name, layer_gen) in s3_layers { + let ref_count = ttid_refs + .and_then(|m| m.get(&(ancestor_shard_index, layer_name.clone()))) + .copied() + .unwrap_or(0); + + if ref_count > 0 { + tracing::debug!(%ttid, "Ancestor layer {layer_name} has {ref_count} refs"); + continue; + } + + tracing::info!(%ttid, "Ancestor layer {layer_name} is not referenced"); + + // Build the key for the layer we are considering deleting + let key = root_target.absolute_key(&remote_layer_path( + &ttid.tenant_shard_id.tenant_id, + &ttid.timeline_id, + ancestor_shard_index, + &layer_name, + layer_gen, + )); + + // We apply a time threshold to GCing objects that are un-referenced: this preserves our ability + // to roll back a shard split if we have to, by avoiding deleting ancestor layers right away + let path = RemotePath::from_string(key.strip_prefix("/").unwrap_or(&key)).unwrap(); + if check_is_old_enough(remote_client, &path, min_age, summary).await != Some(true) { + continue; + } + + if !matches!(mode, GcMode::Full) { + tracing::info!("Dry run: would delete key {key}"); + continue; + } + + // All validations passed: erase the object + match remote_client.delete(&path, &CancellationToken::new()).await { + Ok(_) => { + tracing::info!("Successfully deleted unreferenced ancestor layer {key}"); + summary.ancestor_layers_deleted += 1; + } + Err(e) => { + tracing::warn!("Failed to delete layer {key}: {e}"); + summary.remote_storage_errors 
+= 1; + } + } + } + + // TODO: if all the layers are gone, clean up the whole timeline dir (remove index) + } + + Ok(()) +} + /// Physical garbage collection: removing unused S3 objects. This is distinct from the garbage collection /// done inside the pageserver, which operates at a higher level (keys, layers). This type of garbage collection /// is about removing: @@ -155,84 +450,114 @@ async fn maybe_delete_index( /// This type of GC is not necessary for correctness: rather it serves to reduce wasted storage capacity, and /// make sure that object listings don't get slowed down by large numbers of garbage objects. pub async fn pageserver_physical_gc( - bucket_config: BucketConfig, - tenant_ids: Vec, + bucket_config: &BucketConfig, + controller_client: Option<&control_api::Client>, + tenant_shard_ids: Vec, min_age: Duration, mode: GcMode, ) -> anyhow::Result { - let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver)?; + let (remote_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?; - let tenants = if tenant_ids.is_empty() { - futures::future::Either::Left(stream_tenants(&s3_client, &target)) + let tenants = if tenant_shard_ids.is_empty() { + futures::future::Either::Left(stream_tenants(&remote_client, &target)) } else { - futures::future::Either::Right(futures::stream::iter(tenant_ids.into_iter().map(Ok))) + futures::future::Either::Right(futures::stream::iter(tenant_shard_ids.into_iter().map(Ok))) }; // How many tenants to process in parallel. We need to be mindful of pageservers // accessing the same per tenant prefixes, so use a lower setting than pageservers. 
const CONCURRENCY: usize = 32; + // Accumulate information about each tenant for cross-shard GC step we'll do at the end + let accumulator = Arc::new(std::sync::Mutex::new(TenantRefAccumulator::default())); + // Generate a stream of TenantTimelineId - let timelines = tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, t)); + let timelines = tenants.map_ok(|t| stream_tenant_timelines(&remote_client, &target, t)); let timelines = timelines.try_buffered(CONCURRENCY); let timelines = timelines.try_flatten(); // Generate a stream of S3TimelineBlobData async fn gc_timeline( - s3_client: &Client, - bucket_config: &BucketConfig, + remote_client: &GenericRemoteStorage, min_age: &Duration, target: &RootTarget, mode: GcMode, ttid: TenantShardTimelineId, + accumulator: &Arc>, ) -> anyhow::Result { let mut summary = GcSummary::default(); - let data = list_timeline_blobs(s3_client, ttid, target).await?; + let data = list_timeline_blobs(remote_client, ttid, target).await?; - let (latest_gen, candidates) = match &data.blob_data { + let (index_part, latest_gen, candidates) = match &data.blob_data { BlobDataParseResult::Parsed { - index_part: _index_part, + index_part, index_part_generation, s3_layers: _s3_layers, - } => (*index_part_generation, data.unused_index_keys), + } => (index_part, *index_part_generation, data.unused_index_keys), BlobDataParseResult::Relic => { // Post-deletion tenant location: don't try and GC it. 
return Ok(summary); } - BlobDataParseResult::Incorrect(reasons) => { + BlobDataParseResult::Incorrect { + errors, + s3_layers: _, + } => { // Our primary purpose isn't to report on bad data, but log this rather than skipping silently - tracing::warn!("Skipping timeline {ttid}, bad metadata: {reasons:?}"); + tracing::warn!("Skipping timeline {ttid}, bad metadata: {errors:?}"); return Ok(summary); } }; + accumulator.lock().unwrap().update(ttid, index_part); + for key in candidates { - maybe_delete_index( - s3_client, - bucket_config, - min_age, - latest_gen, - &key, - mode, - &mut summary, - ) - .instrument(info_span!("maybe_delete_index", %ttid, ?latest_gen, key)) - .await; + maybe_delete_index(remote_client, min_age, latest_gen, &key, mode, &mut summary) + .instrument(info_span!("maybe_delete_index", %ttid, ?latest_gen, %key.key)) + .await; } Ok(summary) } - let timelines = timelines - .map_ok(|ttid| gc_timeline(&s3_client, &bucket_config, &min_age, &target, mode, ttid)); - let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); let mut summary = GcSummary::default(); - while let Some(i) = timelines.next().await { - let tl_summary = i?; + // Drain futures for per-shard GC, populating accumulator as a side effect + { + let timelines = timelines.map_ok(|ttid| { + gc_timeline(&remote_client, &min_age, &target, mode, ttid, &accumulator) + }); + let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); - summary.indices_deleted += tl_summary.indices_deleted; - summary.remote_storage_errors += tl_summary.remote_storage_errors; + while let Some(i) = timelines.next().await { + summary.merge(i?); + } + } + + // Execute cross-shard GC, using the accumulator's full view of all the shards built in the per-shard GC + let Some(client) = controller_client else { + tracing::info!("Skipping ancestor layer GC, because no `--controller-api` was specified"); + return Ok(summary); + }; + + let (ancestor_shards, ancestor_refs) = 
Arc::into_inner(accumulator) + .unwrap() + .into_inner() + .unwrap() + .into_gc_ancestors(client, &mut summary) + .await; + + for ancestor_shard in ancestor_shards { + gc_ancestor( + &remote_client, + &target, + &min_age, + ancestor_shard, + &ancestor_refs, + mode, + &mut summary, + ) + .instrument(info_span!("gc_ancestor", %ancestor_shard)) + .await?; } Ok(summary) diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs index af74ffa4cd..151ef27672 100644 --- a/storage_scrubber/src/scan_pageserver_metadata.rs +++ b/storage_scrubber/src/scan_pageserver_metadata.rs @@ -1,21 +1,21 @@ use std::collections::{HashMap, HashSet}; use crate::checks::{ - branch_cleanup_and_check_errors, list_timeline_blobs, BlobDataParseResult, S3TimelineBlobData, - TenantObjectListing, TimelineAnalysis, + branch_cleanup_and_check_errors, list_timeline_blobs, BlobDataParseResult, + RemoteTimelineBlobData, TenantObjectListing, TimelineAnalysis, }; use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; -use aws_sdk_s3::Client; use futures_util::{StreamExt, TryStreamExt}; -use histogram::Histogram; use pageserver::tenant::remote_timeline_client::remote_layer_path; -use pageserver::tenant::IndexPart; +use pageserver_api::controller_api::MetadataHealthUpdateRequest; use pageserver_api::shard::TenantShardId; +use remote_storage::GenericRemoteStorage; use serde::Serialize; use utils::id::TenantId; +use utils::shard::ShardCount; -#[derive(Serialize)] +#[derive(Serialize, Default)] pub struct MetadataSummary { tenant_count: usize, timeline_count: usize, @@ -25,97 +25,18 @@ pub struct MetadataSummary { with_orphans: HashSet, indices_by_version: HashMap, - layer_count: MinMaxHisto, - timeline_size_bytes: MinMaxHisto, - layer_size_bytes: MinMaxHisto, -} - -/// A histogram plus minimum and maximum tracking -#[derive(Serialize)] -struct MinMaxHisto 
{ #[serde(skip)] - histo: Histogram, - min: u64, - max: u64, -} - -impl MinMaxHisto { - fn new() -> Self { - Self { - histo: histogram::Histogram::builder() - .build() - .expect("Bad histogram params"), - min: u64::MAX, - max: 0, - } - } - - fn sample(&mut self, v: u64) -> Result<(), histogram::Error> { - self.min = std::cmp::min(self.min, v); - self.max = std::cmp::max(self.max, v); - let r = self.histo.increment(v, 1); - - if r.is_err() { - tracing::warn!("Bad histogram sample: {v}"); - } - - r - } - - fn oneline(&self) -> String { - let percentiles = match self.histo.percentiles(&[1.0, 10.0, 50.0, 90.0, 99.0]) { - Ok(p) => p, - Err(e) => return format!("No data: {}", e), - }; - - let percentiles: Vec = percentiles - .iter() - .map(|p| p.bucket().low() + p.bucket().high() / 2) - .collect(); - - format!( - "min {}, 1% {}, 10% {}, 50% {}, 90% {}, 99% {}, max {}", - self.min, - percentiles[0], - percentiles[1], - percentiles[2], - percentiles[3], - percentiles[4], - self.max, - ) - } + pub(crate) healthy_tenant_shards: HashSet, + #[serde(skip)] + pub(crate) unhealthy_tenant_shards: HashSet, } impl MetadataSummary { fn new() -> Self { - Self { - tenant_count: 0, - timeline_count: 0, - timeline_shard_count: 0, - with_errors: HashSet::new(), - with_warnings: HashSet::new(), - with_orphans: HashSet::new(), - indices_by_version: HashMap::new(), - layer_count: MinMaxHisto::new(), - timeline_size_bytes: MinMaxHisto::new(), - layer_size_bytes: MinMaxHisto::new(), - } + Self::default() } - fn update_histograms(&mut self, index_part: &IndexPart) -> Result<(), histogram::Error> { - self.layer_count - .sample(index_part.layer_metadata.len() as u64)?; - let mut total_size: u64 = 0; - for meta in index_part.layer_metadata.values() { - total_size += meta.file_size; - self.layer_size_bytes.sample(meta.file_size)?; - } - self.timeline_size_bytes.sample(total_size)?; - - Ok(()) - } - - fn update_data(&mut self, data: &S3TimelineBlobData) { + fn update_data(&mut self, data: 
&RemoteTimelineBlobData) { self.timeline_shard_count += 1; if let BlobDataParseResult::Parsed { index_part, @@ -127,18 +48,17 @@ impl MetadataSummary { .indices_by_version .entry(index_part.version()) .or_insert(0) += 1; - - if let Err(e) = self.update_histograms(index_part) { - // Value out of range? Warn that the results are untrustworthy - tracing::warn!( - "Error updating histograms, summary stats may be wrong: {}", - e - ); - } } } fn update_analysis(&mut self, id: &TenantShardTimelineId, analysis: &TimelineAnalysis) { + if analysis.is_healthy() { + self.healthy_tenant_shards.insert(id.tenant_shard_id); + } else { + self.healthy_tenant_shards.remove(&id.tenant_shard_id); + self.unhealthy_tenant_shards.insert(id.tenant_shard_id); + } + if !analysis.errors.is_empty() { self.with_errors.insert(*id); } @@ -169,9 +89,6 @@ With errors: {} With warnings: {} With orphan layers: {} Index versions: {version_summary} -Timeline size bytes: {} -Layer size bytes: {} -Timeline layer count: {} ", self.tenant_count, self.timeline_count, @@ -179,9 +96,6 @@ Timeline layer count: {} self.with_errors.len(), self.with_warnings.len(), self.with_orphans.len(), - self.timeline_size_bytes.oneline(), - self.layer_size_bytes.oneline(), - self.layer_count.oneline(), ) } @@ -192,17 +106,24 @@ Timeline layer count: {} pub fn is_empty(&self) -> bool { self.timeline_shard_count == 0 } + + pub fn build_health_update_request(&self) -> MetadataHealthUpdateRequest { + MetadataHealthUpdateRequest { + healthy_tenant_shards: self.healthy_tenant_shards.clone(), + unhealthy_tenant_shards: self.unhealthy_tenant_shards.clone(), + } + } } /// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics. 
-pub async fn scan_metadata( +pub async fn scan_pageserver_metadata( bucket_config: BucketConfig, tenant_ids: Vec, ) -> anyhow::Result { - let (s3_client, target) = init_remote(bucket_config, NodeKind::Pageserver)?; + let (remote_client, target) = init_remote(bucket_config, NodeKind::Pageserver).await?; let tenants = if tenant_ids.is_empty() { - futures::future::Either::Left(stream_tenants(&s3_client, &target)) + futures::future::Either::Left(stream_tenants(&remote_client, &target)) } else { futures::future::Either::Right(futures::stream::iter(tenant_ids.into_iter().map(Ok))) }; @@ -212,20 +133,20 @@ pub async fn scan_metadata( const CONCURRENCY: usize = 32; // Generate a stream of TenantTimelineId - let timelines = tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, t)); + let timelines = tenants.map_ok(|t| stream_tenant_timelines(&remote_client, &target, t)); let timelines = timelines.try_buffered(CONCURRENCY); let timelines = timelines.try_flatten(); // Generate a stream of S3TimelineBlobData async fn report_on_timeline( - s3_client: &Client, + remote_client: &GenericRemoteStorage, target: &RootTarget, ttid: TenantShardTimelineId, - ) -> anyhow::Result<(TenantShardTimelineId, S3TimelineBlobData)> { - let data = list_timeline_blobs(s3_client, ttid, target).await?; + ) -> anyhow::Result<(TenantShardTimelineId, RemoteTimelineBlobData)> { + let data = list_timeline_blobs(remote_client, ttid, target).await?; Ok((ttid, data)) } - let timelines = timelines.map_ok(|ttid| report_on_timeline(&s3_client, &target, ttid)); + let timelines = timelines.map_ok(|ttid| report_on_timeline(&remote_client, &target, ttid)); let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); // We must gather all the TenantShardTimelineId->S3TimelineBlobData for each tenant, because different @@ -235,33 +156,58 @@ pub async fn scan_metadata( let mut tenant_objects = TenantObjectListing::default(); let mut tenant_timeline_results = Vec::new(); - fn analyze_tenant( + 
async fn analyze_tenant( + remote_client: &GenericRemoteStorage, tenant_id: TenantId, summary: &mut MetadataSummary, mut tenant_objects: TenantObjectListing, - timelines: Vec<(TenantShardTimelineId, S3TimelineBlobData)>, + timelines: Vec<(TenantShardTimelineId, RemoteTimelineBlobData)>, + highest_shard_count: ShardCount, ) { summary.tenant_count += 1; let mut timeline_ids = HashSet::new(); let mut timeline_generations = HashMap::new(); for (ttid, data) in timelines { - timeline_ids.insert(ttid.timeline_id); - // Stash the generation of each timeline, for later use identifying orphan layers - if let BlobDataParseResult::Parsed { - index_part: _index_part, - index_part_generation, - s3_layers: _s3_layers, - } = &data.blob_data - { - timeline_generations.insert(ttid, *index_part_generation); - } + if ttid.tenant_shard_id.shard_count == highest_shard_count { + // Only analyze `TenantShardId`s with highest shard count. - // Apply checks to this timeline shard's metadata, and in the process update `tenant_objects` - // reference counts for layers across the tenant. - let analysis = - branch_cleanup_and_check_errors(&ttid, &mut tenant_objects, None, None, Some(data)); - summary.update_analysis(&ttid, &analysis); + // Stash the generation of each timeline, for later use identifying orphan layers + if let BlobDataParseResult::Parsed { + index_part, + index_part_generation, + s3_layers: _s3_layers, + } = &data.blob_data + { + if index_part.deleted_at.is_some() { + // skip deleted timeline. + tracing::info!("Skip analysis of {} b/c timeline is already deleted", ttid); + continue; + } + timeline_generations.insert(ttid, *index_part_generation); + } + + // Apply checks to this timeline shard's metadata, and in the process update `tenant_objects` + // reference counts for layers across the tenant. 
+ let analysis = branch_cleanup_and_check_errors( + remote_client, + &ttid, + &mut tenant_objects, + None, + None, + Some(data), + ) + .await; + summary.update_analysis(&ttid, &analysis); + + timeline_ids.insert(ttid.timeline_id); + } else { + tracing::info!( + "Skip analysis of {} b/c a lower shard count than {}", + ttid, + highest_shard_count.0, + ); + } } summary.timeline_count += timeline_ids.len(); @@ -309,40 +255,67 @@ pub async fn scan_metadata( // all results for the same tenant will be adjacent. We accumulate these, // and then call `analyze_tenant` to flush, when we see the next tenant ID. let mut summary = MetadataSummary::new(); + let mut highest_shard_count = ShardCount::MIN; while let Some(i) = timelines.next().await { let (ttid, data) = i?; summary.update_data(&data); match tenant_id { - None => tenant_id = Some(ttid.tenant_shard_id.tenant_id), + None => { + tenant_id = Some(ttid.tenant_shard_id.tenant_id); + highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count); + } Some(prev_tenant_id) => { if prev_tenant_id != ttid.tenant_shard_id.tenant_id { + // New tenant: analyze this tenant's timelines, clear accumulated tenant_timeline_results let tenant_objects = std::mem::take(&mut tenant_objects); let timelines = std::mem::take(&mut tenant_timeline_results); - analyze_tenant(prev_tenant_id, &mut summary, tenant_objects, timelines); + analyze_tenant( + &remote_client, + prev_tenant_id, + &mut summary, + tenant_objects, + timelines, + highest_shard_count, + ) + .await; tenant_id = Some(ttid.tenant_shard_id.tenant_id); + highest_shard_count = ttid.tenant_shard_id.shard_count; + } else { + highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count); } } } - if let BlobDataParseResult::Parsed { - index_part: _index_part, - index_part_generation: _index_part_generation, - s3_layers, - } = &data.blob_data - { - tenant_objects.push(ttid, s3_layers.clone()); + match &data.blob_data { + BlobDataParseResult::Parsed { + 
index_part: _index_part, + index_part_generation: _index_part_generation, + s3_layers, + } => { + tenant_objects.push(ttid, s3_layers.clone()); + } + BlobDataParseResult::Relic => (), + BlobDataParseResult::Incorrect { + errors: _, + s3_layers, + } => { + tenant_objects.push(ttid, s3_layers.clone()); + } } tenant_timeline_results.push((ttid, data)); } if !tenant_timeline_results.is_empty() { analyze_tenant( + &remote_client, tenant_id.expect("Must be set if results are present"), &mut summary, tenant_objects, tenant_timeline_results, - ); + highest_shard_count, + ) + .await; } Ok(summary) diff --git a/storage_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs index 24051b03de..1a9f3d0ef5 100644 --- a/storage_scrubber/src/scan_safekeeper_metadata.rs +++ b/storage_scrubber/src/scan_safekeeper_metadata.rs @@ -1,10 +1,10 @@ use std::{collections::HashSet, str::FromStr, sync::Arc}; -use aws_sdk_s3::Client; use futures::stream::{StreamExt, TryStreamExt}; use once_cell::sync::OnceCell; use pageserver_api::shard::TenantShardId; use postgres_ffi::{XLogFileName, PG_TLI}; +use remote_storage::GenericRemoteStorage; use serde::Serialize; use tokio_postgres::types::PgLsn; use tracing::{error, info, trace}; @@ -106,7 +106,7 @@ pub async fn scan_safekeeper_metadata( let timelines = client.query(&query, &[]).await?; info!("loaded {} timelines", timelines.len()); - let (s3_client, target) = init_remote(bucket_config, NodeKind::Safekeeper)?; + let (remote_client, target) = init_remote(bucket_config, NodeKind::Safekeeper).await?; let console_config = ConsoleConfig::from_env()?; let cloud_admin_api_client = CloudAdminApiClient::new(console_config); @@ -119,7 +119,7 @@ pub async fn scan_safekeeper_metadata( let backup_lsn: Lsn = Lsn(u64::from(backup_lsn_pg)); let ttid = TenantTimelineId::new(tenant_id, timeline_id); check_timeline( - &s3_client, + &remote_client, &target, &cloud_admin_api_client, ttid, @@ -156,7 +156,7 @@ struct 
TimelineCheckResult { /// errors are logged to stderr; returns Ok(true) if timeline is consistent, /// Ok(false) if not, Err if failed to check. async fn check_timeline( - s3_client: &Client, + remote_client: &GenericRemoteStorage, root: &RootTarget, api_client: &CloudAdminApiClient, ttid: TenantTimelineId, @@ -187,13 +187,19 @@ async fn check_timeline( // we need files, so unset it. timeline_dir_target.delimiter = String::new(); - let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target)); + let prefix_str = &timeline_dir_target + .prefix_in_bucket + .strip_prefix("/") + .unwrap_or(&timeline_dir_target.prefix_in_bucket); + + let mut stream = std::pin::pin!(stream_listing(remote_client, &timeline_dir_target)); while let Some(obj) = stream.next().await { - let obj = obj?; - let key = obj.key(); + let (key, _obj) = obj?; let seg_name = key - .strip_prefix(&timeline_dir_target.prefix_in_bucket) + .get_path() + .as_str() + .strip_prefix(prefix_str) .expect("failed to extract segment name"); expected_segfiles.remove(seg_name); } diff --git a/storage_scrubber/src/tenant_snapshot.rs b/storage_scrubber/src/tenant_snapshot.rs index 450b337235..bb4079b5f4 100644 --- a/storage_scrubber/src/tenant_snapshot.rs +++ b/storage_scrubber/src/tenant_snapshot.rs @@ -1,10 +1,11 @@ use std::collections::HashMap; use std::sync::Arc; -use crate::checks::{list_timeline_blobs, BlobDataParseResult, S3TimelineBlobData}; +use crate::checks::{list_timeline_blobs, BlobDataParseResult, RemoteTimelineBlobData}; use crate::metadata_stream::{stream_tenant_shards, stream_tenant_timelines}; use crate::{ - download_object_to_file, init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, + download_object_to_file_s3, init_remote, init_remote_s3, BucketConfig, NodeKind, RootTarget, + TenantShardTimelineId, }; use anyhow::Context; use async_stream::stream; @@ -15,6 +16,7 @@ use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use 
pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::IndexPart; use pageserver_api::shard::TenantShardId; +use remote_storage::GenericRemoteStorage; use utils::generation::Generation; use utils::id::TenantId; @@ -28,13 +30,14 @@ pub struct SnapshotDownloader { } impl SnapshotDownloader { - pub fn new( + pub async fn new( bucket_config: BucketConfig, tenant_id: TenantId, output_path: Utf8PathBuf, concurrency: usize, ) -> anyhow::Result { - let (s3_client, s3_root) = init_remote(bucket_config.clone(), NodeKind::Pageserver)?; + let (s3_client, s3_root) = + init_remote_s3(bucket_config.clone(), NodeKind::Pageserver).await?; Ok(Self { s3_client, s3_root, @@ -91,7 +94,7 @@ impl SnapshotDownloader { let Some(version) = versions.versions.as_ref().and_then(|v| v.first()) else { return Err(anyhow::anyhow!("No versions found for {remote_layer_path}")); }; - download_object_to_file( + download_object_to_file_s3( &self.s3_client, &self.bucket_config.bucket, &remote_layer_path, @@ -215,10 +218,11 @@ impl SnapshotDownloader { } pub async fn download(&self) -> anyhow::Result<()> { - let (s3_client, target) = init_remote(self.bucket_config.clone(), NodeKind::Pageserver)?; + let (remote_client, target) = + init_remote(self.bucket_config.clone(), NodeKind::Pageserver).await?; // Generate a stream of TenantShardId - let shards = stream_tenant_shards(&s3_client, &target, self.tenant_id).await?; + let shards = stream_tenant_shards(&remote_client, &target, self.tenant_id).await?; let shards: Vec = shards.try_collect().await?; // Only read from shards that have the highest count: avoids redundantly downloading @@ -236,18 +240,19 @@ impl SnapshotDownloader { for shard in shards.into_iter().filter(|s| s.shard_count == shard_count) { // Generate a stream of TenantTimelineId - let timelines = stream_tenant_timelines(&s3_client, &self.s3_root, shard).await?; + let timelines = stream_tenant_timelines(&remote_client, &target, shard).await?; // Generate a stream of 
S3TimelineBlobData async fn load_timeline_index( - s3_client: &Client, + remote_client: &GenericRemoteStorage, target: &RootTarget, ttid: TenantShardTimelineId, - ) -> anyhow::Result<(TenantShardTimelineId, S3TimelineBlobData)> { - let data = list_timeline_blobs(s3_client, ttid, target).await?; + ) -> anyhow::Result<(TenantShardTimelineId, RemoteTimelineBlobData)> { + let data = list_timeline_blobs(remote_client, ttid, target).await?; Ok((ttid, data)) } - let timelines = timelines.map_ok(|ttid| load_timeline_index(&s3_client, &target, ttid)); + let timelines = + timelines.map_ok(|ttid| load_timeline_index(&remote_client, &target, ttid)); let mut timelines = std::pin::pin!(timelines.try_buffered(8)); while let Some(i) = timelines.next().await { @@ -268,7 +273,7 @@ impl SnapshotDownloader { .context("Downloading timeline")?; } BlobDataParseResult::Relic => {} - BlobDataParseResult::Incorrect(_) => { + BlobDataParseResult::Incorrect { .. } => { tracing::error!("Bad metadata in timeline {ttid}"); } }; @@ -277,7 +282,7 @@ impl SnapshotDownloader { for (ttid, layers) in ancestor_layers.into_iter() { tracing::info!( - "Downloading {} layers from ancvestor timeline {ttid}...", + "Downloading {} layers from ancestor timeline {ttid}...", layers.len() ); diff --git a/test_runner/README.md b/test_runner/README.md index 7d95634ea8..73aa29d4bb 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -71,8 +71,7 @@ a subdirectory for each version with naming convention `v{PG_VERSION}/`. Inside that dir, a `bin/postgres` binary should be present. `DEFAULT_PG_VERSION`: The version of Postgres to use, This is used to construct full path to the postgres binaries. -Format is 2-digit major version nubmer, i.e. `DEFAULT_PG_VERSION="14"`. Alternatively, -you can use `--pg-version` argument. +Format is 2-digit major version nubmer, i.e. `DEFAULT_PG_VERSION=16` `TEST_OUTPUT`: Set the directory where test state and test output files should go. 
`TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests. @@ -81,7 +80,7 @@ should go. Useful parameters and commands: `--preserve-database-files` to preserve pageserver (layer) and safekeer (segment) timeline files on disk -after running a test suite. Such files might be large, so removed by default; but might be useful for debugging or creation of svg images with layer file contents. +after running a test suite. Such files might be large, so removed by default; but might be useful for debugging or creation of svg images with layer file contents. If `NeonEnvBuilder#preserve_database_files` set to `True` for a particular test, the whole `repo` directory will be attached to Allure report (thus uploaded to S3) as `everything.tar.zst` for this test. Let stdout, stderr and `INFO` log messages go to the terminal instead of capturing them: `./scripts/pytest -s --log-cli-level=INFO ...` diff --git a/test_runner/conftest.py b/test_runner/conftest.py index 4b0c9ac71d..996ca4d652 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -3,6 +3,7 @@ pytest_plugins = ( "fixtures.parametrize", "fixtures.httpserver", "fixtures.compute_reconfigure", + "fixtures.storage_controller_proxy", "fixtures.neon_fixtures", "fixtures.benchmark_fixture", "fixtures.pg_stats", diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index 038f557cc8..0c36cd6ef7 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -222,6 +222,8 @@ class NeonBenchmarker: function by the zenbenchmark fixture """ + PROPERTY_PREFIX = "neon_benchmarker_" + def __init__(self, property_recorder: Callable[[str, object], None]): # property recorder here is a pytest fixture provided by junitxml module # https://docs.pytest.org/en/6.2.x/reference.html#pytest.junitxml.record_property @@ -238,7 +240,7 @@ class NeonBenchmarker: Record a benchmark result. 
""" # just to namespace the value - name = f"neon_benchmarker_{metric_name}" + name = f"{self.PROPERTY_PREFIX}_{metric_name}" self.property_recorder( name, { @@ -249,6 +251,18 @@ class NeonBenchmarker: }, ) + @classmethod + def records( + cls, user_properties: list[tuple[str, object]] + ) -> Iterator[tuple[str, dict[str, object]]]: + """ + Yield all records related to benchmarks + """ + for property_name, recorded_property in user_properties: + if property_name.startswith(cls.PROPERTY_PREFIX): + assert isinstance(recorded_property, dict) + yield recorded_property["name"], recorded_property + @contextmanager def record_duration(self, metric_name: str) -> Iterator[None]: """ @@ -425,10 +439,11 @@ def zenbenchmark( yield benchmarker results = {} - for _, recorded_property in request.node.user_properties: + for _, recorded_property in NeonBenchmarker.records(request.node.user_properties): name = recorded_property["name"] value = str(recorded_property["value"]) - if (unit := recorded_property["unit"].strip()) != "": + unit = str(recorded_property["unit"]).strip() + if unit != "": value += f" {unit}" results[name] = value @@ -477,7 +492,7 @@ def pytest_terminal_summary( for test_report in terminalreporter.stats.get("passed", []): result_entry = [] - for _, recorded_property in test_report.user_properties: + for _, recorded_property in NeonBenchmarker.records(test_report.user_properties): if not is_header_printed: terminalreporter.section("Benchmark results", "-") is_header_printed = True diff --git a/test_runner/fixtures/common_types.py b/test_runner/fixtures/common_types.py index 147264762c..7cadcbb4c2 100644 --- a/test_runner/fixtures/common_types.py +++ b/test_runner/fixtures/common_types.py @@ -1,7 +1,7 @@ import random from dataclasses import dataclass from functools import total_ordering -from typing import Any, Type, TypeVar, Union +from typing import Any, Dict, Type, TypeVar, Union T = TypeVar("T", bound="Id") @@ -143,6 +143,22 @@ class TimelineId(Id): def 
__repr__(self) -> str: return f'TimelineId("{self.id.hex()}")' + def __str__(self) -> str: + return self.id.hex() + + +@dataclass +class TenantTimelineId: + tenant_id: TenantId + timeline_id: TimelineId + + @classmethod + def from_json(cls, d: Dict[str, Any]) -> "TenantTimelineId": + return TenantTimelineId( + tenant_id=TenantId(d["tenant_id"]), + timeline_id=TimelineId(d["timeline_id"]), + ) + # Workaround for compat with python 3.9, which does not have `typing.Self` TTenantShardId = TypeVar("TTenantShardId", bound="TenantShardId") diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 429b6af548..5fe544b3bd 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -42,7 +42,11 @@ class PgCompare(ABC): pass @abstractmethod - def flush(self): + def flush(self, compact: bool = False, gc: bool = False): + pass + + @abstractmethod + def compact(self): pass @abstractmethod @@ -109,8 +113,6 @@ class NeonCompare(PgCompare): # Create tenant tenant_conf: Dict[str, str] = {} - if False: # TODO add pytest setting for this - tenant_conf["trace_read_requests"] = "true" self.tenant, _ = self.env.neon_cli.create_tenant(conf=tenant_conf) # Create timeline @@ -131,13 +133,16 @@ class NeonCompare(PgCompare): def pg_bin(self) -> PgBin: return self._pg_bin - def flush(self): + def flush(self, compact: bool = True, gc: bool = True): wait_for_last_flush_lsn(self.env, self._pg, self.tenant, self.timeline) - self.pageserver_http_client.timeline_checkpoint(self.tenant, self.timeline) - self.pageserver_http_client.timeline_gc(self.tenant, self.timeline, 0) + self.pageserver_http_client.timeline_checkpoint(self.tenant, self.timeline, compact=compact) + if gc: + self.pageserver_http_client.timeline_gc(self.tenant, self.timeline, 0) def compact(self): - self.pageserver_http_client.timeline_compact(self.tenant, self.timeline) + self.pageserver_http_client.timeline_compact( + self.tenant, 
self.timeline, wait_until_uploaded=True + ) def report_peak_memory_use(self): self.zenbenchmark.record( @@ -217,9 +222,12 @@ class VanillaCompare(PgCompare): def pg_bin(self) -> PgBin: return self._pg.pg_bin - def flush(self): + def flush(self, compact: bool = False, gc: bool = False): self.cur.execute("checkpoint") + def compact(self): + pass + def report_peak_memory_use(self): pass # TODO find something @@ -268,6 +276,9 @@ class RemoteCompare(PgCompare): # TODO: flush the remote pageserver pass + def compact(self): + pass + def report_peak_memory_use(self): # TODO: get memory usage from remote pageserver pass diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 8b8075f8c1..cda70be8da 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -118,8 +118,6 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = ( "libmetrics_launch_timestamp", "libmetrics_build_info", "libmetrics_tracing_event_count_total", - "pageserver_materialized_cache_hits_total", - "pageserver_materialized_cache_hits_direct_total", "pageserver_page_cache_read_hits_total", "pageserver_page_cache_read_accesses_total", "pageserver_page_cache_size_current_bytes", @@ -135,6 +133,8 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = ( *histogram("pageserver_remote_operation_seconds"), *histogram("pageserver_io_operations_seconds"), "pageserver_tenant_states_count", + "pageserver_circuit_breaker_broken_total", + "pageserver_circuit_breaker_unbroken_total", ) PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( @@ -146,11 +146,17 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] 
= ( "pageserver_smgr_query_seconds_bucket", "pageserver_smgr_query_seconds_count", "pageserver_smgr_query_seconds_sum", + "pageserver_archive_size", + "pageserver_pitr_history_size", + "pageserver_layer_bytes", + "pageserver_layer_count", + "pageserver_visible_physical_size", "pageserver_storage_operations_seconds_count_total", "pageserver_storage_operations_seconds_sum_total", "pageserver_evictions_total", "pageserver_evictions_with_low_residence_duration_total", "pageserver_aux_file_estimated_size", + "pageserver_valid_lsn_lease_count", *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, # "pageserver_directory_entries_count", -- only used if above a certain threshold # "pageserver_broken_tenants_count" -- used only for broken diff --git a/test_runner/fixtures/neon_api.py b/test_runner/fixtures/neon_api.py new file mode 100644 index 0000000000..0636cfad06 --- /dev/null +++ b/test_runner/fixtures/neon_api.py @@ -0,0 +1,307 @@ +from __future__ import annotations + +import time +from typing import TYPE_CHECKING, cast + +import requests + +if TYPE_CHECKING: + from typing import Any, Dict, Literal, Optional, Union + + from fixtures.pg_version import PgVersion + + +def connection_parameters_to_env(params: Dict[str, str]) -> Dict[str, str]: + return { + "PGHOST": params["host"], + "PGDATABASE": params["database"], + "PGUSER": params["role"], + "PGPASSWORD": params["password"], + } + + +class NeonAPI: + def __init__(self, neon_api_key: str, neon_api_base_url: str): + self.__neon_api_key = neon_api_key + self.__neon_api_base_url = neon_api_base_url.strip("/") + + def __request( + self, method: Union[str, bytes], endpoint: str, **kwargs: Any + ) -> requests.Response: + if "headers" not in kwargs: + kwargs["headers"] = {} + kwargs["headers"]["Authorization"] = f"Bearer {self.__neon_api_key}" + + return requests.request(method, f"{self.__neon_api_base_url}{endpoint}", **kwargs) + + def create_project( + self, + pg_version: Optional[PgVersion] = None, + name: Optional[str] 
= None, + branch_name: Optional[str] = None, + branch_role_name: Optional[str] = None, + branch_database_name: Optional[str] = None, + ) -> Dict[str, Any]: + data: Dict[str, Any] = { + "project": { + "branch": {}, + }, + } + if name: + data["project"]["name"] = name + if pg_version: + data["project"]["pg_version"] = int(pg_version) + if branch_name: + data["project"]["branch"]["name"] = branch_name + if branch_role_name: + data["project"]["branch"]["role_name"] = branch_role_name + if branch_database_name: + data["project"]["branch"]["database_name"] = branch_database_name + + resp = self.__request( + "POST", + "/projects", + headers={ + "Accept": "application/json", + "Content-Type": "application/json", + }, + json=data, + ) + + assert resp.status_code == 201 + + return cast("Dict[str, Any]", resp.json()) + + def get_project_details(self, project_id: str) -> Dict[str, Any]: + resp = self.__request( + "GET", + f"/projects/{project_id}", + headers={ + "Accept": "application/json", + "Content-Type": "application/json", + }, + ) + assert resp.status_code == 200 + return cast("Dict[str, Any]", resp.json()) + + def delete_project( + self, + project_id: str, + ) -> Dict[str, Any]: + resp = self.__request( + "DELETE", + f"/projects/{project_id}", + headers={ + "Accept": "application/json", + "Content-Type": "application/json", + }, + ) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def start_endpoint( + self, + project_id: str, + endpoint_id: str, + ) -> Dict[str, Any]: + resp = self.__request( + "POST", + f"/projects/{project_id}/endpoints/{endpoint_id}/start", + headers={ + "Accept": "application/json", + }, + ) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def suspend_endpoint( + self, + project_id: str, + endpoint_id: str, + ) -> Dict[str, Any]: + resp = self.__request( + "POST", + f"/projects/{project_id}/endpoints/{endpoint_id}/suspend", + headers={ + "Accept": "application/json", + }, + 
) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def restart_endpoint( + self, + project_id: str, + endpoint_id: str, + ) -> Dict[str, Any]: + resp = self.__request( + "POST", + f"/projects/{project_id}/endpoints/{endpoint_id}/restart", + headers={ + "Accept": "application/json", + }, + ) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def create_endpoint( + self, + project_id: str, + branch_id: str, + endpoint_type: Literal["read_write", "read_only"], + settings: Dict[str, Any], + ) -> Dict[str, Any]: + data: Dict[str, Any] = { + "endpoint": { + "branch_id": branch_id, + }, + } + + if endpoint_type: + data["endpoint"]["type"] = endpoint_type + if settings: + data["endpoint"]["settings"] = settings + + resp = self.__request( + "POST", + f"/projects/{project_id}/endpoints", + headers={ + "Accept": "application/json", + "Content-Type": "application/json", + }, + json=data, + ) + + assert resp.status_code == 201 + + return cast("Dict[str, Any]", resp.json()) + + def get_connection_uri( + self, + project_id: str, + branch_id: Optional[str] = None, + endpoint_id: Optional[str] = None, + database_name: str = "neondb", + role_name: str = "neondb_owner", + pooled: bool = True, + ) -> Dict[str, Any]: + resp = self.__request( + "GET", + f"/projects/{project_id}/connection_uri", + params={ + "branch_id": branch_id, + "endpoint_id": endpoint_id, + "database_name": database_name, + "role_name": role_name, + "pooled": pooled, + }, + headers={ + "Accept": "application/json", + }, + ) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def get_branches(self, project_id: str) -> Dict[str, Any]: + resp = self.__request( + "GET", + f"/projects/{project_id}/branches", + headers={ + "Accept": "application/json", + }, + ) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def get_endpoints(self, project_id: str) -> Dict[str, Any]: + resp = 
self.__request( + "GET", + f"/projects/{project_id}/endpoints", + headers={ + "Accept": "application/json", + }, + ) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def get_operations(self, project_id: str) -> Dict[str, Any]: + resp = self.__request( + "GET", + f"/projects/{project_id}/operations", + headers={ + "Accept": "application/json", + "Authorization": f"Bearer {self.__neon_api_key}", + }, + ) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def wait_for_operation_to_finish(self, project_id: str): + has_running = True + while has_running: + has_running = False + operations = self.get_operations(project_id)["operations"] + for op in operations: + if op["status"] in {"scheduling", "running", "cancelling"}: + has_running = True + time.sleep(0.5) + + +class NeonApiEndpoint: + def __init__(self, neon_api: NeonAPI, pg_version: PgVersion, project_id: Optional[str]): + self.neon_api = neon_api + if project_id is None: + project = neon_api.create_project(pg_version) + neon_api.wait_for_operation_to_finish(project["project"]["id"]) + self.project_id = project["project"]["id"] + self.endpoint_id = project["endpoints"][0]["id"] + self.connstr = project["connection_uris"][0]["connection_uri"] + self.pgbench_env = connection_parameters_to_env( + project["connection_uris"][0]["connection_parameters"] + ) + self.is_new = True + else: + project = neon_api.get_project_details(project_id) + if int(project["project"]["pg_version"]) != int(pg_version): + raise Exception( + f"A project with the provided ID exists, but it's not of the specified version (expected {pg_version}, got {project['project']['pg_version']})" + ) + self.project_id = project_id + eps = neon_api.get_endpoints(project_id)["endpoints"] + self.endpoint_id = eps[0]["id"] + self.connstr = neon_api.get_connection_uri( + project_id, endpoint_id=self.endpoint_id, pooled=False + )["uri"] + pw = self.connstr.split("@")[0].split(":")[-1] + 
self.pgbench_env = { + "PGHOST": eps[0]["host"], + "PGDATABASE": "neondb", + "PGUSER": "neondb_owner", + "PGPASSWORD": pw, + } + self.is_new = False + + def restart(self): + self.neon_api.restart_endpoint(self.project_id, self.endpoint_id) + self.neon_api.wait_for_operation_to_finish(self.project_id) + + def get_synthetic_storage_size(self) -> int: + return int( + self.neon_api.get_project_details(self.project_id)["project"]["synthetic_storage_size"] + ) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 49857d5151..2bb698f175 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -14,6 +14,7 @@ import textwrap import threading import time import uuid +from collections import defaultdict from contextlib import closing, contextmanager from dataclasses import dataclass from datetime import datetime @@ -23,7 +24,7 @@ from functools import cached_property, partial from itertools import chain, product from pathlib import Path from types import TracebackType -from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Type, Union, cast +from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union, cast from urllib.parse import quote, urlparse import asyncpg @@ -31,6 +32,7 @@ import backoff import httpx import jwt import psycopg2 +import psycopg2.sql import pytest import requests import toml @@ -59,12 +61,11 @@ from fixtures.pageserver.common_types import IndexPartDump, LayerName, parse_lay from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import ( wait_for_last_record_lsn, - wait_for_upload, - wait_for_upload_queue_empty, ) from fixtures.pg_version import PgVersion from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import ( + LocalFsStorage, MockS3Server, RemoteStorage, RemoteStorageKind, @@ -87,6 +88,8 @@ from fixtures.utils import ( ) from fixtures.utils import 
AuxFileStore as AuxFileStore # reexport +from .neon_api import NeonAPI, NeonApiEndpoint + """ This file contains pytest fixtures. A fixture is a test resource that can be summoned by placing its name in the test's arguments. @@ -184,6 +187,25 @@ def versioned_pg_distrib_dir(pg_distrib_dir: Path, pg_version: PgVersion) -> Ite yield versioned_dir +@pytest.fixture(scope="session") +def neon_api_key() -> str: + api_key = os.getenv("NEON_API_KEY") + if not api_key: + raise AssertionError("Set the NEON_API_KEY environment variable") + + return api_key + + +@pytest.fixture(scope="session") +def neon_api_base_url() -> str: + return os.getenv("NEON_API_BASE_URL", "https://console-stage.neon.build/api/v2") + + +@pytest.fixture(scope="session") +def neon_api(neon_api_key: str, neon_api_base_url: str) -> NeonAPI: + return NeonAPI(neon_api_key, neon_api_base_url) + + def shareable_scope(fixture_name: str, config: Config) -> Literal["session", "function"]: """Return either session of function scope, depending on TEST_SHARED_FIXTURES envvar. @@ -365,7 +387,7 @@ class PgProtocol: return self.safe_psql_many([query], **kwargs)[0] def safe_psql_many( - self, queries: List[str], log_query=True, **kwargs: Any + self, queries: Iterable[str], log_query=True, **kwargs: Any ) -> List[List[Tuple[Any, ...]]]: """ Execute queries against the node and return all rows. 
@@ -427,6 +449,7 @@ class TokenScope(str, Enum): GENERATIONS_API = "generations_api" SAFEKEEPER_DATA = "safekeeperdata" TENANT = "tenant" + SCRUBBER = "scrubber" class NeonEnvBuilder: @@ -471,6 +494,8 @@ class NeonEnvBuilder: pageserver_virtual_file_io_engine: Optional[str] = None, pageserver_aux_file_policy: Optional[AuxFileStore] = None, pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]] = None, + safekeeper_extra_opts: Optional[list[str]] = None, + storage_controller_port_override: Optional[int] = None, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -500,7 +525,7 @@ class NeonEnvBuilder: self.preserve_database_files = preserve_database_files self.initial_tenant = initial_tenant or TenantId.generate() self.initial_timeline = initial_timeline or TimelineId.generate() - self.scrub_on_exit = False + self.enable_scrub_on_exit = True self.test_output_dir = test_output_dir self.test_overlay_dir = test_overlay_dir self.overlay_mounts_created_by_us: List[Tuple[str, Path]] = [] @@ -519,23 +544,12 @@ class NeonEnvBuilder: f"Overriding pageserver default compaction algorithm to {self.pageserver_default_tenant_config_compaction_algorithm}" ) - self.pageserver_get_vectored_impl: Optional[str] = None - if os.getenv("PAGESERVER_GET_VECTORED_IMPL", "") == "vectored": - self.pageserver_get_vectored_impl = "vectored" - log.debug('Overriding pageserver get_vectored_impl config to "vectored"') - - self.pageserver_get_impl: Optional[str] = None - if os.getenv("PAGESERVER_GET_IMPL", "") == "vectored": - self.pageserver_get_impl = "vectored" - log.debug('Overriding pageserver get_impl config to "vectored"') - - self.pageserver_validate_vectored_get: Optional[bool] = None - if (validate := os.getenv("PAGESERVER_VALIDATE_VEC_GET")) is not None: - self.pageserver_validate_vectored_get = bool(validate) - log.debug(f'Overriding pageserver validate_vectored_get config to "{validate}"') - self.pageserver_aux_file_policy = 
pageserver_aux_file_policy + self.safekeeper_extra_opts = safekeeper_extra_opts + + self.storage_controller_port_override = storage_controller_port_override + assert test_name.startswith( "test_" ), "Unexpectedly instantiated from outside a test function" @@ -581,7 +595,7 @@ class NeonEnvBuilder: timeline_id=env.initial_timeline, shard_count=initial_tenant_shard_count, shard_stripe_size=initial_tenant_shard_stripe_size, - aux_file_v2=self.pageserver_aux_file_policy, + aux_file_policy=self.pageserver_aux_file_policy, ) assert env.initial_tenant == initial_tenant assert env.initial_timeline == initial_timeline @@ -703,8 +717,30 @@ class NeonEnvBuilder: self.repo_dir / "local_fs_remote_storage", ) - if (attachments_json := Path(repo_dir / "attachments.json")).exists(): - shutil.copyfile(attachments_json, self.repo_dir / attachments_json.name) + # restore storage controller (the db is small, don't bother with overlayfs) + storcon_db_from_dir = repo_dir / "storage_controller_db" + storcon_db_to_dir = self.repo_dir / "storage_controller_db" + log.info(f"Copying storage_controller_db from {storcon_db_from_dir} to {storcon_db_to_dir}") + assert storcon_db_from_dir.is_dir() + assert not storcon_db_to_dir.exists() + + def ignore_postgres_log(path: str, _names): + if Path(path) == storcon_db_from_dir: + return {"postgres.log"} + return set() + + shutil.copytree(storcon_db_from_dir, storcon_db_to_dir, ignore=ignore_postgres_log) + assert not (storcon_db_to_dir / "postgres.log").exists() + # NB: neon_local rewrites postgresql.conf on each start based on neon_local config. No need to patch it. + # However, in this new NeonEnv, the pageservers listen on different ports, and the storage controller + # will currently reject re-attach requests from them because the NodeMetadata isn't identical. + # So, from_repo_dir patches up the the storcon database. 
+ patch_script_path = self.repo_dir / "storage_controller_db.startup.sql" + assert not patch_script_path.exists() + patch_script = "" + for ps in self.env.pageservers: + patch_script += f"UPDATE nodes SET listen_http_port={ps.service_port.http}, listen_pg_port={ps.service_port.pg} WHERE node_id = '{ps.id}';" + patch_script_path.write_text(patch_script) # Update the config with info about tenants and timelines with (self.repo_dir / "config").open("r") as f: @@ -805,6 +841,13 @@ class NeonEnvBuilder: ) ident_state_dir.rmdir() # should be empty since we moved `upper` out + def disable_scrub_on_exit(self): + """ + Some tests intentionally leave the remote storage contents empty or corrupt, + so it doesn't make sense to do the usual scrub at the end of the test. + """ + self.enable_scrub_on_exit = False + def overlay_cleanup_teardown(self): """ Unmount the overlayfs mounts created by `self.overlay_mount()`. @@ -830,23 +873,6 @@ class NeonEnvBuilder: # assert all overlayfs mounts in our test directory are gone assert [] == list(overlayfs.iter_mounts_beneath(self.test_overlay_dir)) - def enable_scrub_on_exit(self): - """ - Call this if you would like the fixture to automatically run - storage_scrubber at the end of the test, as a bidirectional test - that the scrubber is working properly, and that the code within - the test didn't produce any invalid remote state. - """ - - if not isinstance(self.pageserver_remote_storage, S3Storage): - # The scrubber can't talk to e.g. LocalFS -- it needs - # an HTTP endpoint (mock is fine) to connect to. 
- raise RuntimeError( - "Cannot scrub with remote_storage={self.pageserver_remote_storage}, require an S3 endpoint" - ) - - self.scrub_on_exit = True - def enable_pageserver_remote_storage( self, remote_storage_kind: RemoteStorageKind, @@ -939,16 +965,26 @@ class NeonEnvBuilder: if self.env: log.info("Cleaning up all storage and compute nodes") self.env.stop( - immediate=True, + immediate=False, # if the test threw an exception, don't check for errors # as a failing assertion would cause the cleanup below to fail ps_assert_metric_no_errors=(exc_type is None), + # do not fail on endpoint errors to allow the rest of cleanup to proceed + fail_on_endpoint_errors=False, ) cleanup_error = None - if self.scrub_on_exit: + # If we are running with S3Storage (required by the scrubber), check that whatever the test + # did does not generate any corruption + if ( + isinstance(self.env.pageserver_remote_storage, S3Storage) + and self.enable_scrub_on_exit + ): try: - StorageScrubber(self).scan_metadata() + healthy, _ = self.env.storage_scrubber.scan_metadata() + if not healthy: + e = Exception("Remote storage metadata corrupted") + cleanup_error = e except Exception as e: log.error(f"Error during remote storage scrub: {e}") cleanup_error = e @@ -1019,6 +1055,7 @@ class NeonEnv: """ BASE_PAGESERVER_ID = 1 + storage_controller: NeonStorageController | NeonProxiedStorageController def __init__(self, config: NeonEnvBuilder): self.repo_dir = config.repo_dir @@ -1049,27 +1086,41 @@ class NeonEnv: self.initial_tenant = config.initial_tenant self.initial_timeline = config.initial_timeline - # Find two adjacent ports for storage controller and its postgres DB. This - # loop would eventually throw from get_port() if we run out of ports (extremely - # unlikely): usually we find two adjacent free ports on the first iteration. 
- while True: - self.storage_controller_port = self.port_distributor.get_port() - storage_controller_pg_port = self.port_distributor.get_port() - if storage_controller_pg_port == self.storage_controller_port + 1: - break - # The URL for the pageserver to use as its control_plane_api config - self.control_plane_api: str = f"http://127.0.0.1:{self.storage_controller_port}/upcall/v1" - # The base URL of the storage controller - self.storage_controller_api: str = f"http://127.0.0.1:{self.storage_controller_port}" + if config.storage_controller_port_override is not None: + log.info( + f"Using storage controller api override {config.storage_controller_port_override}" + ) + + self.storage_controller_port = config.storage_controller_port_override + self.storage_controller = NeonProxiedStorageController( + self, config.storage_controller_port_override, config.auth_enabled + ) + else: + # Find two adjacent ports for storage controller and its postgres DB. This + # loop would eventually throw from get_port() if we run out of ports (extremely + # unlikely): usually we find two adjacent free ports on the first iteration. 
+ while True: + storage_controller_port = self.port_distributor.get_port() + storage_controller_pg_port = self.port_distributor.get_port() + if storage_controller_pg_port == storage_controller_port + 1: + break + + self.storage_controller_port = storage_controller_port + self.storage_controller = NeonStorageController( + self, storage_controller_port, config.auth_enabled + ) + + log.info( + f"Using generated control_plane_api: {self.storage_controller.upcall_api_endpoint()}" + ) + + self.storage_controller_api: str = self.storage_controller.api_root() + self.control_plane_api: str = self.storage_controller.upcall_api_endpoint() # For testing this with a fake HTTP server, enable passing through a URL from config self.control_plane_compute_hook_api = config.control_plane_compute_hook_api - self.storage_controller: NeonStorageController = NeonStorageController( - self, config.auth_enabled - ) - self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine self.pageserver_aux_file_policy = config.pageserver_aux_file_policy @@ -1112,12 +1163,6 @@ class NeonEnv: } if self.pageserver_virtual_file_io_engine is not None: ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine - if config.pageserver_get_vectored_impl is not None: - ps_cfg["get_vectored_impl"] = config.pageserver_get_vectored_impl - if config.pageserver_get_impl is not None: - ps_cfg["get_impl"] = config.pageserver_get_impl - if config.pageserver_validate_vectored_get is not None: - ps_cfg["validate_vectored_get"] = config.pageserver_validate_vectored_get if config.pageserver_default_tenant_config_compaction_algorithm is not None: tenant_config = ps_cfg.setdefault("tenant_config", {}) tenant_config[ @@ -1167,20 +1212,27 @@ class NeonEnv: if config.auth_enabled: sk_cfg["auth_enabled"] = True if self.safekeepers_remote_storage is not None: - sk_cfg["remote_storage"] = self.safekeepers_remote_storage.to_toml_inline_table() - self.safekeepers.append(Safekeeper(env=self, 
id=id, port=port)) + sk_cfg[ + "remote_storage" + ] = self.safekeepers_remote_storage.to_toml_inline_table().strip() + self.safekeepers.append( + Safekeeper(env=self, id=id, port=port, extra_opts=config.safekeeper_extra_opts) + ) cfg["safekeepers"].append(sk_cfg) + # Scrubber instance for tests that use it, and for use during teardown checks + self.storage_scrubber = StorageScrubber(self, log_dir=config.test_output_dir) + log.info(f"Config: {cfg}") self.neon_cli.init( cfg, force=config.config_init_force, ) - def start(self): + def start(self, timeout_in_seconds: Optional[int] = None): # Storage controller starts first, so that pageserver /re-attach calls don't # bounce through retries on startup - self.storage_controller.start() + self.storage_controller.start(timeout_in_seconds=timeout_in_seconds) # Wait for storage controller readiness to prevent unnecessary post start-up # reconcile. @@ -1196,32 +1248,76 @@ class NeonEnv: ) # The `or None` is for the linter for pageserver in self.pageservers: - futs.append(executor.submit(lambda ps=pageserver: ps.start())) + futs.append( + executor.submit( + lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds) + ) + ) for safekeeper in self.safekeepers: - futs.append(executor.submit(lambda sk=safekeeper: sk.start())) + futs.append( + executor.submit( + lambda sk=safekeeper: sk.start(timeout_in_seconds=timeout_in_seconds) + ) + ) for f in futs: f.result() - def stop(self, immediate=False, ps_assert_metric_no_errors=False): + def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoint_errors=True): """ After this method returns, there should be no child processes running. + + Unless of course, some stopping failed, in that case, all remaining child processes are leaked. """ - self.endpoints.stop_all() + + # the commonly failing components have special try-except behavior, + # trying to get us to actually shutdown all processes over easier error + # reporting. 
+ + raise_later = None + try: + self.endpoints.stop_all(fail_on_endpoint_errors) + except Exception as e: + raise_later = e # Stop storage controller before pageservers: we don't want it to spuriously # detect a pageserver "failure" during test teardown self.storage_controller.stop(immediate=immediate) + stop_later = [] + metric_errors = [] + for sk in self.safekeepers: sk.stop(immediate=immediate) for pageserver in self.pageservers: if ps_assert_metric_no_errors: - pageserver.assert_no_metric_errors() - pageserver.stop(immediate=immediate) + try: + pageserver.assert_no_metric_errors() + except Exception as e: + metric_errors.append(e) + log.error(f"metric validation failed on {pageserver.id}: {e}") + try: + pageserver.stop(immediate=immediate) + except RuntimeError: + stop_later.append(pageserver) self.broker.stop(immediate=immediate) + # TODO: for nice logging we need python 3.11 ExceptionGroup + for ps in stop_later: + ps.stop(immediate=True) + + if raise_later is not None: + raise raise_later + + for error in metric_errors: + raise error + + if len(stop_later) > 0: + raise RuntimeError( + f"{len(stop_later)} out of {len(self.pageservers)} pageservers failed to stop gracefully" + ) + @property def pageserver(self) -> NeonPageserver: """ @@ -1355,7 +1451,7 @@ def _shared_simple_env( pg_distrib_dir=pg_distrib_dir, pg_version=pg_version, run_id=run_id, - preserve_database_files=pytestconfig.getoption("--preserve-database-files"), + preserve_database_files=cast(bool, pytestconfig.getoption("--preserve-database-files")), test_name=request.node.name, test_output_dir=test_output_dir, pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, @@ -1402,6 +1498,7 @@ def neon_env_builder( pageserver_virtual_file_io_engine: str, pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]], pageserver_aux_file_policy: Optional[AuxFileStore], + record_property: Callable[[str, object], None], ) -> Iterator[NeonEnvBuilder]: """ Fixture to create a 
Neon environment for test. @@ -1430,7 +1527,7 @@ def neon_env_builder( pg_version=pg_version, broker=default_broker, run_id=run_id, - preserve_database_files=pytestconfig.getoption("--preserve-database-files"), + preserve_database_files=cast(bool, pytestconfig.getoption("--preserve-database-files")), pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, test_name=request.node.name, test_output_dir=test_output_dir, @@ -1439,6 +1536,9 @@ def neon_env_builder( pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, ) as builder: yield builder + # Propogate `preserve_database_files` to make it possible to use in other fixtures, + # like `test_output_dir` fixture for attaching all database files to Allure report. + record_property("preserve_database_files", builder.preserve_database_files) @dataclass @@ -1596,7 +1696,7 @@ class NeonCli(AbstractNeonCli): shard_stripe_size: Optional[int] = None, placement_policy: Optional[str] = None, set_default: bool = False, - aux_file_v2: Optional[AuxFileStore] = None, + aux_file_policy: Optional[AuxFileStore] = None, ) -> Tuple[TenantId, TimelineId]: """ Creates a new tenant, returns its id and its initial timeline's id. 
@@ -1621,13 +1721,11 @@ class NeonCli(AbstractNeonCli): ) ) - if aux_file_v2 is AuxFileStore.V2: + if aux_file_policy is AuxFileStore.V2: args.extend(["-c", "switch_aux_file_policy:v2"]) - - if aux_file_v2 is AuxFileStore.V1: + elif aux_file_policy is AuxFileStore.V1: args.extend(["-c", "switch_aux_file_policy:v1"]) - - if aux_file_v2 is AuxFileStore.CrossValidation: + elif aux_file_policy is AuxFileStore.CrossValidation: args.extend(["-c", "switch_aux_file_policy:cross-validation"]) if set_default: @@ -1783,22 +1881,38 @@ class NeonCli(AbstractNeonCli): res.check_returncode() return res - def storage_controller_start(self): + def storage_controller_start( + self, + timeout_in_seconds: Optional[int] = None, + instance_id: Optional[int] = None, + base_port: Optional[int] = None, + ): cmd = ["storage_controller", "start"] + if timeout_in_seconds is not None: + cmd.append(f"--start-timeout={timeout_in_seconds}s") + if instance_id is not None: + cmd.append(f"--instance-id={instance_id}") + if base_port is not None: + cmd.append(f"--base-port={base_port}") return self.raw_cli(cmd) - def storage_controller_stop(self, immediate: bool): + def storage_controller_stop(self, immediate: bool, instance_id: Optional[int] = None): cmd = ["storage_controller", "stop"] if immediate: cmd.extend(["-m", "immediate"]) + if instance_id is not None: + cmd.append(f"--instance-id={instance_id}") return self.raw_cli(cmd) def pageserver_start( self, id: int, extra_env_vars: Optional[Dict[str, str]] = None, + timeout_in_seconds: Optional[int] = None, ) -> "subprocess.CompletedProcess[str]": start_args = ["pageserver", "start", f"--id={id}"] + if timeout_in_seconds is not None: + start_args.append(f"--start-timeout={timeout_in_seconds}s") storage = self.env.pageserver_remote_storage if isinstance(storage, S3Storage): @@ -1816,7 +1930,10 @@ class NeonCli(AbstractNeonCli): return self.raw_cli(cmd) def safekeeper_start( - self, id: int, extra_opts: Optional[List[str]] = None + self, + id: int, + 
extra_opts: Optional[List[str]] = None, + timeout_in_seconds: Optional[int] = None, ) -> "subprocess.CompletedProcess[str]": s3_env_vars = None if isinstance(self.env.safekeepers_remote_storage, S3Storage): @@ -1826,6 +1943,8 @@ class NeonCli(AbstractNeonCli): extra_opts = [f"-e={opt}" for opt in extra_opts] else: extra_opts = [] + if timeout_in_seconds is not None: + extra_opts.append(f"--start-timeout={timeout_in_seconds}s") return self.raw_cli( ["safekeeper", "start", str(id), *extra_opts], extra_env_vars=s3_env_vars ) @@ -1888,11 +2007,15 @@ class NeonCli(AbstractNeonCli): remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None, allow_multiple=False, + basebackup_request_tries: Optional[int] = None, ) -> "subprocess.CompletedProcess[str]": args = [ "endpoint", "start", ] + extra_env_vars = {} + if basebackup_request_tries is not None: + extra_env_vars["NEON_COMPUTE_TESTING_BASEBACKUP_TRIES"] = str(basebackup_request_tries) if remote_ext_config is not None: args.extend(["--remote-ext-config", remote_ext_config]) @@ -1905,7 +2028,7 @@ class NeonCli(AbstractNeonCli): if allow_multiple: args.extend(["--allow-multiple"]) - res = self.raw_cli(args) + res = self.raw_cli(args, extra_env_vars) res.check_returncode() return res @@ -1914,6 +2037,7 @@ class NeonCli(AbstractNeonCli): endpoint_id: str, tenant_id: Optional[TenantId] = None, pageserver_id: Optional[int] = None, + safekeepers: Optional[List[int]] = None, check_return_code=True, ) -> "subprocess.CompletedProcess[str]": args = ["endpoint", "reconfigure", endpoint_id] @@ -1921,6 +2045,8 @@ class NeonCli(AbstractNeonCli): args.extend(["--tenant-id", str(tenant_id)]) if pageserver_id is not None: args.extend(["--pageserver-id", str(pageserver_id)]) + if safekeepers is not None: + args.extend(["--safekeepers", (",".join(map(str, safekeepers)))]) return self.raw_cli(args, check_return_code=check_return_code) def endpoint_stop( @@ -2069,17 +2195,47 @@ class StorageControllerApiException(Exception): 
self.status_code = status_code +# See libs/pageserver_api/src/controller_api.rs +# for the rust definitions of the enums below +# TODO: Replace with `StrEnum` when we upgrade to python 3.11 +class PageserverAvailability(str, Enum): + ACTIVE = "Active" + UNAVAILABLE = "Unavailable" + OFFLINE = "Offline" + + +class PageserverSchedulingPolicy(str, Enum): + ACTIVE = "Active" + DRAINING = "Draining" + FILLING = "Filling" + PAUSE = "Pause" + PAUSE_FOR_RESTART = "PauseForRestart" + + +class StorageControllerLeadershipStatus(str, Enum): + LEADER = "leader" + STEPPED_DOWN = "stepped_down" + CANDIDATE = "candidate" + + class NeonStorageController(MetricsGetter, LogUtils): - def __init__(self, env: NeonEnv, auth_enabled: bool): + def __init__(self, env: NeonEnv, port: int, auth_enabled: bool): self.env = env + self.port: int = port + self.api: str = f"http://127.0.0.1:{port}" self.running = False self.auth_enabled = auth_enabled self.allowed_errors: list[str] = DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS - self.logfile = self.workdir / "storage_controller.log" + self.logfile = self.env.repo_dir / "storage_controller_1" / "storage_controller.log" - def start(self): + def start( + self, + timeout_in_seconds: Optional[int] = None, + instance_id: Optional[int] = None, + base_port: Optional[int] = None, + ): assert not self.running - self.env.neon_cli.storage_controller_start() + self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port) self.running = True return self @@ -2089,6 +2245,27 @@ class NeonStorageController(MetricsGetter, LogUtils): self.running = False return self + def upcall_api_endpoint(self) -> str: + return f"{self.api}/upcall/v1" + + def api_root(self) -> str: + return self.api + + @staticmethod + def retryable_node_operation(op, ps_id, max_attempts, backoff): + while max_attempts > 0: + try: + op(ps_id) + return + except StorageControllerApiException as e: + max_attempts -= 1 + log.info(f"Operation failed ({max_attempts} attempts 
left): {e}") + + if max_attempts == 0: + raise e + + time.sleep(backoff) + @staticmethod def raise_api_exception(res: requests.Response): try: @@ -2102,7 +2279,9 @@ class NeonStorageController(MetricsGetter, LogUtils): def assert_no_errors(self): assert_no_errors( - self.env.repo_dir / "storage_controller.log", "storage_controller", self.allowed_errors + self.logfile, + "storage_controller", + self.allowed_errors, ) def pageserver_api(self) -> PageserverHttpClient: @@ -2114,7 +2293,7 @@ class NeonStorageController(MetricsGetter, LogUtils): auth_token = None if self.auth_enabled: auth_token = self.env.auth_keys.generate_token(scope=TokenScope.PAGE_SERVER_API) - return PageserverHttpClient(self.env.storage_controller_port, lambda: True, auth_token) + return PageserverHttpClient(self.port, lambda: True, auth_token) def request(self, method, *args, **kwargs) -> requests.Response: resp = requests.request(method, *args, **kwargs) @@ -2131,13 +2310,13 @@ class NeonStorageController(MetricsGetter, LogUtils): return headers def get_metrics(self) -> Metrics: - res = self.request("GET", f"{self.env.storage_controller_api}/metrics") + res = self.request("GET", f"{self.api}/metrics") return parse_metrics(res.text) def ready(self) -> bool: status = None try: - resp = self.request("GET", f"{self.env.storage_controller_api}/ready") + resp = self.request("GET", f"{self.api}/ready") status = resp.status_code except StorageControllerApiException as e: status = e.status_code @@ -2170,7 +2349,7 @@ class NeonStorageController(MetricsGetter, LogUtils): response = self.request( "POST", - f"{self.env.storage_controller_api}/debug/v1/attach-hook", + f"{self.api}/debug/v1/attach-hook", json=body, headers=self.headers(TokenScope.ADMIN), ) @@ -2181,7 +2360,7 @@ class NeonStorageController(MetricsGetter, LogUtils): def attach_hook_drop(self, tenant_shard_id: Union[TenantId, TenantShardId]): self.request( "POST", - f"{self.env.storage_controller_api}/debug/v1/attach-hook", + 
f"{self.api}/debug/v1/attach-hook", json={"tenant_shard_id": str(tenant_shard_id), "node_id": None}, headers=self.headers(TokenScope.ADMIN), ) @@ -2192,7 +2371,7 @@ class NeonStorageController(MetricsGetter, LogUtils): """ response = self.request( "POST", - f"{self.env.storage_controller_api}/debug/v1/inspect", + f"{self.api}/debug/v1/inspect", json={"tenant_shard_id": str(tenant_shard_id)}, headers=self.headers(TokenScope.ADMIN), ) @@ -2215,16 +2394,32 @@ class NeonStorageController(MetricsGetter, LogUtils): log.info(f"node_register({body})") self.request( "POST", - f"{self.env.storage_controller_api}/control/v1/node", + f"{self.api}/control/v1/node", json=body, headers=self.headers(TokenScope.ADMIN), ) + def node_delete(self, node_id): + log.info(f"node_delete({node_id})") + self.request( + "DELETE", + f"{self.api}/control/v1/node/{node_id}", + headers=self.headers(TokenScope.ADMIN), + ) + def node_drain(self, node_id): log.info(f"node_drain({node_id})") self.request( "PUT", - f"{self.env.storage_controller_api}/control/v1/node/{node_id}/drain", + f"{self.api}/control/v1/node/{node_id}/drain", + headers=self.headers(TokenScope.ADMIN), + ) + + def cancel_node_drain(self, node_id): + log.info(f"cancel_node_drain({node_id})") + self.request( + "DELETE", + f"{self.api}/control/v1/node/{node_id}/drain", headers=self.headers(TokenScope.ADMIN), ) @@ -2232,14 +2427,30 @@ class NeonStorageController(MetricsGetter, LogUtils): log.info(f"node_fill({node_id})") self.request( "PUT", - f"{self.env.storage_controller_api}/control/v1/node/{node_id}/fill", + f"{self.api}/control/v1/node/{node_id}/fill", + headers=self.headers(TokenScope.ADMIN), + ) + + def cancel_node_fill(self, node_id): + log.info(f"cancel_node_fill({node_id})") + self.request( + "DELETE", + f"{self.api}/control/v1/node/{node_id}/fill", headers=self.headers(TokenScope.ADMIN), ) def node_status(self, node_id): response = self.request( "GET", - f"{self.env.storage_controller_api}/control/v1/node/{node_id}", + 
f"{self.api}/control/v1/node/{node_id}", + headers=self.headers(TokenScope.ADMIN), + ) + return response.json() + + def get_leader(self): + response = self.request( + "GET", + f"{self.api}/control/v1/leader", headers=self.headers(TokenScope.ADMIN), ) return response.json() @@ -2247,7 +2458,7 @@ class NeonStorageController(MetricsGetter, LogUtils): def node_list(self): response = self.request( "GET", - f"{self.env.storage_controller_api}/control/v1/node", + f"{self.api}/control/v1/node", headers=self.headers(TokenScope.ADMIN), ) return response.json() @@ -2255,7 +2466,7 @@ class NeonStorageController(MetricsGetter, LogUtils): def tenant_list(self): response = self.request( "GET", - f"{self.env.storage_controller_api}/debug/v1/tenant", + f"{self.api}/debug/v1/tenant", headers=self.headers(TokenScope.ADMIN), ) return response.json() @@ -2265,7 +2476,7 @@ class NeonStorageController(MetricsGetter, LogUtils): body["node_id"] = node_id self.request( "PUT", - f"{self.env.storage_controller_api}/control/v1/node/{node_id}/config", + f"{self.api}/control/v1/node/{node_id}/config", json=body, headers=self.headers(TokenScope.ADMIN), ) @@ -2300,7 +2511,7 @@ class NeonStorageController(MetricsGetter, LogUtils): response = self.request( "POST", - f"{self.env.storage_controller_api}/v1/tenant", + f"{self.api}/v1/tenant", json=body, headers=self.headers(TokenScope.PAGE_SERVER_API), ) @@ -2309,11 +2520,11 @@ class NeonStorageController(MetricsGetter, LogUtils): def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]: """ - :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int} + :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr": str, "listen_http_port": int} """ response = self.request( "GET", - f"{self.env.storage_controller_api}/debug/v1/tenant/{tenant_id}/locate", + f"{self.api}/debug/v1/tenant/{tenant_id}/locate", 
headers=self.headers(TokenScope.ADMIN), ) body = response.json() @@ -2326,7 +2537,7 @@ class NeonStorageController(MetricsGetter, LogUtils): """ response = self.request( "GET", - f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}", + f"{self.api}/control/v1/tenant/{tenant_id}", headers=self.headers(TokenScope.ADMIN), ) response.raise_for_status() @@ -2337,7 +2548,7 @@ class NeonStorageController(MetricsGetter, LogUtils): ) -> list[TenantShardId]: response = self.request( "PUT", - f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/shard_split", + f"{self.api}/control/v1/tenant/{tenant_id}/shard_split", json={"new_shard_count": shard_count, "new_stripe_size": shard_stripe_size}, headers=self.headers(TokenScope.ADMIN), ) @@ -2349,7 +2560,7 @@ class NeonStorageController(MetricsGetter, LogUtils): def tenant_shard_migrate(self, tenant_shard_id: TenantShardId, dest_ps_id: int): self.request( "PUT", - f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_shard_id}/migrate", + f"{self.api}/control/v1/tenant/{tenant_shard_id}/migrate", json={"tenant_shard_id": str(tenant_shard_id), "node_id": dest_ps_id}, headers=self.headers(TokenScope.ADMIN), ) @@ -2360,7 +2571,7 @@ class NeonStorageController(MetricsGetter, LogUtils): log.info(f"tenant_policy_update({tenant_id}, {body})") self.request( "PUT", - f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/policy", + f"{self.api}/control/v1/tenant/{tenant_id}/policy", json=body, headers=self.headers(TokenScope.ADMIN), ) @@ -2368,14 +2579,14 @@ class NeonStorageController(MetricsGetter, LogUtils): def tenant_import(self, tenant_id: TenantId): self.request( "POST", - f"{self.env.storage_controller_api}/debug/v1/tenant/{tenant_id}/import", + f"{self.api}/debug/v1/tenant/{tenant_id}/import", headers=self.headers(TokenScope.ADMIN), ) def reconcile_all(self): r = self.request( "POST", - f"{self.env.storage_controller_api}/debug/v1/reconcile_all", + f"{self.api}/debug/v1/reconcile_all", 
headers=self.headers(TokenScope.ADMIN), ) r.raise_for_status() @@ -2408,11 +2619,127 @@ class NeonStorageController(MetricsGetter, LogUtils): """ self.request( "POST", - f"{self.env.storage_controller_api}/debug/v1/consistency_check", + f"{self.api}/debug/v1/consistency_check", headers=self.headers(TokenScope.ADMIN), ) log.info("storage controller passed consistency check") + def node_registered(self, node_id: int) -> bool: + """ + Returns true if the storage controller can confirm + it knows of pageserver with 'node_id' + """ + try: + self.node_status(node_id) + except StorageControllerApiException as e: + if e.status_code == 404: + return False + else: + raise e + + return True + + def poll_node_status( + self, + node_id: int, + desired_availability: Optional[PageserverAvailability], + desired_scheduling_policy: Optional[PageserverSchedulingPolicy], + max_attempts: int, + backoff: int, + ): + """ + Poll the node status until it reaches 'desired_scheduling_policy' and 'desired_availability' + or 'max_attempts' have been exhausted + """ + log.info( + f"Polling {node_id} for {desired_scheduling_policy} scheduling policy and {desired_availability} availability" + ) + while max_attempts > 0: + try: + status = self.node_status(node_id) + policy = status["scheduling"] + availability = status["availability"] + if (desired_scheduling_policy is None or policy == desired_scheduling_policy) and ( + desired_availability is None or availability == desired_availability + ): + return + else: + max_attempts -= 1 + log.info( + f"Status call returned {policy=} {availability=} ({max_attempts} attempts left)" + ) + + if max_attempts == 0: + raise AssertionError( + f"Status for {node_id=} did not reach {desired_scheduling_policy=} {desired_availability=}" + ) + + time.sleep(backoff) + except StorageControllerApiException as e: + max_attempts -= 1 + log.info(f"Status call failed ({max_attempts} retries left): {e}") + + if max_attempts == 0: + raise e + + time.sleep(backoff) + + def 
metadata_health_update(self, healthy: List[TenantShardId], unhealthy: List[TenantShardId]): + body: Dict[str, Any] = { + "healthy_tenant_shards": [str(t) for t in healthy], + "unhealthy_tenant_shards": [str(t) for t in unhealthy], + } + + self.request( + "POST", + f"{self.api}/control/v1/metadata_health/update", + json=body, + headers=self.headers(TokenScope.SCRUBBER), + ) + + def metadata_health_list_unhealthy(self): + response = self.request( + "GET", + f"{self.api}/control/v1/metadata_health/unhealthy", + headers=self.headers(TokenScope.ADMIN), + ) + return response.json() + + def metadata_health_list_outdated(self, duration: str): + body: Dict[str, Any] = {"not_scrubbed_for": duration} + + response = self.request( + "POST", + f"{self.api}/control/v1/metadata_health/outdated", + json=body, + headers=self.headers(TokenScope.ADMIN), + ) + return response.json() + + def metadata_health_is_healthy(self, outdated_duration: str = "1h") -> bool: + """Metadata is healthy if there is no unhealthy or outdated health records.""" + + unhealthy = self.metadata_health_list_unhealthy() + outdated = self.metadata_health_list_outdated(outdated_duration) + + healthy = ( + len(unhealthy["unhealthy_tenant_shards"]) == 0 and len(outdated["health_records"]) == 0 + ) + if not healthy: + log.info(f"{unhealthy=}, {outdated=}") + return healthy + + def step_down(self): + log.info("Asking storage controller to step down") + response = self.request( + "PUT", + f"{self.api}/control/v1/step_down", + headers=self.headers(TokenScope.ADMIN), + ) + + response.raise_for_status() + return response.json() + def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]): if isinstance(config_strings, tuple): pairs = [config_strings] @@ -2423,16 +2750,91 @@ class NeonStorageController(MetricsGetter, LogUtils): res = self.request( "PUT", - f"{self.env.storage_controller_api}/debug/v1/failpoints", + f"{self.api}/debug/v1/failpoints", json=[{"name": name, "actions": actions} for 
name, actions in pairs], headers=self.headers(TokenScope.ADMIN), ) log.info(f"Got failpoints request response code {res.status_code}") res.raise_for_status() - @property - def workdir(self) -> Path: - return self.env.repo_dir + def get_tenants_placement(self) -> defaultdict[str, Dict[str, Any]]: + """ + Get the intent and observed placements of all tenants known to the storage controller. + """ + tenants = self.tenant_list() + + tenant_placement: defaultdict[str, Dict[str, Any]] = defaultdict( + lambda: { + "observed": {"attached": None, "secondary": []}, + "intent": {"attached": None, "secondary": []}, + } + ) + + for t in tenants: + for node_id, loc_state in t["observed"]["locations"].items(): + if ( + loc_state is not None + and "conf" in loc_state + and loc_state["conf"] is not None + and loc_state["conf"]["mode"] + in set(["AttachedSingle", "AttachedMulti", "AttachedStale"]) + ): + tenant_placement[t["tenant_shard_id"]]["observed"]["attached"] = int(node_id) + + if ( + loc_state is not None + and "conf" in loc_state + and loc_state["conf"] is not None + and loc_state["conf"]["mode"] == "Secondary" + ): + tenant_placement[t["tenant_shard_id"]]["observed"]["secondary"].append( + int(node_id) + ) + + if "attached" in t["intent"]: + tenant_placement[t["tenant_shard_id"]]["intent"]["attached"] = t["intent"][ + "attached" + ] + + if "secondary" in t["intent"]: + tenant_placement[t["tenant_shard_id"]]["intent"]["secondary"] += t["intent"][ + "secondary" + ] + + return tenant_placement + + def warm_up_all_secondaries(self): + log.info("Warming up all secondary locations") + + tenant_placement = self.get_tenants_placement() + for tid, placement in tenant_placement.items(): + assert placement["observed"]["attached"] is not None + primary_id = placement["observed"]["attached"] + + assert len(placement["observed"]["secondary"]) == 1 + secondary_id = placement["observed"]["secondary"][0] + + parsed_tid = TenantShardId.parse(tid) + 
self.env.get_pageserver(primary_id).http_client().tenant_heatmap_upload(parsed_tid) + self.env.get_pageserver(secondary_id).http_client().tenant_secondary_download( + parsed_tid, wait_ms=250 + ) + + def get_leadership_status(self) -> StorageControllerLeadershipStatus: + metric_values = {} + for status in StorageControllerLeadershipStatus: + metric_value = self.get_metric_value( + "storage_controller_leadership_status", filter={"status": status} + ) + metric_values[status] = metric_value + + assert list(metric_values.values()).count(1) == 1 + + for status, metric_value in metric_values.items(): + if metric_value == 1: + return status + + raise AssertionError("unreachable") def __enter__(self) -> "NeonStorageController": return self @@ -2446,6 +2848,59 @@ class NeonStorageController(MetricsGetter, LogUtils): self.stop(immediate=True) +class NeonProxiedStorageController(NeonStorageController): + def __init__(self, env: NeonEnv, proxy_port: int, auth_enabled: bool): + super(NeonProxiedStorageController, self).__init__(env, proxy_port, auth_enabled) + self.instances: dict[int, dict[str, Any]] = {} + + def start( + self, + timeout_in_seconds: Optional[int] = None, + instance_id: Optional[int] = None, + base_port: Optional[int] = None, + ): + assert instance_id is not None and base_port is not None + + self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port) + self.instances[instance_id] = {"running": True} + + self.running = True + return self + + def stop_instance( + self, immediate: bool = False, instance_id: Optional[int] = None + ) -> "NeonStorageController": + assert instance_id in self.instances + if self.instances[instance_id]["running"]: + self.env.neon_cli.storage_controller_stop(immediate, instance_id) + self.instances[instance_id]["running"] = False + + self.running = any(meta["running"] for meta in self.instances.values()) + return self + + def stop(self, immediate: bool = False) -> "NeonStorageController": + for iid, details in 
self.instances.items(): + if details["running"]: + self.env.neon_cli.storage_controller_stop(immediate, iid) + self.instances[iid]["running"] = False + + self.running = False + return self + + def assert_no_errors(self): + for instance_id in self.instances.keys(): + assert_no_errors( + self.env.repo_dir / f"storage_controller_{instance_id}" / "storage_controller.log", + "storage_controller", + self.allowed_errors, + ) + + def log_contains( + self, pattern: str, offset: None | LogCursor = None + ) -> Optional[Tuple[str, LogCursor]]: + raise NotImplementedError() + + @dataclass class LogCursor: _line_no: int @@ -2531,6 +2986,7 @@ class NeonPageserver(PgProtocol, LogUtils): def start( self, extra_env_vars: Optional[Dict[str, str]] = None, + timeout_in_seconds: Optional[int] = None, ) -> "NeonPageserver": """ Start the page server. @@ -2539,8 +2995,18 @@ class NeonPageserver(PgProtocol, LogUtils): """ assert self.running is False - self.env.neon_cli.pageserver_start(self.id, extra_env_vars=extra_env_vars) + self.env.neon_cli.pageserver_start( + self.id, extra_env_vars=extra_env_vars, timeout_in_seconds=timeout_in_seconds + ) self.running = True + + if self.env.storage_controller.running and self.env.storage_controller.node_registered( + self.id + ): + self.env.storage_controller.poll_node_status( + self.id, PageserverAvailability.ACTIVE, None, max_attempts=20, backoff=1 + ) + return self def stop(self, immediate: bool = False) -> "NeonPageserver": @@ -2553,13 +3019,17 @@ class NeonPageserver(PgProtocol, LogUtils): self.running = False return self - def restart(self, immediate: bool = False): + def restart( + self, + immediate: bool = False, + timeout_in_seconds: Optional[int] = None, + ): """ High level wrapper for restart: restarts the process, and waits for tenant state to stabilize. 
""" self.stop(immediate=immediate) - self.start() + self.start(timeout_in_seconds=timeout_in_seconds) self.quiesce_tenants() def quiesce_tenants(self): @@ -2640,7 +3110,6 @@ class NeonPageserver(PgProtocol, LogUtils): self, tenant_id: TenantId, config: None | Dict[str, Any] = None, - config_null: bool = False, generation: Optional[int] = None, override_storage_controller_generation: bool = False, ): @@ -2657,9 +3126,8 @@ class NeonPageserver(PgProtocol, LogUtils): ) return client.tenant_attach( tenant_id, + generation, config, - config_null, - generation=generation, ) def tenant_detach(self, tenant_id: TenantId): @@ -2698,13 +3166,19 @@ class NeonPageserver(PgProtocol, LogUtils): if generation is None: generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id) client = self.http_client(auth_token=auth_token) - return client.tenant_create(tenant_id, conf, generation=generation) - def tenant_load(self, tenant_id: TenantId): - client = self.http_client() - return client.tenant_load( - tenant_id, generation=self.env.storage_controller.attach_hook_issue(tenant_id, self.id) + conf = conf or {} + + client.tenant_location_conf( + tenant_id, + { + "mode": "AttachedSingle", + "generation": generation, + "tenant_conf": conf, + "secondary_conf": None, + }, ) + return tenant_id def list_layers( self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId @@ -2762,14 +3236,21 @@ class PgBin: env.update(env_add) return env - def run( + def _log_env(self, env: dict[str, str]) -> None: + env_s = {} + for k, v in env.items(): + if k.startswith("PG") and k != "PGPASSWORD": + env_s[k] = v + log.debug(f"Environment: {env_s}") + + def run_nonblocking( self, command: List[str], env: Optional[Env] = None, cwd: Optional[Union[str, Path]] = None, - ): + ) -> subprocess.Popen[Any]: """ - Run one of the postgres binaries. + Run one of the postgres binaries, not waiting for it to finish The command should be in list form, e.g. 
['pgbench', '-p', '55432'] @@ -2780,11 +3261,34 @@ class PgBin: If you want stdout/stderr captured to files, use `run_capture` instead. """ - self._fixpath(command) log.info(f"Running command '{' '.join(command)}'") env = self._build_env(env) - subprocess.run(command, env=env, cwd=cwd, check=True) + self._log_env(env) + return subprocess.Popen(command, env=env, cwd=cwd, stdout=subprocess.PIPE, text=True) + + def run( + self, + command: List[str], + env: Optional[Env] = None, + cwd: Optional[Union[str, Path]] = None, + ) -> None: + """ + Run one of the postgres binaries, waiting for it to finish + + The command should be in list form, e.g. ['pgbench', '-p', '55432'] + + All the necessary environment variables will be set. + + If the first argument (the command name) doesn't include a path (no '/' + characters present), then it will be edited to include the correct path. + + If you want stdout/stderr captured to files, use `run_capture` instead. + """ + proc = self.run_nonblocking(command, env, cwd) + proc.wait() + if proc.returncode != 0: + raise subprocess.CalledProcessError(proc.returncode, proc.args) def run_capture( self, @@ -2804,6 +3308,7 @@ class PgBin: self._fixpath(command) log.info(f"Running command '{' '.join(command)}'") env = self._build_env(env) + self._log_env(env) base_path, _, _ = subprocess_capture( self.log_dir, command, @@ -2993,6 +3498,18 @@ class RemotePostgres(PgProtocol): pass +@pytest.fixture(scope="function") +def benchmark_project_pub(neon_api: NeonAPI, pg_version: PgVersion) -> NeonApiEndpoint: + project_id = os.getenv("BENCHMARK_PROJECT_ID_PUB") + return NeonApiEndpoint(neon_api, pg_version, project_id) + + +@pytest.fixture(scope="function") +def benchmark_project_sub(neon_api: NeonAPI, pg_version: PgVersion) -> NeonApiEndpoint: + project_id = os.getenv("BENCHMARK_PROJECT_ID_SUB") + return NeonApiEndpoint(neon_api, pg_version, project_id) + + @pytest.fixture(scope="function") def remote_pg( test_output_dir: Path, pg_distrib_dir: Path, 
pg_version: PgVersion @@ -3039,9 +3556,16 @@ class PSQL: host: str = "127.0.0.1", port: int = 5432, ): - assert shutil.which(path) + search_path = None + if (d := os.getenv("POSTGRES_DISTRIB_DIR")) is not None and ( + v := os.getenv("DEFAULT_PG_VERSION") + ) is not None: + search_path = Path(d) / f"v{v}" / "bin" - self.path = path + full_path = shutil.which(path, path=search_path) + assert full_path is not None + + self.path = full_path self.database_url = f"postgres://{host}:{port}/main?options=project%3Dgeneric-project-name" async def run(self, query: Optional[str] = None) -> asyncio.subprocess.Process: @@ -3435,7 +3959,6 @@ class Endpoint(PgProtocol, LogUtils): ): super().__init__(host="localhost", port=pg_port, user="cloud_admin", dbname="postgres") self.env = env - self.running = False self.branch_name: Optional[str] = None # dubious self.endpoint_id: Optional[str] = None # dubious, see asserts below self.pgdata_dir: Optional[str] = None # Path to computenode PGDATA @@ -3443,14 +3966,16 @@ class Endpoint(PgProtocol, LogUtils): self.pg_port = pg_port self.http_port = http_port self.check_stop_result = check_stop_result + # passed to endpoint create and endpoint reconfigure self.active_safekeepers: List[int] = list(map(lambda sk: sk.id, env.safekeepers)) # path to conf is /endpoints//pgdata/postgresql.conf - # This lock prevents concurrent start & stop operations, keeping `self.running` consistent - # with whether we're really running. Tests generally wouldn't try and do these concurrently, - # but endpoints are also stopped during test teardown, which might happen concurrently with - # destruction of objects in tests. - self.lock = threading.Lock() + # Semaphore is set to 1 when we start, and acquire'd back to zero when we stop + # + # We use a semaphore rather than a bool so that racing calls to stop() don't + # try and stop the same process twice, as stop() is called by test teardown and + # potentially by some __del__ chains in other threads. 
+ self._running = threading.Semaphore(0) def http_client( self, auth_token: Optional[str] = None, retries: Optional[Retry] = None @@ -3502,7 +4027,6 @@ class Endpoint(PgProtocol, LogUtils): # and make tests more stable. config_lines = ["max_replication_write_lag=15MB"] + config_lines - config_lines = ["neon.primary_is_running=on"] + config_lines self.config(config_lines) return self @@ -3511,7 +4035,9 @@ class Endpoint(PgProtocol, LogUtils): self, remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None, + safekeepers: Optional[List[int]] = None, allow_multiple: bool = False, + basebackup_request_tries: Optional[int] = None, ) -> "Endpoint": """ Start the Postgres instance. @@ -3520,17 +4046,22 @@ class Endpoint(PgProtocol, LogUtils): assert self.endpoint_id is not None + # If `safekeepers` is not None, they are remember them as active and use + # in the following commands. + if safekeepers is not None: + self.active_safekeepers = safekeepers + log.info(f"Starting postgres endpoint {self.endpoint_id}") - with self.lock: - self.env.neon_cli.endpoint_start( - self.endpoint_id, - safekeepers=self.active_safekeepers, - remote_ext_config=remote_ext_config, - pageserver_id=pageserver_id, - allow_multiple=allow_multiple, - ) - self.running = True + self.env.neon_cli.endpoint_start( + self.endpoint_id, + safekeepers=self.active_safekeepers, + remote_ext_config=remote_ext_config, + pageserver_id=pageserver_id, + allow_multiple=allow_multiple, + basebackup_request_tries=basebackup_request_tries, + ) + self._running.release(1) return self @@ -3578,19 +4109,30 @@ class Endpoint(PgProtocol, LogUtils): conf_file.write("\n".join(hba) + "\n") conf_file.write(data) - if self.running: + if self.is_running(): self.safe_psql("SELECT pg_reload_conf()") - def reconfigure(self, pageserver_id: Optional[int] = None): - assert self.endpoint_id is not None - self.env.neon_cli.endpoint_reconfigure(self.endpoint_id, self.tenant_id, pageserver_id) + def is_running(self): + 
return self._running._value > 0 - def respec(self, **kwargs): + def reconfigure( + self, pageserver_id: Optional[int] = None, safekeepers: Optional[List[int]] = None + ): + assert self.endpoint_id is not None + # If `safekeepers` is not None, they are remember them as active and use + # in the following commands. + if safekeepers is not None: + self.active_safekeepers = safekeepers + self.env.neon_cli.endpoint_reconfigure( + self.endpoint_id, self.tenant_id, pageserver_id, self.active_safekeepers + ) + + def respec(self, **kwargs: Any) -> None: """Update the endpoint.json file used by control_plane.""" # Read config config_path = os.path.join(self.endpoint_path(), "endpoint.json") with open(config_path, "r") as f: - data_dict = json.load(f) + data_dict: dict[str, Any] = json.load(f) # Write it back updated with open(config_path, "w") as file: @@ -3598,13 +4140,13 @@ class Endpoint(PgProtocol, LogUtils): json.dump(dict(data_dict, **kwargs), file, indent=4) # Please note: Migrations only run if pg_skip_catalog_updates is false - def wait_for_migrations(self): + def wait_for_migrations(self, num_migrations: int = 10): with self.cursor() as cur: def check_migrations_done(): cur.execute("SELECT id FROM neon_migration.migration_id") - migration_id = cur.fetchall()[0][0] - assert migration_id != 0 + migration_id: int = cur.fetchall()[0][0] + assert migration_id >= num_migrations wait_until(20, 0.5, check_migrations_done) @@ -3629,13 +4171,12 @@ class Endpoint(PgProtocol, LogUtils): Returns self. 
""" - with self.lock: - if self.running: - assert self.endpoint_id is not None - self.env.neon_cli.endpoint_stop( - self.endpoint_id, check_return_code=self.check_stop_result, mode=mode - ) - self.running = False + running = self._running.acquire(blocking=False) + if running: + assert self.endpoint_id is not None + self.env.neon_cli.endpoint_stop( + self.endpoint_id, check_return_code=self.check_stop_result, mode=mode + ) return self @@ -3645,13 +4186,13 @@ class Endpoint(PgProtocol, LogUtils): Returns self. """ - with self.lock: + running = self._running.acquire(blocking=False) + if running: assert self.endpoint_id is not None self.env.neon_cli.endpoint_stop( self.endpoint_id, True, check_return_code=self.check_stop_result, mode=mode ) self.endpoint_id = None - self.running = False return self @@ -3665,6 +4206,7 @@ class Endpoint(PgProtocol, LogUtils): remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None, allow_multiple=False, + basebackup_request_tries: Optional[int] = None, ) -> "Endpoint": """ Create an endpoint, apply config, and start Postgres. @@ -3685,6 +4227,7 @@ class Endpoint(PgProtocol, LogUtils): remote_ext_config=remote_ext_config, pageserver_id=pageserver_id, allow_multiple=allow_multiple, + basebackup_request_tries=basebackup_request_tries, ) log.info(f"Postgres startup took {time.time() - started_at} seconds") @@ -3709,6 +4252,17 @@ class Endpoint(PgProtocol, LogUtils): assert self.pgdata_dir is not None # please mypy return get_dir_size(os.path.join(self.pgdata_dir, "pg_wal")) / 1024 / 1024 + def clear_shared_buffers(self, cursor: Optional[Any] = None): + """ + Best-effort way to clear postgres buffers. Pinned buffers will not be 'cleared.' + + Might also clear LFC. 
+ """ + if cursor is not None: + cursor.execute("select clear_buffer_cache()") + else: + self.safe_psql("select clear_buffer_cache()") + class EndpointFactory: """An object representing multiple compute endpoints.""" @@ -3728,6 +4282,7 @@ class EndpointFactory: config_lines: Optional[List[str]] = None, remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None, + basebackup_request_tries: Optional[int] = None, ) -> Endpoint: ep = Endpoint( self.env, @@ -3746,6 +4301,7 @@ class EndpointFactory: lsn=lsn, remote_ext_config=remote_ext_config, pageserver_id=pageserver_id, + basebackup_request_tries=basebackup_request_tries, ) def create( @@ -3779,13 +4335,23 @@ class EndpointFactory: pageserver_id=pageserver_id, ) - def stop_all(self) -> "EndpointFactory": + def stop_all(self, fail_on_error=True) -> "EndpointFactory": + exception = None for ep in self.endpoints: - ep.stop() + try: + ep.stop() + except Exception as e: + log.error(f"Failed to stop endpoint {ep.endpoint_id}: {e}") + exception = e + + if fail_on_error and exception is not None: + raise exception return self - def new_replica(self, origin: Endpoint, endpoint_id: str, config_lines: Optional[List[str]]): + def new_replica( + self, origin: Endpoint, endpoint_id: str, config_lines: Optional[List[str]] = None + ): branch_name = origin.branch_name assert origin in self.endpoints assert branch_name is not None @@ -3832,16 +4398,48 @@ class Safekeeper(LogUtils): id: int running: bool = False - def __init__(self, env: NeonEnv, port: SafekeeperPort, id: int, running: bool = False): + def __init__( + self, + env: NeonEnv, + port: SafekeeperPort, + id: int, + running: bool = False, + extra_opts: Optional[List[str]] = None, + ): self.env = env self.port = port self.id = id self.running = running self.logfile = Path(self.data_dir) / f"safekeeper-{id}.log" - def start(self, extra_opts: Optional[List[str]] = None) -> "Safekeeper": + if extra_opts is None: + # Testing defaults: enable everything, and set 
short timeouts so that background + # work will happen during short tests. + # **Note**: Any test that explicitly sets extra_opts will not get these defaults. + extra_opts = [ + "--enable-offload", + "--delete-offloaded-wal", + "--partial-backup-timeout", + "10s", + "--control-file-save-interval", + "1s", + "--eviction-min-resident", + "10s", + ] + + self.extra_opts = extra_opts + + def start( + self, extra_opts: Optional[List[str]] = None, timeout_in_seconds: Optional[int] = None + ) -> "Safekeeper": + if extra_opts is None: + # Apply either the extra_opts passed in, or the ones from our constructor: we do not merge the two. + extra_opts = self.extra_opts + assert self.running is False - self.env.neon_cli.safekeeper_start(self.id, extra_opts=extra_opts) + self.env.neon_cli.safekeeper_start( + self.id, extra_opts=extra_opts, timeout_in_seconds=timeout_in_seconds + ) self.running = True # wait for wal acceptor start by checking its status started_at = time.time() @@ -3868,6 +4466,8 @@ class Safekeeper(LogUtils): def assert_no_errors(self): assert not self.log_contains("manager task finished prematurely") + assert not self.log_contains("error while acquiring WalResidentTimeline guard") + assert not self.log_contains("timeout while acquiring WalResidentTimeline guard") def append_logical_message( self, tenant_id: TenantId, timeline_id: TimelineId, request: Dict[str, Any] @@ -3943,14 +4543,32 @@ class Safekeeper(LogUtils): def timeline_dir(self, tenant_id, timeline_id) -> Path: return self.data_dir / str(tenant_id) / str(timeline_id) + def list_uploaded_segments(self, tenant_id: TenantId, timeline_id: TimelineId): + tline_path = ( + self.env.repo_dir + / "local_fs_remote_storage" + / "safekeeper" + / str(tenant_id) + / str(timeline_id) + ) + assert isinstance(self.env.safekeepers_remote_storage, LocalFsStorage) + return self._list_segments_in_dir( + tline_path, lambda name: ".metadata" not in name and ".___temp" not in name + ) + def list_segments(self, tenant_id, 
timeline_id) -> List[str]: """ Get list of segment names of the given timeline. """ tli_dir = self.timeline_dir(tenant_id, timeline_id) + return self._list_segments_in_dir( + tli_dir, lambda name: not name.startswith("safekeeper.control") + ) + + def _list_segments_in_dir(self, path: Path, keep_filter: Callable[[str], bool]) -> list[str]: segments = [] - for _, _, filenames in os.walk(tli_dir): - segments.extend([f for f in filenames if not f.startswith("safekeeper.control")]) + for _, _, filenames in os.walk(path): + segments.extend([f for f in filenames if keep_filter(f)]) segments.sort() return segments @@ -3998,9 +4616,9 @@ class Safekeeper(LogUtils): class StorageScrubber: - def __init__(self, env: NeonEnvBuilder, log_dir: Optional[Path] = None): + def __init__(self, env: NeonEnv, log_dir: Path): self.env = env - self.log_dir = log_dir or env.test_output_dir + self.log_dir = log_dir def scrubber_cli(self, args: list[str], timeout) -> str: assert isinstance(self.env.pageserver_remote_storage, S3Storage) @@ -4017,11 +4635,15 @@ class StorageScrubber: if s3_storage.endpoint is not None: env.update({"AWS_ENDPOINT_URL": s3_storage.endpoint}) - base_args = [str(self.env.neon_binpath / "storage_scrubber")] + base_args = [ + str(self.env.neon_binpath / "storage_scrubber"), + f"--controller-api={self.env.storage_controller.api_root()}", + ] args = base_args + args + log.info(f"Invoking scrubber command {args} with env: {env}") (output_path, stdout, status_code) = subprocess_capture( - self.env.test_output_dir, + self.log_dir, args, echo_stderr=True, echo_stdout=True, @@ -4040,13 +4662,19 @@ class StorageScrubber: assert stdout is not None return stdout - def scan_metadata(self) -> Any: - stdout = self.scrubber_cli( - ["scan-metadata", "--node-kind", "pageserver", "--json"], timeout=30 - ) + def scan_metadata(self, post_to_storage_controller: bool = False) -> Tuple[bool, Any]: + """ + Returns the health status and the metadata summary. 
+ """ + args = ["scan-metadata", "--node-kind", "pageserver", "--json"] + if post_to_storage_controller: + args.append("--post") + stdout = self.scrubber_cli(args, timeout=30) try: - return json.loads(stdout) + summary = json.loads(stdout) + healthy = not summary["with_errors"] and not summary["with_warnings"] + return healthy, summary except: log.error("Failed to decode JSON output from `scan-metadata`. Dumping stdout:") log.error(stdout) @@ -4060,7 +4688,10 @@ class StorageScrubber: log.info(f"tenant-snapshot output: {stdout}") def pageserver_physical_gc( - self, min_age_secs: int, tenant_ids: Optional[list[TenantId]] = None + self, + min_age_secs: int, + tenant_ids: Optional[list[TenantId]] = None, + mode: Optional[str] = None, ): args = ["pageserver-physical-gc", "--min-age", f"{min_age_secs}s"] @@ -4070,6 +4701,9 @@ class StorageScrubber: for tenant_id in tenant_ids: args.extend(["--tenant-id", str(tenant_id)]) + if mode is not None: + args.extend(["--mode", mode]) + stdout = self.scrubber_cli( args, timeout=30, @@ -4161,7 +4795,23 @@ def test_output_dir( yield test_dir - allure_attach_from_dir(test_dir) + # Allure artifacts creation might involve the creation of `.tar.zst` archives, + # which aren't going to be used if Allure results collection is not enabled + # (i.e. --alluredir is not set). + # Skip `allure_attach_from_dir` in this case + if not request.config.getoption("--alluredir"): + return + + preserve_database_files = False + for k, v in request.node.user_properties: + # NB: the neon_env_builder fixture uses this fixture (test_output_dir). + # So, neon_env_builder's cleanup runs before here. + # The cleanup propagates NeonEnvBuilder.preserve_database_files into this user property. 
+ if k == "preserve_database_files": + assert isinstance(v, bool) + preserve_database_files = v + + allure_attach_from_dir(test_dir, preserve_database_files) class FileAndThreadLock: @@ -4427,7 +5077,7 @@ def check_restored_datadir_content( assert (mismatch, error) == ([], []) -def logical_replication_sync(subscriber: VanillaPostgres, publisher: Endpoint) -> Lsn: +def logical_replication_sync(subscriber: PgProtocol, publisher: PgProtocol) -> Lsn: """Wait logical replication subscriber to sync with publisher.""" publisher_lsn = Lsn(publisher.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) while True: @@ -4611,6 +5261,70 @@ def fork_at_current_lsn( return env.neon_cli.create_branch(new_branch_name, ancestor_branch_name, tenant_id, current_lsn) +def import_timeline_from_vanilla_postgres( + test_output_dir: Path, + env: NeonEnv, + pg_bin: PgBin, + tenant_id: TenantId, + timeline_id: TimelineId, + branch_name: str, + vanilla_pg_connstr: str, +): + """ + Create a new timeline, by importing an existing PostgreSQL cluster. + + This works by taking a physical backup of the running PostgreSQL cluster, and importing that. 
+ """ + + # Take backup of the existing PostgreSQL server with pg_basebackup + basebackup_dir = os.path.join(test_output_dir, "basebackup") + base_tar = os.path.join(basebackup_dir, "base.tar") + wal_tar = os.path.join(basebackup_dir, "pg_wal.tar") + os.mkdir(basebackup_dir) + pg_bin.run( + [ + "pg_basebackup", + "-F", + "tar", + "-d", + vanilla_pg_connstr, + "-D", + basebackup_dir, + ] + ) + + # Extract start_lsn and end_lsn form the backup manifest file + with open(os.path.join(basebackup_dir, "backup_manifest")) as f: + manifest = json.load(f) + start_lsn = manifest["WAL-Ranges"][0]["Start-LSN"] + end_lsn = manifest["WAL-Ranges"][0]["End-LSN"] + + # Import the backup tarballs into the pageserver + env.neon_cli.raw_cli( + [ + "timeline", + "import", + "--tenant-id", + str(tenant_id), + "--timeline-id", + str(timeline_id), + "--branch-name", + branch_name, + "--base-lsn", + start_lsn, + "--base-tarfile", + base_tar, + "--end-lsn", + end_lsn, + "--wal-tarfile", + wal_tar, + "--pg-version", + env.pg_version, + ] + ) + wait_for_last_record_lsn(env.pageserver.http_client(), tenant_id, timeline_id, Lsn(end_lsn)) + + def last_flush_lsn_upload( env: NeonEnv, endpoint: Endpoint, @@ -4631,9 +5345,7 @@ def last_flush_lsn_upload( for tenant_shard_id, pageserver in shards: ps_http = pageserver.http_client(auth_token=auth_token) wait_for_last_record_lsn(ps_http, tenant_shard_id, timeline_id, last_flush_lsn) - # force a checkpoint to trigger upload - ps_http.timeline_checkpoint(tenant_shard_id, timeline_id) - wait_for_upload(ps_http, tenant_shard_id, timeline_id, last_flush_lsn) + ps_http.timeline_checkpoint(tenant_shard_id, timeline_id, wait_until_uploaded=True) return last_flush_lsn @@ -4718,9 +5430,5 @@ def generate_uploads_and_deletions( # ensures that the pageserver is in a fully idle state: there will be no more # background ingest, no more uploads pending, and therefore no non-determinism # in subsequent actions like pageserver restarts. 
- final_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id, pageserver.id) - ps_http.timeline_checkpoint(tenant_id, timeline_id) - # Finish uploads - wait_for_upload(ps_http, tenant_id, timeline_id, final_lsn) - # Finish all remote writes (including deletions) - wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id) + flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id, pageserver.id) + ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True) diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 147d5705d3..dff002bd4b 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -102,10 +102,16 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [ # failing to connect to them. ".*Call to node.*management API.*failed.*receive body.*", ".*Call to node.*management API.*failed.*ReceiveBody.*", + ".*Failed to update node .+ after heartbeat round.*error sending request for url.*", # Many tests will start up with a node offline ".*startup_reconcile: Could not scan node.*", # Tests run in dev mode ".*Starting in dev mode.*", + # Tests that stop endpoints & use the storage controller's neon_local notification + # mechanism might fail (neon_local's stopping and endpoint isn't atomic wrt the storage + # controller's attempts to notify the endpoint). 
+ ".*reconciler.*neon_local notification hook failed.*", + ".*reconciler.*neon_local error.*", ] diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index d5441bd694..cd4261f1b8 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -1,6 +1,5 @@ from __future__ import annotations -import json import time from collections import defaultdict from dataclasses import dataclass @@ -62,6 +61,7 @@ class HistoricLayerInfo: remote: bool # None for image layers, true if pageserver thinks this is an L0 delta layer l0: Optional[bool] + visible: bool @classmethod def from_json(cls, d: Dict[str, Any]) -> HistoricLayerInfo: @@ -80,6 +80,7 @@ class HistoricLayerInfo: lsn_end=d.get("lsn_end"), remote=d["remote"], l0=l0_ness, + visible=d["access_stats"]["visible"], ) @@ -118,6 +119,9 @@ class LayerMapInfo: def image_layers(self) -> List[HistoricLayerInfo]: return [x for x in self.historic_layers if x.kind == "Image"] + def delta_l0_layers(self) -> List[HistoricLayerInfo]: + return [x for x in self.historic_layers if x.kind == "Delta" and x.l0] + def historic_by_name(self) -> Set[str]: return set(x.layer_file_name for x in self.historic_layers) @@ -173,6 +177,21 @@ class PageserverHttpClient(requests.Session, MetricsGetter): if auth_token is not None: self.headers["Authorization"] = f"Bearer {auth_token}" + def without_status_retrying(self) -> PageserverHttpClient: + retries = Retry( + status=0, + connect=5, + read=False, + backoff_factor=0.2, + status_forcelist=[], + allowed_methods=None, + remove_headers_on_redirect=[], + ) + + return PageserverHttpClient( + self.port, self.is_testing_enabled_or_skip, self.auth_token, retries + ) + @property def base_url(self) -> str: return f"http://localhost:{self.port}" @@ -221,71 +240,34 @@ class PageserverHttpClient(requests.Session, MetricsGetter): assert isinstance(res_json, list) return res_json - def tenant_create( - self, - new_tenant_id: 
Union[TenantId, TenantShardId], - conf: Optional[Dict[str, Any]] = None, - generation: Optional[int] = None, - ) -> TenantId: - if conf is not None: - assert "new_tenant_id" not in conf.keys() - - body: Dict[str, Any] = { - "new_tenant_id": str(new_tenant_id), - **(conf or {}), - } - - if generation is not None: - body.update({"generation": generation}) - - res = self.post( - f"http://localhost:{self.port}/v1/tenant", - json=body, - ) - self.verbose_error(res) - if res.status_code == 409: - raise Exception(f"could not create tenant: already exists for id {new_tenant_id}") - new_tenant_id = res.json() - assert isinstance(new_tenant_id, str) - return TenantId(new_tenant_id) - def tenant_attach( self, tenant_id: Union[TenantId, TenantShardId], + generation: int, config: None | Dict[str, Any] = None, - config_null: bool = False, - generation: Optional[int] = None, ): - if config_null: - assert config is None - body: Any = None - else: - # null-config is prohibited by the API - config = config or {} - body = {"config": config} - if generation is not None: - body.update({"generation": generation}) + config = config or {} - res = self.post( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/attach", - data=json.dumps(body), - headers={"Content-Type": "application/json"}, + return self.tenant_location_conf( + tenant_id, + location_conf={ + "mode": "AttachedSingle", + "secondary_conf": None, + "tenant_conf": config, + "generation": generation, + }, ) - self.verbose_error(res) - def tenant_detach(self, tenant_id: TenantId, detach_ignored=False, timeout_secs=None): - params = {} - if detach_ignored: - params["detach_ignored"] = "true" - - kwargs = {} - if timeout_secs is not None: - kwargs["timeout"] = timeout_secs - - res = self.post( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params, **kwargs + def tenant_detach(self, tenant_id: TenantId): + return self.tenant_location_conf( + tenant_id, + location_conf={ + "mode": "Detached", + 
"secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, ) - self.verbose_error(res) def tenant_reset(self, tenant_id: Union[TenantId, TenantShardId], drop_cache: bool): params = {} @@ -340,17 +322,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter): self.verbose_error(res) return res - def tenant_load(self, tenant_id: TenantId, generation=None): - body = None - if generation is not None: - body = {"generation": generation} - res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load", json=body) - self.verbose_error(res) - - def tenant_ignore(self, tenant_id: TenantId): - res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/ignore") - self.verbose_error(res) - def tenant_status( self, tenant_id: Union[TenantId, TenantShardId], activate: bool = False ) -> Dict[Any, Any]: @@ -390,6 +361,12 @@ class PageserverHttpClient(requests.Session, MetricsGetter): self.verbose_error(res) return (res.status_code, res.json()) + def tenant_secondary_status(self, tenant_id: Union[TenantId, TenantShardId]): + url = f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/status" + res = self.get(url) + self.verbose_error(res) + return res.json() + def set_tenant_config(self, tenant_id: Union[TenantId, TenantShardId], config: dict[str, Any]): assert "tenant_id" not in config.keys() res = self.put( @@ -587,6 +564,22 @@ class PageserverHttpClient(requests.Session, MetricsGetter): assert isinstance(res_json, dict) return res_json + def timeline_block_gc(self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId): + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/block_gc", + ) + log.info(f"Got GC request response code: {res.status_code}") + self.verbose_error(res) + + def timeline_unblock_gc( + self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId + ): + res = self.post( + 
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/unblock_gc", + ) + log.info(f"Got GC request response code: {res.status_code}") + self.verbose_error(res) + def timeline_compact( self, tenant_id: Union[TenantId, TenantShardId], @@ -594,6 +587,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): force_repartition=False, force_image_layer_creation=False, wait_until_uploaded=False, + enhanced_gc_bottom_most_compaction=False, ): self.is_testing_enabled_or_skip() query = {} @@ -603,6 +597,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter): query["force_image_layer_creation"] = "true" if wait_until_uploaded: query["wait_until_uploaded"] = "true" + if enhanced_gc_bottom_most_compaction: + query["enhanced_gc_bottom_most_compaction"] = "true" log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}") res = self.put( @@ -630,19 +626,37 @@ class PageserverHttpClient(requests.Session, MetricsGetter): tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, timestamp: datetime, + with_lease: bool = False, **kwargs, ): log.info( - f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}" + f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}, {with_lease=}" ) + with_lease_query = f"{with_lease=}".lower() res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp.isoformat()}Z", + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp.isoformat()}Z&{with_lease_query}", **kwargs, ) self.verbose_error(res) res_json = res.json() return res_json + def timeline_lsn_lease( + self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, lsn: Lsn + ): + data = { + "lsn": str(lsn), + } + + log.info(f"Requesting lsn lease for {lsn=}, {tenant_id=}, {timeline_id=}") + res = self.post( + 
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/lsn_lease", + json=data, + ) + self.verbose_error(res) + res_json = res.json() + return res_json + def timeline_get_timestamp_of_lsn( self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, lsn: Lsn ): @@ -672,6 +686,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter): force_repartition=False, force_image_layer_creation=False, wait_until_uploaded=False, + compact: Optional[bool] = None, + **kwargs, ): self.is_testing_enabled_or_skip() query = {} @@ -682,10 +698,14 @@ class PageserverHttpClient(requests.Session, MetricsGetter): if wait_until_uploaded: query["wait_until_uploaded"] = "true" + if compact is not None: + query["compact"] = "true" if compact else "false" + log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}") res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint", params=query, + **kwargs, ) log.info(f"Got checkpoint request response code: {res.status_code}") self.verbose_error(res) @@ -842,6 +862,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, batch_size: int | None = None, + **kwargs, ) -> Set[TimelineId]: params = {} if batch_size is not None: @@ -849,6 +870,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/detach_ancestor", params=params, + **kwargs, ) self.verbose_error(res) json = res.json() diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py index 8730d8ef75..3e0ffabf74 100644 --- a/test_runner/fixtures/pageserver/many_tenants.py +++ b/test_runner/fixtures/pageserver/many_tenants.py @@ -1,5 +1,4 @@ import concurrent.futures -import time from typing import Any, Callable, Dict, Tuple import fixtures.pageserver.remote_storage 
@@ -9,9 +8,6 @@ from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, ) -from fixtures.pageserver.utils import ( - wait_until_tenant_state, -) from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind @@ -42,46 +38,37 @@ def single_timeline( log.info("detach template tenant form pageserver") env.pageserver.tenant_detach(template_tenant) - env.pageserver.allowed_errors.append( - # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely - ".*Dropped remote consistent LSN updates.*", - ) log.info(f"duplicating template tenant {ncopies} times in S3") tenants = fixtures.pageserver.remote_storage.duplicate_tenant(env, template_tenant, ncopies) + # In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done. + # However, on-demand downloads are quite slow ATM. + # => do the on-demand downloads in Python. + log.info("python-side on-demand download the layer files into local tenant dir") + tenant_timelines = list(map(lambda tenant: (tenant, template_timeline), tenants)) + fixtures.pageserver.remote_storage.copy_all_remote_layer_files_to_local_tenant_dir( + env, tenant_timelines + ) + log.info("attach duplicated tenants to pageserver") # In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done. # However, on-demand downloads are quite slow ATM. # => do the on-demand downloads in Python. 
assert ps_http.tenant_list() == [] - # make the attach fail after it created enough on-disk state to retry loading - # the tenant next startup, but before it can start background loops that would start download - ps_http.configure_failpoints(("attach-before-activate", "return")) - env.pageserver.allowed_errors.append( - ".*attach failed, setting tenant state to Broken: attach-before-activate.*" - ) - def attach_broken(tenant): + def attach(tenant): env.pageserver.tenant_attach( tenant, config=template_config.copy(), generation=100, override_storage_controller_generation=True, ) - time.sleep(0.1) - wait_until_tenant_state(ps_http, tenant, "Broken", 10) with concurrent.futures.ThreadPoolExecutor(max_workers=22) as executor: - executor.map(attach_broken, tenants) + executor.map(attach, tenants) - env.pageserver.stop( - immediate=True - ) # clears the failpoint as a side-effect; immediate to avoid hitting neon_local's timeout - tenant_timelines = list(map(lambda tenant: (tenant, template_timeline), tenants)) - log.info("python-side on-demand download the layer files into local tenant dir") - fixtures.pageserver.remote_storage.copy_all_remote_layer_files_to_local_tenant_dir( - env, tenant_timelines - ) + # Benchmarks will start the pageserver explicitly themselves + env.pageserver.stop() return env diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 72384c138b..b75a480a63 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -198,7 +198,7 @@ def wait_for_last_record_lsn( lsn: Lsn, ) -> Lsn: """waits for pageserver to catch up to a certain lsn, returns the last observed lsn.""" - for i in range(100): + for i in range(1000): current_lsn = last_record_lsn(pageserver_http, tenant, timeline) if current_lsn >= lsn: return current_lsn @@ -430,52 +430,6 @@ def enable_remote_storage_versioning( return response -def wait_tenant_status_404( - pageserver_http: PageserverHttpClient, 
- tenant_id: TenantId, - iterations: int, - interval: float = 0.250, -): - def tenant_is_missing(): - data = {} - try: - data = pageserver_http.tenant_status(tenant_id) - log.info(f"tenant status {data}") - except PageserverApiException as e: - log.debug(e) - if e.status_code == 404: - return - - raise RuntimeError(f"Timeline exists state {data.get('state')}") - - wait_until(iterations, interval=interval, func=tenant_is_missing) - - -def tenant_delete_wait_completed( - pageserver_http: PageserverHttpClient, - tenant_id: TenantId, - iterations: int, - ignore_errors: bool = False, -): - if not ignore_errors: - pageserver_http.tenant_delete(tenant_id=tenant_id) - else: - interval = 0.5 - - def delete_request_sent(): - try: - pageserver_http.tenant_delete(tenant_id=tenant_id) - except PageserverApiException as e: - log.debug(e) - if e.status_code == 404: - return - except Exception as e: - log.debug(e) - - wait_until(iterations, interval=interval, func=delete_request_sent) - wait_tenant_status_404(pageserver_http, tenant_id=tenant_id, iterations=iterations) - - MANY_SMALL_LAYERS_TENANT_CONFIG = { "gc_period": "0s", "compaction_period": "0s", diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index 0227285822..92c98763e3 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -1,6 +1,7 @@ import os from typing import Any, Dict, Optional +import allure import pytest import toml from _pytest.python import Metafunc @@ -91,3 +92,23 @@ def pytest_generate_tests(metafunc: Metafunc): and (platform := os.getenv("PLATFORM")) is not None ): metafunc.parametrize("platform", [platform.lower()]) + + +@pytest.hookimpl(hookwrapper=True, tryfirst=True) +def pytest_runtest_makereport(*args, **kwargs): + # Add test parameters to Allue report to distinguish the same tests with different parameters. 
+ # Names has `__` prefix to avoid conflicts with `pytest.mark.parametrize` parameters + + # A mapping between `uname -m` and `RUNNER_ARCH` values. + # `RUNNER_ARCH` environment variable is set on GitHub Runners, + # possible values are X86, X64, ARM, or ARM64. + # See https://docs.github.com/en/actions/learn-github-actions/variables#default-environment-variables + uname_m = { + "aarch64": "ARM64", + "arm64": "ARM64", + "x86_64": "X64", + }.get(os.uname().machine, "UNKNOWN") + arch = os.getenv("RUNNER_ARCH", uname_m) + allure.dynamic.parameter("__arch", arch) + + yield diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py index 941889a2f5..e12c8e5f4a 100644 --- a/test_runner/fixtures/pg_version.py +++ b/test_runner/fixtures/pg_version.py @@ -3,8 +3,6 @@ import os from typing import Optional import pytest -from _pytest.config import Config -from _pytest.config.argparsing import Parser """ This fixture is used to determine which version of Postgres to use for tests. 
@@ -52,7 +50,7 @@ class PgVersion(str, enum.Enum): return None -DEFAULT_VERSION: PgVersion = PgVersion.V15 +DEFAULT_VERSION: PgVersion = PgVersion.V16 def skip_on_postgres(version: PgVersion, reason: str): @@ -69,22 +67,8 @@ def xfail_on_postgres(version: PgVersion, reason: str): ) -def pytest_addoption(parser: Parser): - parser.addoption( - "--pg-version", - action="store", - type=PgVersion, - help="DEPRECATED: Postgres version to use for tests", - ) - - def run_only_on_default_postgres(reason: str): return pytest.mark.skipif( PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) is not DEFAULT_VERSION, reason=reason, ) - - -def pytest_configure(config: Config): - if config.getoption("--pg-version"): - raise Exception("--pg-version is deprecated, use DEFAULT_PG_VERSION env var instead") diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index 6f6526d3fc..1b6c3c23ba 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -12,8 +12,9 @@ import boto3 import toml from mypy_boto3_s3 import S3Client -from fixtures.common_types import TenantId, TimelineId +from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.log_helper import log +from fixtures.pageserver.common_types import IndexPartDump TIMELINE_INDEX_PART_FILE_NAME = "index_part.json" TENANT_HEATMAP_FILE_NAME = "heatmap-v1.json" @@ -176,9 +177,14 @@ class S3Storage: def access_env_vars(self) -> Dict[str, str]: if self.aws_profile is not None: - return { + env = { "AWS_PROFILE": self.aws_profile, } + # Pass through HOME env var because AWS_PROFILE needs it in order to work + home = os.getenv("HOME") + if home is not None: + env["HOME"] = home + return env if self.access_key is not None and self.secret_key is not None: return { "AWS_ACCESS_KEY_ID": self.access_key, @@ -265,9 +271,38 @@ class S3Storage: def tenants_path(self) -> str: return f"{self.prefix_in_bucket}/tenants" - def 
tenant_path(self, tenant_id: TenantId) -> str: + def tenant_path(self, tenant_id: Union[TenantShardId, TenantId]) -> str: return f"{self.tenants_path()}/{tenant_id}" + def timeline_path( + self, tenant_id: Union[TenantShardId, TenantId], timeline_id: TimelineId + ) -> str: + return f"{self.tenant_path(tenant_id)}/timelines/{timeline_id}" + + def get_latest_index_key(self, index_keys: List[str]) -> str: + """ + Gets the latest index file key. + + @param index_keys: A list of index keys of different generations. + """ + + def parse_gen(index_key: str) -> int: + parts = index_key.split("index_part.json-") + return int(parts[-1], base=16) if len(parts) == 2 else -1 + + return max(index_keys, key=parse_gen) + + def download_index_part(self, index_key: str) -> IndexPartDump: + """ + Downloads the index content from remote storage. + + @param index_key: index key in remote storage. + """ + response = self.client.get_object(Bucket=self.bucket_name, Key=index_key) + body = response["Body"].read().decode("utf-8") + log.info(f"index_part.json: {body}") + return IndexPartDump.from_json(json.loads(body)) + def heatmap_key(self, tenant_id: TenantId) -> str: return f"{self.tenant_path(tenant_id)}/{TENANT_HEATMAP_FILE_NAME}" diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index 11e6fef28f..dd3a0a3d54 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -1,13 +1,13 @@ import json -import re -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple, Union import pytest import requests -from fixtures.common_types import Lsn, TenantId, TimelineId +from fixtures.common_types import Lsn, TenantId, TenantTimelineId, TimelineId from fixtures.log_helper import log +from fixtures.metrics import Metrics, MetricsGetter, parse_metrics # Walreceiver as returned by sk's timeline status endpoint. 
@@ -31,15 +31,26 @@ class SafekeeperTimelineStatus: walreceivers: List[Walreceiver] -@dataclass -class SafekeeperMetrics: +class SafekeeperMetrics(Metrics): + # Helpers to get metrics from tests without hardcoding the metric names there. # These are metrics from Prometheus which uses float64 internally. # As a consequence, values may differ from real original int64s. - flush_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict) - commit_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict) + + def __init__(self, m: Metrics): + self.metrics = m.metrics + + def flush_lsn_inexact(self, tenant_id: TenantId, timeline_id: TimelineId): + return self.query_one( + "safekeeper_flush_lsn", {"tenant_id": str(tenant_id), "timeline_id": str(timeline_id)} + ).value + + def commit_lsn_inexact(self, tenant_id: TenantId, timeline_id: TimelineId): + return self.query_one( + "safekeeper_commit_lsn", {"tenant_id": str(tenant_id), "timeline_id": str(timeline_id)} + ).value -class SafekeeperHttpClient(requests.Session): +class SafekeeperHttpClient(requests.Session, MetricsGetter): HTTPError = requests.HTTPError def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled=False): @@ -133,6 +144,12 @@ class SafekeeperHttpClient(requests.Session): assert isinstance(res_json, dict) return res_json + def timeline_list(self) -> List[TenantTimelineId]: + res = self.get(f"http://localhost:{self.port}/v1/tenant/timeline") + res.raise_for_status() + resj = res.json() + return [TenantTimelineId.from_json(ttidj) for ttidj in resj] + def timeline_create( self, tenant_id: TenantId, @@ -209,28 +226,11 @@ class SafekeeperHttpClient(requests.Session): return res_json def get_metrics_str(self) -> str: + """You probably want to use get_metrics() instead.""" request_result = self.get(f"http://localhost:{self.port}/metrics") request_result.raise_for_status() return request_result.text def get_metrics(self) -> SafekeeperMetrics: - 
all_metrics_text = self.get_metrics_str() - - metrics = SafekeeperMetrics() - for match in re.finditer( - r'^safekeeper_flush_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', - all_metrics_text, - re.MULTILINE, - ): - metrics.flush_lsn_inexact[(TenantId(match.group(1)), TimelineId(match.group(2)))] = int( - match.group(3) - ) - for match in re.finditer( - r'^safekeeper_commit_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', - all_metrics_text, - re.MULTILINE, - ): - metrics.commit_lsn_inexact[ - (TenantId(match.group(1)), TimelineId(match.group(2))) - ] = int(match.group(3)) - return metrics + res = self.get_metrics_str() + return SafekeeperMetrics(parse_metrics(res)) diff --git a/test_runner/fixtures/storage_controller_proxy.py b/test_runner/fixtures/storage_controller_proxy.py new file mode 100644 index 0000000000..3477f8b1f2 --- /dev/null +++ b/test_runner/fixtures/storage_controller_proxy.py @@ -0,0 +1,73 @@ +import re +from typing import Any, Optional + +import pytest +import requests +from pytest_httpserver import HTTPServer +from werkzeug.datastructures import Headers +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response + +from fixtures.log_helper import log + + +class StorageControllerProxy: + def __init__(self, server: HTTPServer): + self.server: HTTPServer = server + self.listen: str = f"http://{server.host}:{server.port}" + self.routing_to: Optional[str] = None + + def route_to(self, storage_controller_api: str): + self.routing_to = storage_controller_api + + def port(self) -> int: + return self.server.port + + def upcall_api_endpoint(self) -> str: + return f"{self.listen}/upcall/v1" + + +def proxy_request(method: str, url: str, **kwargs) -> requests.Response: + return requests.request(method, url, **kwargs) + + +@pytest.fixture(scope="function") +def storage_controller_proxy(make_httpserver): + """ + Proxies requests into the storage controller to the currently + selected storage 
controller instance via `StorageControllerProxy.route_to`. + + This fixture is intended for tests that need to run multiple instances + of the storage controller at the same time. + """ + server = make_httpserver + + self = StorageControllerProxy(server) + + log.info(f"Storage controller proxy listening on {self.listen}") + + def handler(request: Request): + if self.route_to is None: + log.info(f"Storage controller proxy has no routing configured for {request.url}") + return Response("Routing not configured", status=503) + + route_to_url = f"{self.routing_to}{request.path}" + + log.info(f"Routing {request.url} to {route_to_url}") + + args: dict[str, Any] = {"headers": request.headers} + if request.is_json: + args["json"] = request.json + + response = proxy_request(request.method, route_to_url, **args) + + headers = Headers() + for key, value in response.headers.items(): + headers.add(key, value) + + return Response(response.content, headers=headers, status=response.status_code) + + self.server.expect_request(re.compile(".*")).respond_with_handler(handler) + + yield self + server.clear() diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 0989dc1893..80f1c9e4e3 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -240,9 +240,18 @@ ATTACHMENT_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg] ) -def allure_attach_from_dir(dir: Path): +def allure_attach_from_dir(dir: Path, preserve_database_files: bool = False): """Attach all non-empty files from `dir` that matches `ATTACHMENT_NAME_REGEX` to Allure report""" + if preserve_database_files: + zst_file = dir.with_suffix(".tar.zst") + with zst_file.open("wb") as zst: + cctx = zstandard.ZstdCompressor() + with cctx.stream_writer(zst) as compressor: + with tarfile.open(fileobj=compressor, mode="w") as tar: + tar.add(dir, arcname="") + allure.attach.file(zst_file, "everything.tar.zst", "application/zstd", "tar.zst") + for attachment in 
Path(dir).glob("**/*"): if ATTACHMENT_NAME_REGEX.fullmatch(attachment.name) and attachment.stat().st_size > 0: name = str(attachment.relative_to(dir)) @@ -380,7 +389,10 @@ WaitUntilRet = TypeVar("WaitUntilRet") def wait_until( - number_of_iterations: int, interval: float, func: Callable[[], WaitUntilRet] + number_of_iterations: int, + interval: float, + func: Callable[[], WaitUntilRet], + show_intermediate_error=False, ) -> WaitUntilRet: """ Wait until 'func' returns successfully, without exception. Returns the @@ -391,8 +403,10 @@ def wait_until( try: res = func() except Exception as e: - log.info("waiting for %s iteration %s failed", func, i + 1) + log.info("waiting for %s iteration %s failed: %s", func, i + 1, e) last_exception = e + if show_intermediate_error: + log.info(e) time.sleep(interval) continue return res diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index dfd9caba3e..065a78bf9b 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -10,7 +10,7 @@ from fixtures.neon_fixtures import ( tenant_get_shards, wait_for_last_flush_lsn, ) -from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload +from fixtures.pageserver.utils import wait_for_last_record_lsn # neon_local doesn't handle creating/modifying endpoints concurrently, so we use a mutex # to ensure we don't do that: this enables running lots of Workloads in parallel safely. 
@@ -174,22 +174,17 @@ class Workload: if upload: # Wait for written data to be uploaded to S3 (force a checkpoint to trigger upload) - ps_http.timeline_checkpoint(tenant_shard_id, self.timeline_id) - wait_for_upload(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn) + ps_http.timeline_checkpoint( + tenant_shard_id, self.timeline_id, wait_until_uploaded=True + ) log.info(f"Churn: waiting for remote LSN {last_flush_lsn}") else: log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}") def validate(self, pageserver_id: Optional[int] = None): endpoint = self.endpoint(pageserver_id) - result = endpoint.safe_psql_many( - [ - "select clear_buffer_cache()", - f""" - SELECT COUNT(*) FROM {self.table} - """, - ] - ) + endpoint.clear_shared_buffers() + result = endpoint.safe_psql(f"SELECT COUNT(*) FROM {self.table}") log.info(f"validate({self.expect_rows}): {result}") - assert result == [[("",)], [(self.expect_rows,)]] + assert result == [(self.expect_rows,)] diff --git a/test_runner/logical_repl/README.md b/test_runner/logical_repl/README.md new file mode 100644 index 0000000000..8eca056dda --- /dev/null +++ b/test_runner/logical_repl/README.md @@ -0,0 +1,22 @@ +# Logical replication tests + +## Clickhouse + +```bash +export BENCHMARK_CONNSTR=postgres://user:pass@ep-abc-xyz-123.us-east-2.aws.neon.build/neondb + +docker compose -f clickhouse/docker-compose.yml up -d +pytest -m remote_cluster -k test_clickhouse +docker compose -f clickhouse/docker-compose.yml down +``` + +## Debezium + +```bash +export BENCHMARK_CONNSTR=postgres://user:pass@ep-abc-xyz-123.us-east-2.aws.neon.build/neondb + +docker compose -f debezium/docker-compose.yml up -d +pytest -m remote_cluster -k test_debezium +docker compose -f debezium/docker-compose.yml down + +``` \ No newline at end of file diff --git a/test_runner/logical_repl/clickhouse/docker-compose.yml b/test_runner/logical_repl/clickhouse/docker-compose.yml new file mode 100644 index 0000000000..e00038b811 --- /dev/null 
+++ b/test_runner/logical_repl/clickhouse/docker-compose.yml @@ -0,0 +1,9 @@ +services: + clickhouse: + image: clickhouse/clickhouse-server + user: "101:101" + container_name: clickhouse + hostname: clickhouse + ports: + - 127.0.0.1:8123:8123 + - 127.0.0.1:9000:9000 diff --git a/test_runner/logical_repl/debezium/docker-compose.yml b/test_runner/logical_repl/debezium/docker-compose.yml new file mode 100644 index 0000000000..fee127a2fd --- /dev/null +++ b/test_runner/logical_repl/debezium/docker-compose.yml @@ -0,0 +1,24 @@ +services: + zookeeper: + image: quay.io/debezium/zookeeper:2.7 + kafka: + image: quay.io/debezium/kafka:2.7 + environment: + ZOOKEEPER_CONNECT: "zookeeper:2181" + KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092 + KAFKA_BROKER_ID: 1 + KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 + KAFKA_JMX_PORT: 9991 + ports: + - 127.0.0.1:9092:9092 + debezium: + image: quay.io/debezium/connect:2.7 + environment: + BOOTSTRAP_SERVERS: kafka:9092 + GROUP_ID: 1 + CONFIG_STORAGE_TOPIC: debezium-config + OFFSET_STORAGE_TOPIC: debezium-offset + STATUS_STORAGE_TOPIC: debezium-status + DEBEZIUM_CONFIG_CONNECTOR_CLASS: io.debezium.connector.postgresql.PostgresConnector + ports: + - 127.0.0.1:8083:8083 diff --git a/test_runner/logical_repl/test_clickhouse.py b/test_runner/logical_repl/test_clickhouse.py new file mode 100644 index 0000000000..c5ed9bc8af --- /dev/null +++ b/test_runner/logical_repl/test_clickhouse.py @@ -0,0 +1,82 @@ +""" +Test the logical replication in Neon with ClickHouse as a consumer +""" + +import hashlib +import os +import time + +import clickhouse_connect +import psycopg2 +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import RemotePostgres +from fixtures.utils import wait_until + + +def query_clickhouse( + client, + query: str, + digest: str, +) -> None: + """ + Run the query on the client + return answer if successful, raise an exception otherwise + """ + log.debug("Query: %s", query) + res = client.query(query) + 
log.debug(res.result_rows) + m = hashlib.sha1() + m.update(repr(tuple(res.result_rows)).encode()) + hash_res = m.hexdigest() + log.debug("Hash: %s", hash_res) + if hash_res == digest: + return + raise ValueError("Hash mismatch") + + +@pytest.mark.remote_cluster +def test_clickhouse(remote_pg: RemotePostgres): + """ + Test the logical replication having ClickHouse as a client + """ + clickhouse_host = "clickhouse" if ("CI" in os.environ) else "127.0.0.1" + conn_options = remote_pg.conn_options() + conn = psycopg2.connect(remote_pg.connstr()) + cur = conn.cursor() + cur.execute("DROP TABLE IF EXISTS table1") + cur.execute("CREATE TABLE table1 (id integer primary key, column1 varchar(10));") + cur.execute("INSERT INTO table1 (id, column1) VALUES (1, 'abc'), (2, 'def');") + conn.commit() + client = clickhouse_connect.get_client(host=clickhouse_host) + client.command("SET allow_experimental_database_materialized_postgresql=1") + client.command( + "CREATE DATABASE db1_postgres ENGINE = " + f"MaterializedPostgreSQL('{conn_options['host']}', " + f"'{conn_options['dbname']}', " + f"'{conn_options['user']}', '{conn_options['password']}') " + "SETTINGS materialized_postgresql_tables_list = 'table1';" + ) + wait_until( + 120, + 0.5, + lambda: query_clickhouse( + client, + "select * from db1_postgres.table1 order by 1", + "ee600d8f7cd05bd0b169fa81f44300a9dd10085a", + ), + ) + cur.execute("INSERT INTO table1 (id, column1) VALUES (3, 'ghi'), (4, 'jkl');") + conn.commit() + wait_until( + 120, + 0.5, + lambda: query_clickhouse( + client, + "select * from db1_postgres.table1 order by 1", + "9eba2daaf7e4d7d27ac849525f68b562ab53947d", + ), + ) + log.debug("Sleeping before final checking if Neon is still alive") + time.sleep(3) + cur.execute("SELECT 1") diff --git a/test_runner/logical_repl/test_debezium.py b/test_runner/logical_repl/test_debezium.py new file mode 100644 index 0000000000..5426a06ca1 --- /dev/null +++ b/test_runner/logical_repl/test_debezium.py @@ -0,0 +1,190 @@ +""" 
+Test the logical replication in Neon with Debezium as a consumer +""" + +import json +import os +import time + +import psycopg2 +import pytest +import requests +from fixtures.log_helper import log +from fixtures.neon_fixtures import RemotePostgres +from fixtures.utils import wait_until + + +class DebeziumAPI: + """ + The class for Debezium API calls + """ + + def __init__(self): + self.__host = "debezium" if ("CI" in os.environ) else "127.0.0.1" + self.__base_url = f"http://{self.__host}:8083" + self.__connectors_url = f"{self.__base_url}/connectors" + + def __request(self, method, addurl="", **kwargs): + return requests.request( + method, + self.__connectors_url + addurl, + headers={"Accept": "application/json", "Content-type": "application/json"}, + timeout=60, + **kwargs, + ) + + def create_pg_connector(self, remote_pg: RemotePostgres, dbz_conn_name: str): + """ + Create a Postgres connector in debezium + """ + conn_options = remote_pg.conn_options() + payload = { + "name": dbz_conn_name, + "config": { + "connector.class": "io.debezium.connector.postgresql.PostgresConnector", + "tasks.max": "1", + "database.hostname": conn_options["host"], + "database.port": "5432", + "database.user": conn_options["user"], + "database.password": conn_options["password"], + "database.dbname": conn_options["dbname"], + "plugin.name": "pgoutput", + "topic.prefix": "dbserver1", + "schema.include.list": "inventory", + }, + } + return self.__request("POST", json=payload) + + def list_connectors(self): + """ + Returns a list of all connectors existent in Debezium. 
+ """ + resp = self.__request("GET") + assert resp.ok + return json.loads(resp.text) + + def del_connector(self, connector): + """ + Deletes the specified connector + """ + return self.__request("DELETE", f"/{connector}") + + +@pytest.fixture(scope="function") +def debezium(remote_pg: RemotePostgres): + """ + Prepare the Debezium API handler, connection + """ + conn = psycopg2.connect(remote_pg.connstr()) + cur = conn.cursor() + cur.execute("DROP SCHEMA IF EXISTS inventory CASCADE") + cur.execute("CREATE SCHEMA inventory") + cur.execute( + "CREATE TABLE inventory.customers (" + "id SERIAL NOT NULL PRIMARY KEY," + "first_name character varying(255) NOT NULL," + "last_name character varying(255) NOT NULL," + "email character varying(255) NOT NULL)" + ) + conn.commit() + dbz = DebeziumAPI() + assert len(dbz.list_connectors()) == 0 + dbz_conn_name = "inventory-connector" + resp = dbz.create_pg_connector(remote_pg, dbz_conn_name) + log.debug("%s %s %s", resp.status_code, resp.ok, resp.text) + assert resp.status_code == 201 + assert len(dbz.list_connectors()) == 1 + from kafka import KafkaConsumer + + consumer = KafkaConsumer( + "dbserver1.inventory.customers", + bootstrap_servers=["kafka:9092"], + auto_offset_reset="earliest", + enable_auto_commit=False, + ) + yield conn, consumer + resp = dbz.del_connector(dbz_conn_name) + assert resp.status_code == 204 + + +def get_kafka_msg(consumer, ts_ms, before=None, after=None) -> None: + """ + Gets the message from Kafka and checks its validity + Arguments: + consumer: the consumer object + ts_ms: timestamp in milliseconds of the change of db, the corresponding message must have + the later timestamp + before: a dictionary, if not None, the before field from the kafka message must + have the same values for the same keys + after: a dictionary, if not None, the after field from the kafka message must + have the same values for the same keys + """ + msg = consumer.poll() + assert msg, "Empty message" + for val in msg.values(): + r 
= json.loads(val[-1].value) + log.info(r["payload"]) + assert ts_ms < r["payload"]["ts_ms"], "Incorrect timestamp" + for param, pname in ((before, "before"), (after, "after")): + if param is not None: + for k, v in param.items(): + assert r["payload"][pname][k] == v, f"{pname} mismatches" + + +@pytest.mark.remote_cluster +def test_debezium(debezium): + """ + Test the logical replication having Debezium as a subscriber + """ + conn, consumer = debezium + cur = conn.cursor() + ts_ms = time.time() * 1000 + log.info("Insert 1 ts_ms: %s", ts_ms) + cur.execute( + "insert into inventory.customers (first_name, last_name, email) " + "values ('John', 'Dow','johndow@example.com')" + ) + conn.commit() + wait_until( + 100, + 0.5, + lambda: get_kafka_msg( + consumer, + ts_ms, + after={"first_name": "John", "last_name": "Dow", "email": "johndow@example.com"}, + ), + show_intermediate_error=True, + ) + ts_ms = time.time() * 1000 + log.info("Insert 2 ts_ms: %s", ts_ms) + cur.execute( + "insert into inventory.customers (first_name, last_name, email) " + "values ('Alex', 'Row','alexrow@example.com')" + ) + conn.commit() + wait_until( + 100, + 0.5, + lambda: get_kafka_msg( + consumer, + ts_ms, + after={"first_name": "Alex", "last_name": "Row", "email": "alexrow@example.com"}, + ), + show_intermediate_error=True, + ) + ts_ms = time.time() * 1000 + log.info("Update ts_ms: %s", ts_ms) + cur.execute("update inventory.customers set first_name = 'Alexander' where id = 2") + conn.commit() + wait_until( + 100, + 0.5, + lambda: get_kafka_msg( + consumer, + ts_ms, + after={"first_name": "Alexander"}, + ), + show_intermediate_error=True, + ) + time.sleep(3) + cur.execute("select 1") diff --git a/test_runner/performance/README.md b/test_runner/performance/README.md index 7ad65821d4..70d75a6dcf 100644 --- a/test_runner/performance/README.md +++ b/test_runner/performance/README.md @@ -7,7 +7,7 @@ easier to see if you have compile errors without scrolling up. 
You may also need to run `./scripts/pysync`. Then run the tests -`DEFAULT_PG_VERSION=15 NEON_BIN=./target/release poetry run pytest test_runner/performance` +`DEFAULT_PG_VERSION=16 NEON_BIN=./target/release poetry run pytest test_runner/performance` Some handy pytest flags for local development: - `-x` tells pytest to stop on first error diff --git a/test_runner/performance/pageserver/README.md b/test_runner/performance/pageserver/README.md index fdd09cd946..56ffad9963 100644 --- a/test_runner/performance/pageserver/README.md +++ b/test_runner/performance/pageserver/README.md @@ -11,6 +11,6 @@ It supports mounting snapshots using overlayfs, which improves iteration time. Here's a full command line. ``` -RUST_BACKTRACE=1 NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 DEFAULT_PG_VERSION=15 BUILD_TYPE=release \ +RUST_BACKTRACE=1 NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 DEFAULT_PG_VERSION=16 BUILD_TYPE=release \ ./scripts/pytest test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py ```` diff --git a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py index 0ff9c8fdaa..8d781c1609 100644 --- a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py +++ b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py @@ -14,7 +14,7 @@ from performance.pageserver.util import ensure_pageserver_ready_for_benchmarking """ Usage: -DEFAULT_PG_VERSION=15 BUILD_TYPE=debug NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 INTERACTIVE=true \ +DEFAULT_PG_VERSION=16 BUILD_TYPE=debug NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 INTERACTIVE=true \ ./scripts/pytest --timeout 0 test_runner/performance/pageserver/interactive/test_many_small_tenants.py """ @@ -55,10 +55,6 @@ def setup_env( } template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) 
env.pageserver.tenant_detach(template_tenant) - env.pageserver.allowed_errors.append( - # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely - ".*Dropped remote consistent LSN updates.*", - ) env.pageserver.tenant_attach(template_tenant, config) ep = env.endpoints.create_start("main", tenant_id=template_tenant) ep.safe_psql("create table foo(b text)") diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py index b66db4d0ab..8b934057e4 100644 --- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py +++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py @@ -17,13 +17,11 @@ from performance.pageserver.util import ( @pytest.mark.parametrize("duration", [30]) @pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)]) @pytest.mark.parametrize("n_tenants", [10]) -@pytest.mark.parametrize("get_vectored_impl", ["sequential", "vectored"]) @pytest.mark.timeout(1000) def test_basebackup_with_high_slru_count( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, pg_bin: PgBin, - get_vectored_impl: str, n_tenants: int, pgbench_scale: int, duration: int, @@ -46,8 +44,7 @@ def test_basebackup_with_high_slru_count( page_cache_size = 16384 max_file_descriptors = 500000 neon_env_builder.pageserver_config_override = ( - f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}; " - f"get_vectored_impl='{get_vectored_impl}'; validate_vectored_get=false" + f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}" ) params.update( { @@ -86,10 +83,6 @@ def setup_tenant_template(env: NeonEnv, n_txns: int): template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) env.pageserver.tenant_detach(template_tenant) - env.pageserver.allowed_errors.append( - # tenant detach causes this because 
the underlying attach-hook removes the tenant from storage controller entirely - ".*Dropped remote consistent LSN updates.*", - ) env.pageserver.tenant_attach(template_tenant, config) ps_http = env.pageserver.http_client() diff --git a/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py b/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py index 644c1f559b..9ad6e7907c 100644 --- a/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py +++ b/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py @@ -5,8 +5,12 @@ from typing import Any, Dict, Tuple import pytest from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin, wait_for_last_flush_lsn -from fixtures.pageserver.utils import wait_for_upload_queue_empty +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + PgBin, + flush_ep_to_pageserver, +) from fixtures.remote_storage import s3_storage from fixtures.utils import humantime_to_ms @@ -95,9 +99,9 @@ def setup_env(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): f"INSERT INTO data SELECT lpad(i::text, {bytes_per_row}, '0') FROM generate_series(1, {int(nrows)}) as i", options="-c statement_timeout=0", ) - wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id) - # TODO: this is a bit imprecise, there could be frozen layers being written out that we don't observe here - wait_for_upload_queue_empty(client, tenant_id, timeline_id) + flush_ep_to_pageserver(env, ep, tenant_id, timeline_id) + + client.timeline_checkpoint(tenant_id, timeline_id, compact=False, wait_until_uploaded=True) return env diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py index 68f3d9dcbe..949813c984 100644 --- 
a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -1,4 +1,5 @@ import json +import os from pathlib import Path from typing import Any, Dict, Tuple @@ -17,30 +18,74 @@ from performance.pageserver.util import ( setup_pageserver_with_tenants, ) +# The following tests use pagebench "getpage at latest LSN" to characterize the throughput of the pageserver. +# originally there was a single test named `test_pageserver_max_throughput_getpage_at_latest_lsn`` +# so you still see some references to this name in the code. +# To avoid recreating the snapshots for each test, we continue to use the name `max_throughput_latest_lsn` +# for some files and metrics. + # For reference, the space usage of the snapshots: -# admin@ip-172-31-13-23:[~/neon-main]: sudo du -hs /instance_store/test_output/shared-snapshots -# 137G /instance_store/test_output/shared-snapshots -# admin@ip-172-31-13-23:[~/neon-main]: sudo du -hs /instance_store/test_output/shared-snapshots/* -# 1.8G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-1-13 -# 1.1G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-1-6 -# 8.5G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-10-13 -# 5.1G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-10-6 -# 76G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-13 -# 46G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-6 -@pytest.mark.parametrize("duration", [30]) -@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(s) for s in [100, 200]]) -@pytest.mark.parametrize("n_tenants", [1, 10]) -@pytest.mark.timeout( - 10000 -) # TODO: this value is just "a really high number"; have this per instance type -def test_pageserver_max_throughput_getpage_at_latest_lsn( +# sudo du -hs 
/instance_store/neon/test_output/shared-snapshots/* +# 416G /instance_store/neon/test_output/shared-snapshots/max_throughput_latest_lsn-500-13 +@pytest.mark.parametrize("duration", [60 * 60]) +@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)]) +@pytest.mark.parametrize("n_tenants", [500]) +@pytest.mark.timeout(10000) +@pytest.mark.skipif( + os.getenv("CI", "false") == "true", + reason="This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI", +) +def test_pageserver_characterize_throughput_with_n_tenants( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, pg_bin: PgBin, n_tenants: int, pgbench_scale: int, duration: int, +): + setup_and_run_pagebench_benchmark( + neon_env_builder, zenbenchmark, pg_bin, n_tenants, pgbench_scale, duration, 1 + ) + + +# For reference, the space usage of the snapshots: +# sudo du -hs /instance_store/neon/test_output/shared-snapshots/* +# 19G /instance_store/neon/test_output/shared-snapshots/max_throughput_latest_lsn-1-136 +@pytest.mark.parametrize("duration", [20 * 60]) +@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(2048)]) +# we use 1 client to characterize latencies, and 64 clients to characterize throughput/scalability +# we use 64 clients because typically for a high number of connections we recommend the connection pooler +# which by default uses 64 connections +@pytest.mark.parametrize("n_clients", [1, 64]) +@pytest.mark.parametrize("n_tenants", [1]) +@pytest.mark.timeout(2400) +@pytest.mark.skipif( + os.getenv("CI", "false") == "true", + reason="This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI", +) +def test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant( + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, + pg_bin: PgBin, + n_tenants: int, + pgbench_scale: int, + duration: int, + n_clients: int, +): + 
setup_and_run_pagebench_benchmark( + neon_env_builder, zenbenchmark, pg_bin, n_tenants, pgbench_scale, duration, n_clients + ) + + +def setup_and_run_pagebench_benchmark( + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, + pg_bin: PgBin, + n_tenants: int, + pgbench_scale: int, + duration: int, + n_clients: int, ): def record(metric, **kwargs): zenbenchmark.record( @@ -55,6 +100,7 @@ def test_pageserver_max_throughput_getpage_at_latest_lsn( "n_tenants": (n_tenants, {"unit": ""}), "pgbench_scale": (pgbench_scale, {"unit": ""}), "duration": (duration, {"unit": "s"}), + "n_clients": (n_clients, {"unit": ""}), } ) @@ -85,6 +131,8 @@ def test_pageserver_max_throughput_getpage_at_latest_lsn( f"max_throughput_latest_lsn-{n_tenants}-{pgbench_scale}", n_tenants, setup_wrapper, + # https://github.com/neondatabase/neon/issues/8070 + timeout_in_seconds=60, ) env.pageserver.allowed_errors.append( @@ -94,7 +142,7 @@ def test_pageserver_max_throughput_getpage_at_latest_lsn( r".*query handler for.*pagestream.*failed: unexpected message: CopyFail during COPY.*" ) - run_benchmark_max_throughput_latest_lsn(env, pg_bin, record, duration) + run_pagebench_benchmark(env, pg_bin, record, duration, n_clients) def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int): @@ -116,10 +164,6 @@ def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int): } template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) env.pageserver.tenant_detach(template_tenant) - env.pageserver.allowed_errors.append( - # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely - ".*Dropped remote consistent LSN updates.*", - ) env.pageserver.tenant_attach(template_tenant, config) ps_http = env.pageserver.http_client() with env.endpoints.create_start("main", tenant_id=template_tenant) as ep: @@ -155,8 +199,8 @@ def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int): return (template_tenant, 
template_timeline, config) -def run_benchmark_max_throughput_latest_lsn( - env: NeonEnv, pg_bin: PgBin, record, duration_secs: int +def run_pagebench_benchmark( + env: NeonEnv, pg_bin: PgBin, record, duration_secs: int, n_clients: int ): """ Benchmark `env.pageserver` for max throughput @ latest LSN and record results in `zenbenchmark`. @@ -170,6 +214,8 @@ def run_benchmark_max_throughput_latest_lsn( ps_http.base_url, "--page-service-connstring", env.pageserver.connstr(password=None), + "--num-clients", + str(n_clients), "--runtime", f"{duration_secs}s", # don't specify the targets explicitly, let pagebench auto-discover them @@ -209,11 +255,3 @@ def run_benchmark_max_throughput_latest_lsn( unit="ms", report=MetricReport.LOWER_IS_BETTER, ) - - env.storage_controller.allowed_errors.append( - # The test setup swaps NeonEnv instances, hence different - # pg instances are used for the storage controller db. This means - # the storage controller doesn't know about the nodes mentioned - # in attachments.json at start-up. 
- ".* Scheduler missing node 1", - ) diff --git a/test_runner/performance/pageserver/util.py b/test_runner/performance/pageserver/util.py index f31cd9a9f8..88296a7fbd 100644 --- a/test_runner/performance/pageserver/util.py +++ b/test_runner/performance/pageserver/util.py @@ -2,7 +2,7 @@ Utilities used by all code in this sub-directory """ -from typing import Any, Callable, Dict, Tuple +from typing import Any, Callable, Dict, Optional, Tuple import fixtures.pageserver.many_tenants as many_tenants from fixtures.common_types import TenantId, TimelineId @@ -22,7 +22,7 @@ def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int): log.info("wait for all tenants to become active") wait_until_all_tenants_state( - ps_http, "Active", iterations=n_tenants, period=1, http_error_ok=False + ps_http, "Active", iterations=10 + n_tenants, period=1, http_error_ok=False ) # ensure all layers are resident for predictiable performance @@ -41,6 +41,7 @@ def setup_pageserver_with_tenants( name: str, n_tenants: int, setup: Callable[[NeonEnv], Tuple[TenantId, TimelineId, Dict[str, Any]]], + timeout_in_seconds: Optional[int] = None, ) -> NeonEnv: """ Utility function to set up a pageserver with a given number of identical tenants. 
@@ -50,6 +51,6 @@ def setup_pageserver_with_tenants( return many_tenants.single_timeline(neon_env_builder, setup, n_tenants) env = neon_env_builder.build_and_use_snapshot(name, doit) - env.start() + env.start(timeout_in_seconds=timeout_in_seconds) ensure_pageserver_ready_for_benchmarking(env, n_tenants) return env diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py index 3f56da7c1d..69df7974b9 100644 --- a/test_runner/performance/test_bulk_insert.py +++ b/test_runner/performance/test_bulk_insert.py @@ -1,10 +1,9 @@ from contextlib import closing -import pytest from fixtures.benchmark_fixture import MetricReport from fixtures.common_types import Lsn from fixtures.compare_fixtures import NeonCompare, PgCompare -from fixtures.pageserver.utils import wait_tenant_status_404 +from fixtures.log_helper import log from fixtures.pg_version import PgVersion @@ -18,7 +17,6 @@ from fixtures.pg_version import PgVersion # 3. Disk space used # 4. Peak memory usage # -@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/7124") def test_bulk_insert(neon_with_baseline: PgCompare): env = neon_with_baseline @@ -31,8 +29,8 @@ def test_bulk_insert(neon_with_baseline: PgCompare): # Run INSERT, recording the time and I/O it takes with env.record_pageserver_writes("pageserver_writes"): with env.record_duration("insert"): - cur.execute("insert into huge values (generate_series(1, 5000000), 0);") - env.flush() + cur.execute("insert into huge values (generate_series(1, 20000000), 0);") + env.flush(compact=False, gc=False) env.report_peak_memory_use() env.report_size() @@ -50,6 +48,9 @@ def test_bulk_insert(neon_with_baseline: PgCompare): if isinstance(env, NeonCompare): measure_recovery_time(env) + with env.record_duration("compaction"): + env.compact() + def measure_recovery_time(env: NeonCompare): client = env.env.pageserver.http_client() @@ -68,12 +69,13 @@ def measure_recovery_time(env: NeonCompare): (attach_gen, _) = 
attach_status client.tenant_delete(env.tenant) - wait_tenant_status_404(client, env.tenant, iterations=60, interval=0.5) env.env.pageserver.tenant_create(tenant_id=env.tenant, generation=attach_gen) # Measure recovery time with env.record_duration("wal_recovery"): + log.info("Entering recovery...") client.timeline_create(pg_version, env.tenant, env.timeline) # Flush, which will also wait for lsn to catch up - env.flush() + env.flush(compact=False, gc=False) + log.info("Finished recovery.") diff --git a/test_runner/performance/test_compaction.py b/test_runner/performance/test_compaction.py index 326c4f5c6f..3c6f0b0131 100644 --- a/test_runner/performance/test_compaction.py +++ b/test_runner/performance/test_compaction.py @@ -2,6 +2,7 @@ from contextlib import closing import pytest from fixtures.compare_fixtures import NeonCompare +from fixtures.log_helper import log from fixtures.neon_fixtures import wait_for_last_flush_lsn @@ -56,3 +57,98 @@ def test_compaction(neon_compare: NeonCompare): pageserver_http.timeline_compact(tenant_id, timeline_id) neon_compare.report_size() + + +def test_compaction_l0_memory(neon_compare: NeonCompare): + """ + Generate a large stack of L0s pending compaction into L1s, and + measure the pageserver's peak RSS while doing so + """ + + env = neon_compare.env + pageserver_http = env.pageserver.http_client() + + tenant_id, timeline_id = env.neon_cli.create_tenant( + conf={ + # Initially disable compaction so that we will build up a stack of L0s + "compaction_period": "0s", + "gc_period": "0s", + } + ) + neon_compare.tenant = tenant_id + neon_compare.timeline = timeline_id + + endpoint = env.endpoints.create_start( + "main", tenant_id=tenant_id, config_lines=["shared_buffers=512MB"] + ) + + # Read tenant effective config and assert on checkpoint_distance and compaction_threshold, + # as we do want to test with defaults (to be same as the field), but this test's workload size makes assumptions about them. 
+ # + # If these assertions fail, it probably means we changed the default. + tenant_conf = pageserver_http.tenant_config(tenant_id) + assert tenant_conf.effective_config["checkpoint_distance"] == 256 * 1024 * 1024 + assert tenant_conf.effective_config["compaction_threshold"] == 10 + + # Aim to write about 20 L0s, so that we will hit the limit on how many + # to compact at once + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + for i in range(200): + cur.execute(f"create table tbl{i} (i int, j int);") + cur.execute(f"insert into tbl{i} values (generate_series(1, 1000), 0);") + for j in range(100): + cur.execute(f"update tbl{i} set j = {j};") + + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + endpoint.stop() + + # Check we have generated the L0 stack we expected + layers = pageserver_http.layer_map_info(tenant_id, timeline_id) + initial_l0s = len(layers.delta_l0_layers()) + initial_l0s_size = sum(x.layer_file_size for x in layers.delta_l0_layers()) + log.info(f"l0s before compaction {initial_l0s} ({initial_l0s_size})") + + def rss_hwm(): + v = pageserver_http.get_metric_value("libmetrics_maxrss_kb") + assert v is not None + assert v > 0 + return v * 1024 + + before = rss_hwm() + pageserver_http.timeline_compact(tenant_id, timeline_id) + after = rss_hwm() + + log.info(f"RSS across compaction: {before} -> {after} (grew {after - before})") + + layers = pageserver_http.layer_map_info(tenant_id, timeline_id) + final_l0s_size = sum(x.layer_file_size for x in layers.delta_l0_layers()) + log.info(f"l0s after compaction {len(layers.delta_l0_layers())} ({final_l0s_size})") + + assert after > before # If we didn't use some memory the test is probably buggy + compaction_mapped_rss = after - before + + # During L0 compaction, we require as much memory as the physical size of what we compacted, and then some, + # because the key->value mapping in L0s compaction is exhaustive, non-streaming, and does not de-duplicate + # repeated 
references to the same key. + # + # To be fixed in https://github.com/neondatabase/neon/issues/8184, after which + # this memory estimate can be revised far downwards to something that doesn't scale + # linearly with the layer sizes. + MEMORY_ESTIMATE = (initial_l0s_size - final_l0s_size) * 1.5 + + # If we find that compaction is using more memory, this may indicate a regression + assert compaction_mapped_rss < MEMORY_ESTIMATE + + # If we find that compaction is using <0.5 the expected memory then: + # - maybe we made a big efficiency improvement, in which case update the test + # - maybe something is functionally wrong with the test and it's not driving the system as expected + assert compaction_mapped_rss > MEMORY_ESTIMATE / 2 + + # We should have compacted some but not all of the l0s, based on the limit on how much + # l0 to compact in one go + assert len(layers.delta_l0_layers()) > 0 + assert len(layers.delta_l0_layers()) < initial_l0s + + # The pageserver should have logged when it hit the compaction size limit + env.pageserver.assert_log_contains(".*hit max delta layer size limit.*") diff --git a/test_runner/performance/test_gc_feedback.py b/test_runner/performance/test_gc_feedback.py index 9a03994b29..9861259c16 100644 --- a/test_runner/performance/test_gc_feedback.py +++ b/test_runner/performance/test_gc_feedback.py @@ -6,21 +6,8 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder -@pytest.mark.timeout(10000) -def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker): - """ - Test that GC is able to collect all old layers even if them are forming - "stairs" and there are not three delta layers since last image layer. - - Information about image layers needed to collect old layers should - be propagated by GC to compaction task which should take in in account - when make a decision which new image layers needs to be created. - - NB: this test demonstrates the problem. 
The source tree contained the - `gc_feedback` mechanism for about 9 months, but, there were problems - with it and it wasn't enabled at runtime. - This PR removed the code: https://github.com/neondatabase/neon/pull/6863 - """ +def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, mode: str): + assert mode == "normal" or mode == "with_snapshots" env = neon_env_builder.init_start() client = env.pageserver.http_client() @@ -33,7 +20,7 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma "checkpoint_distance": f"{1024 ** 2}", "compaction_target_size": f"{1024 ** 2}", # set PITR interval to be small, so we can do GC - "pitr_interval": "10 s", + "pitr_interval": "60 s", # "compaction_threshold": "3", # "image_creation_threshold": "2", } @@ -74,6 +61,9 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma physical_size = client.timeline_detail(tenant_id, timeline_id)["current_physical_size"] log.info(f"Physical storage size {physical_size}") + if mode == "with_snapshots": + if step == n_steps / 2: + env.neon_cli.create_branch("child") max_num_of_deltas_above_image = 0 max_total_num_of_deltas = 0 @@ -99,7 +89,87 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma MetricReport.LOWER_IS_BETTER, ) + client.timeline_compact(tenant_id, timeline_id, enhanced_gc_bottom_most_compaction=True) + tline_detail = client.timeline_detail(tenant_id, timeline_id) + logical_size = tline_detail["current_logical_size"] + physical_size = tline_detail["current_physical_size"] + + max_num_of_deltas_above_image = 0 + max_total_num_of_deltas = 0 + for key_range in client.perf_info(tenant_id, timeline_id): + max_total_num_of_deltas = max(max_total_num_of_deltas, key_range["total_num_of_deltas"]) + max_num_of_deltas_above_image = max( + max_num_of_deltas_above_image, key_range["num_of_deltas_above_image"] + ) + zenbenchmark.record( + 
"logical_size_after_bottom_most_compaction", + logical_size // MB, + "Mb", + MetricReport.LOWER_IS_BETTER, + ) + zenbenchmark.record( + "physical_size_after_bottom_most_compaction", + physical_size // MB, + "Mb", + MetricReport.LOWER_IS_BETTER, + ) + zenbenchmark.record( + "physical/logical ratio after bottom_most_compaction", + physical_size / logical_size, + "", + MetricReport.LOWER_IS_BETTER, + ) + zenbenchmark.record( + "max_total_num_of_deltas_after_bottom_most_compaction", + max_total_num_of_deltas, + "", + MetricReport.LOWER_IS_BETTER, + ) + zenbenchmark.record( + "max_num_of_deltas_above_image_after_bottom_most_compaction", + max_num_of_deltas_above_image, + "", + MetricReport.LOWER_IS_BETTER, + ) + + with endpoint.cursor() as cur: + cur.execute("SELECT * FROM t") # ensure data is not corrupted + layer_map_path = env.repo_dir / "layer-map.json" log.info(f"Writing layer map to {layer_map_path}") with layer_map_path.open("w") as f: f.write(json.dumps(client.timeline_layer_map_info(tenant_id, timeline_id))) + + +@pytest.mark.timeout(10000) +def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker): + """ + Test that GC is able to collect all old layers even if them are forming + "stairs" and there are not three delta layers since last image layer. + + Information about image layers needed to collect old layers should + be propagated by GC to compaction task which should take in in account + when make a decision which new image layers needs to be created. + + NB: this test demonstrates the problem. The source tree contained the + `gc_feedback` mechanism for about 9 months, but, there were problems + with it and it wasn't enabled at runtime. + This PR removed the code: https://github.com/neondatabase/neon/pull/6863 + + And the bottom-most GC-compaction epic resolves the problem. 
+ https://github.com/neondatabase/neon/issues/8002 + """ + gc_feedback_impl(neon_env_builder, zenbenchmark, "normal") + + +@pytest.mark.timeout(10000) +def test_gc_feedback_with_snapshots( + neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker +): + """ + Compared with `test_gc_feedback`, we create a branch without written data (=snapshot) in the middle + of the benchmark, and the bottom-most compaction should collect as much garbage as possible below the GC + horizon. Ideally, there should be images (in an image layer) covering the full range at the branch point, + and images covering the full key range (in a delta layer) at the GC horizon. + """ + gc_feedback_impl(neon_env_builder, zenbenchmark, "with_snapshots") diff --git a/test_runner/performance/test_hot_page.py b/test_runner/performance/test_hot_page.py index d9785dd87e..5e97c7cddf 100644 --- a/test_runner/performance/test_hot_page.py +++ b/test_runner/performance/test_hot_page.py @@ -16,20 +16,34 @@ from pytest_lazyfixture import lazy_fixture ) def test_hot_page(env: PgCompare): # Update the same page many times, then measure read performance - num_writes = 1000000 with closing(env.pg.connect()) as conn: with conn.cursor() as cur: cur.execute("drop table if exists t, f;") + num_writes = 1000000 - # Write many updates to the same row + # Use a PL/pgSQL block to perform many updates to the same row + # without depending on the latency between database client and postgres + # server + # - however a single staement should not run into a timeout so we increase it + cur.execute("SET statement_timeout = '4h';") with env.record_duration("write"): - cur.execute("create table t (i integer);") - cur.execute("insert into t values (0);") - for i in range(num_writes): - cur.execute(f"update t set i = {i};") + cur.execute( + f""" + DO $$ + BEGIN + create table t (i integer); + insert into t values (0); - # Write 3-4 MB to evict t from compute cache + FOR j IN 1..{num_writes} LOOP + update t set i = j; + END 
LOOP; + END $$; + """ + ) + + # Write ca 350 MB to evict t from compute shared buffers (128 MB) + # however it will still be in LFC, so I do not really understand the point of this test cur.execute("create table f (i integer);") cur.execute("insert into f values (generate_series(1,100000));") diff --git a/test_runner/performance/test_hot_table.py b/test_runner/performance/test_hot_table.py index 5fcffc8afb..9a78c92ec0 100644 --- a/test_runner/performance/test_hot_table.py +++ b/test_runner/performance/test_hot_table.py @@ -16,8 +16,8 @@ from pytest_lazyfixture import lazy_fixture ) def test_hot_table(env: PgCompare): # Update a small table many times, then measure read performance - num_rows = 100000 # Slightly larger than shared buffers size TODO validate - num_writes = 1000000 + num_rows = 100000 # initial table size only about 4 MB + num_writes = 10000000 # write approximately 349 MB blocks > 128 MB shared_buffers num_reads = 10 with closing(env.pg.connect()) as conn: @@ -28,8 +28,21 @@ def test_hot_table(env: PgCompare): with env.record_duration("write"): cur.execute("create table t (i integer primary key);") cur.execute(f"insert into t values (generate_series(1,{num_rows}));") - for i in range(num_writes): - cur.execute(f"update t set i = {i + num_rows} WHERE i = {i};") + # PL/pgSQL block to perform updates (and avoid latency between client and server) + # - however a single staement should not run into a timeout so we increase it + cur.execute("SET statement_timeout = '4h';") + cur.execute( + f""" + DO $$ + DECLARE + r integer := {num_rows}; + BEGIN + FOR j IN 1..{num_writes} LOOP + UPDATE t SET i = j + r WHERE i = j; + END LOOP; + END $$; + """ + ) # Read the table with env.record_duration("read"): diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py index 9b20954d45..bc6d9de346 100644 --- a/test_runner/performance/test_layer_map.py +++ b/test_runner/performance/test_layer_map.py @@ -1,20 +1,21 @@ import time 
-from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver -# -# Benchmark searching the layer map, when there are a lot of small layer files. -# def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): - env = neon_env_builder.init_start() + """Benchmark searching the layer map, when there are a lot of small layer files.""" + + env = neon_env_builder.init_configs() n_iters = 10 n_records = 100000 + env.start() + # We want to have a lot of lot of layer files to exercise the layer map. Disable # GC, and make checkpoint_distance very small, so that we get a lot of small layer # files. - tenant, _ = env.neon_cli.create_tenant( + tenant, timeline = env.neon_cli.create_tenant( conf={ "gc_period": "0s", "checkpoint_distance": "16384", @@ -24,8 +25,7 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): } ) - env.neon_cli.create_timeline("test_layer_map", tenant_id=tenant) - endpoint = env.endpoints.create_start("test_layer_map", tenant_id=tenant) + endpoint = env.endpoints.create_start("main", tenant_id=tenant) cur = endpoint.connect().cursor() cur.execute("create table t(x integer)") for _ in range(n_iters): @@ -33,6 +33,12 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): time.sleep(1) cur.execute("vacuum t") + with zenbenchmark.record_duration("test_query"): cur.execute("SELECT count(*) from t") assert cur.fetchone() == (n_iters * n_records,) + + flush_ep_to_pageserver(env, endpoint, tenant, timeline) + env.pageserver.http_client().timeline_checkpoint( + tenant, timeline, compact=False, wait_until_uploaded=True + ) diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index b799f7248f..077f73ac06 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -1,10 +1,23 @@ +from __future__ import annotations + import time +from 
typing import TYPE_CHECKING +import psycopg2 +import psycopg2.extras import pytest +from fixtures.benchmark_fixture import MetricReport +from fixtures.common_types import Lsn from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, PgBin, logical_replication_sync +from fixtures.neon_fixtures import AuxFileStore, logical_replication_sync + +if TYPE_CHECKING: + from fixtures.benchmark_fixture import NeonBenchmarker + from fixtures.neon_api import NeonApiEndpoint + from fixtures.neon_fixtures import NeonEnv, PgBin +@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.V2]) @pytest.mark.timeout(1000) def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg): env = neon_simple_env @@ -25,7 +38,6 @@ def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg vanilla_pg.safe_psql("truncate table pgbench_history") connstr = endpoint.connstr().replace("'", "''") - print(f"connstr='{connstr}'") vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") # Wait logical replication channel to be established @@ -41,3 +53,295 @@ def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg sum_master = endpoint.safe_psql("select sum(abalance) from pgbench_accounts")[0][0] sum_replica = vanilla_pg.safe_psql("select sum(abalance) from pgbench_accounts")[0][0] assert sum_master == sum_replica + + +def check_pgbench_still_running(pgbench, label=""): + rc = pgbench.poll() + if rc is not None: + raise RuntimeError(f"{label} pgbench terminated early with return code {rc}") + + +def measure_logical_replication_lag(sub_cur, pub_cur, timeout_sec=600): + start = time.time() + pub_cur.execute("SELECT pg_current_wal_flush_lsn()") + pub_lsn = Lsn(pub_cur.fetchall()[0][0]) + while (time.time() - start) < timeout_sec: + sub_cur.execute("SELECT latest_end_lsn FROM pg_catalog.pg_stat_subscription") + res = sub_cur.fetchall()[0][0] + if res: + 
log.info(f"subscriber_lsn={res}") + sub_lsn = Lsn(res) + log.info(f"Subscriber LSN={sub_lsn}, publisher LSN={pub_lsn}") + if sub_lsn >= pub_lsn: + return time.time() - start + time.sleep(0.5) + raise TimeoutError(f"Logical replication sync took more than {timeout_sec} sec") + + +@pytest.mark.remote_cluster +@pytest.mark.timeout(2 * 60 * 60) +def test_subscriber_lag( + pg_bin: PgBin, + benchmark_project_pub: NeonApiEndpoint, + benchmark_project_sub: NeonApiEndpoint, + zenbenchmark: NeonBenchmarker, +): + """ + Creates a publisher and subscriber, runs pgbench inserts on publisher and pgbench selects + on subscriber. Periodically restarts subscriber while still running the inserts, and + measures how long sync takes after restart. + """ + test_duration_min = 60 + sync_interval_min = 5 + pgbench_duration = f"-T{test_duration_min * 60 * 2}" + + pub_env = benchmark_project_pub.pgbench_env + sub_env = benchmark_project_sub.pgbench_env + pub_connstr = benchmark_project_pub.connstr + sub_connstr = benchmark_project_sub.connstr + + if benchmark_project_pub.is_new: + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) + if benchmark_project_sub.is_new: + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) + + pub_conn = psycopg2.connect(pub_connstr) + sub_conn = psycopg2.connect(sub_connstr) + pub_conn.autocommit = True + sub_conn.autocommit = True + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + pub_cur.execute("SELECT 1 FROM pg_catalog.pg_publication WHERE pubname = 'pub1'") + pub_exists = len(pub_cur.fetchall()) != 0 + + if not pub_exists: + pub_cur.execute("CREATE PUBLICATION pub1 FOR TABLE pgbench_accounts, pgbench_history") + + sub_cur.execute("SELECT 1 FROM pg_catalog.pg_subscription WHERE subname = 'sub1'") + sub_exists = len(sub_cur.fetchall()) != 0 + if not sub_exists: + sub_cur.execute("truncate table pgbench_accounts") + sub_cur.execute("truncate table pgbench_history") + + sub_cur.execute(f"CREATE SUBSCRIPTION sub1 CONNECTION 
'{pub_connstr}' PUBLICATION pub1") + + initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur) + + pub_conn.close() + sub_conn.close() + + zenbenchmark.record("initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER) + + pub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env + ) + try: + sub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-S"], + env=sub_env, + ) + try: + start = time.time() + while time.time() - start < test_duration_min * 60: + time.sleep(sync_interval_min * 60) + check_pgbench_still_running(pub_workload, "pub") + check_pgbench_still_running(sub_workload, "sub") + + with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect( + sub_connstr + ) as sub_conn: + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + lag = measure_logical_replication_lag(sub_cur, pub_cur) + + log.info(f"Replica lagged behind master by {lag} seconds") + zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) + sub_workload.terminate() + benchmark_project_sub.restart() + + sub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-S"], + env=sub_env, + ) + + # Measure storage to make sure replication information isn't bloating storage + sub_storage = benchmark_project_sub.get_synthetic_storage_size() + pub_storage = benchmark_project_pub.get_synthetic_storage_size() + zenbenchmark.record("sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER) + zenbenchmark.record("pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER) + finally: + sub_workload.terminate() + finally: + pub_workload.terminate() + + +@pytest.mark.remote_cluster +@pytest.mark.timeout(2 * 60 * 60) +def test_publisher_restart( + pg_bin: PgBin, + benchmark_project_pub: NeonApiEndpoint, + benchmark_project_sub: NeonApiEndpoint, + zenbenchmark: NeonBenchmarker, +): + """ + Creates a publisher and subscriber, runs pgbench inserts on 
publisher and pgbench selects + on subscriber. Periodically restarts publisher (to exercise on-demand WAL download), and + measures how long sync takes after restart. + """ + test_duration_min = 60 + sync_interval_min = 5 + pgbench_duration = f"-T{test_duration_min * 60 * 2}" + + pub_env = benchmark_project_pub.pgbench_env + sub_env = benchmark_project_sub.pgbench_env + pub_connstr = benchmark_project_pub.connstr + sub_connstr = benchmark_project_sub.connstr + + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) + + pub_conn = psycopg2.connect(pub_connstr) + sub_conn = psycopg2.connect(sub_connstr) + pub_conn.autocommit = True + sub_conn.autocommit = True + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + pub_cur.execute("SELECT 1 FROM pg_catalog.pg_publication WHERE pubname = 'pub1'") + pub_exists = len(pub_cur.fetchall()) != 0 + + if not pub_exists: + pub_cur.execute("create publication pub1 for table pgbench_accounts, pgbench_history") + + sub_cur.execute("SELECT 1 FROM pg_catalog.pg_subscription WHERE subname = 'sub1'") + sub_exists = len(sub_cur.fetchall()) != 0 + if not sub_exists: + sub_cur.execute("truncate table pgbench_accounts") + sub_cur.execute("truncate table pgbench_history") + + sub_cur.execute(f"create subscription sub1 connection '{pub_connstr}' publication pub1") + + initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur) + pub_conn.close() + sub_conn.close() + + zenbenchmark.record("initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER) + + pub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env + ) + try: + sub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-S"], + env=sub_env, + ) + try: + start = time.time() + while time.time() - start < test_duration_min * 60: + time.sleep(sync_interval_min * 60) + check_pgbench_still_running(pub_workload, "pub") 
+ check_pgbench_still_running(sub_workload, "sub") + + pub_workload.terminate() + benchmark_project_pub.restart() + pub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], + env=pub_env, + ) + with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect( + sub_connstr + ) as sub_conn: + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + lag = measure_logical_replication_lag(sub_cur, pub_cur) + + log.info(f"Replica lagged behind master by {lag} seconds") + zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) + + # Measure storage to make sure replication information isn't bloating storage + sub_storage = benchmark_project_sub.get_synthetic_storage_size() + pub_storage = benchmark_project_pub.get_synthetic_storage_size() + zenbenchmark.record("sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER) + zenbenchmark.record("pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER) + finally: + sub_workload.terminate() + finally: + pub_workload.terminate() + + +@pytest.mark.remote_cluster +@pytest.mark.timeout(2 * 60 * 60) +def test_snap_files( + pg_bin: PgBin, + benchmark_project_pub: NeonApiEndpoint, + zenbenchmark: NeonBenchmarker, +): + """ + Creates a node with a replication slot. Generates pgbench into the replication slot, + then runs pgbench inserts while generating large numbers of snapfiles. Then restarts + the node and tries to peek the replication changes. 
+ """ + test_duration_min = 60 + test_interval_min = 5 + pgbench_duration = f"-T{test_duration_min * 60 * 2}" + + env = benchmark_project_pub.pgbench_env + connstr = benchmark_project_pub.connstr + + with psycopg2.connect(connstr) as conn: + conn.autocommit = True + with conn.cursor() as cur: + cur.execute("SELECT rolsuper FROM pg_roles WHERE rolname = 'neondb_owner'") + is_super = cur.fetchall()[0][0] + assert is_super, "This benchmark won't work if we don't have superuser" + + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=env) + + conn = psycopg2.connect(connstr) + conn.autocommit = True + cur = conn.cursor() + cur.execute("ALTER SYSTEM SET neon.logical_replication_max_snap_files = -1") + + with psycopg2.connect(connstr) as conn: + conn.autocommit = True + with conn.cursor() as cur: + cur.execute("SELECT pg_reload_conf()") + + with psycopg2.connect(connstr) as conn: + conn.autocommit = True + with conn.cursor() as cur: + cur.execute( + """ + DO $$ + BEGIN + IF EXISTS ( + SELECT 1 + FROM pg_replication_slots + WHERE slot_name = 'slotter' + ) THEN + PERFORM pg_drop_replication_slot('slotter'); + END IF; + END $$; + """ + ) + cur.execute("SELECT pg_create_logical_replication_slot('slotter', 'test_decoding')") + + workload = pg_bin.run_nonblocking(["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=env) + try: + start = time.time() + prev_measurement = time.time() + while time.time() - start < test_duration_min * 60: + with psycopg2.connect(connstr) as conn: + with conn.cursor() as cur: + cur.execute( + "SELECT count(*) FROM (SELECT pg_log_standby_snapshot() FROM generate_series(1, 10000) g) s" + ) + check_pgbench_still_running(workload) + cur.execute( + "SELECT pg_replication_slot_advance('slotter', pg_current_wal_lsn())" + ) + + # Measure storage + if time.time() - prev_measurement > test_interval_min * 60: + storage = benchmark_project_pub.get_synthetic_storage_size() + zenbenchmark.record("storage", storage, "B", MetricReport.LOWER_IS_BETTER) + 
prev_measurement = time.time() + time.sleep(test_interval_min * 60 / 3) + + finally: + workload.terminate() diff --git a/test_runner/performance/test_physical_replication.py b/test_runner/performance/test_physical_replication.py new file mode 100644 index 0000000000..7e16197211 --- /dev/null +++ b/test_runner/performance/test_physical_replication.py @@ -0,0 +1,296 @@ +from __future__ import annotations + +import csv +import os +import subprocess +import time +import traceback +from pathlib import Path +from typing import TYPE_CHECKING + +import psycopg2 +import psycopg2.extras +import pytest +from fixtures.benchmark_fixture import MetricReport +from fixtures.common_types import Lsn +from fixtures.log_helper import log +from fixtures.neon_api import connection_parameters_to_env +from fixtures.pg_version import PgVersion + +if TYPE_CHECKING: + from typing import Any, List, Optional + + from fixtures.benchmark_fixture import NeonBenchmarker + from fixtures.neon_api import NeonAPI + from fixtures.neon_fixtures import PgBin + + +# Granularity of ~0.5 sec +def measure_replication_lag(master, replica, timeout_sec=600): + start = time.time() + master.execute("SELECT pg_current_wal_flush_lsn()") + master_lsn = Lsn(master.fetchall()[0][0]) + while (time.time() - start) < timeout_sec: + replica.execute("select pg_last_wal_replay_lsn()") + replica_lsn = replica.fetchall()[0][0] + if replica_lsn: + if Lsn(replica_lsn) >= master_lsn: + return time.time() - start + time.sleep(0.5) + raise TimeoutError(f"Replication sync took more than {timeout_sec} sec") + + +def check_pgbench_still_running(pgbench): + rc = pgbench.poll() + if rc is not None: + raise RuntimeError(f"Pgbench terminated early with return code {rc}") + + +@pytest.mark.remote_cluster +@pytest.mark.timeout(2 * 60 * 60) +def test_ro_replica_lag( + pg_bin: PgBin, + neon_api: NeonAPI, + pg_version: PgVersion, + zenbenchmark: NeonBenchmarker, +): + test_duration_min = 60 + sync_interval_min = 10 + + pgbench_duration = 
f"-T{test_duration_min * 60 * 2}" + + project = neon_api.create_project(pg_version) + project_id = project["project"]["id"] + neon_api.wait_for_operation_to_finish(project_id) + error_occurred = False + try: + branch_id = project["branch"]["id"] + master_connstr = project["connection_uris"][0]["connection_uri"] + master_env = connection_parameters_to_env( + project["connection_uris"][0]["connection_parameters"] + ) + + replica = neon_api.create_endpoint( + project_id, + branch_id, + endpoint_type="read_only", + settings={"pg_settings": {"hot_standby_feedback": "on"}}, + ) + replica_env = master_env.copy() + replica_env["PGHOST"] = replica["endpoint"]["host"] + neon_api.wait_for_operation_to_finish(project_id) + + replica_connstr = neon_api.get_connection_uri( + project_id, + endpoint_id=replica["endpoint"]["id"], + )["uri"] + + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=master_env) + + master_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], + env=master_env, + ) + try: + replica_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-S"], + env=replica_env, + ) + try: + start = time.time() + while time.time() - start < test_duration_min * 60: + check_pgbench_still_running(master_workload) + check_pgbench_still_running(replica_workload) + time.sleep(sync_interval_min * 60) + with psycopg2.connect(master_connstr) as conn_master, psycopg2.connect( + replica_connstr + ) as conn_replica: + with conn_master.cursor() as cur_master, conn_replica.cursor() as cur_replica: + lag = measure_replication_lag(cur_master, cur_replica) + log.info(f"Replica lagged behind master by {lag} seconds") + zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) + finally: + replica_workload.terminate() + finally: + master_workload.terminate() + except Exception as e: + error_occurred = True + log.error(f"Caught exception: {e}") + log.error(traceback.format_exc()) + finally: + assert not error_occurred # 
Fail the test if an error occurred + neon_api.delete_project(project_id) + + +def report_pgbench_aggregate_intervals( + output_dir: Path, + prefix: str, + zenbenchmark: NeonBenchmarker, +): + for filename in os.listdir(output_dir): + if filename.startswith(prefix): + # The file will be in the form _. + # So we first lop off the ., and then lop off the prefix and the _ + node = filename.split(".")[0][len(prefix) + 1 :] + with open(output_dir / filename) as f: + reader = csv.reader(f, delimiter=" ") + for line in reader: + num_transactions = int(line[1]) + if num_transactions == 0: + continue + sum_latency = int(line[2]) + sum_lag = int(line[3]) + zenbenchmark.record( + f"{node}_num_txns", num_transactions, "txns", MetricReport.HIGHER_IS_BETTER + ) + zenbenchmark.record( + f"{node}_avg_latency", + sum_latency / num_transactions, + "s", + MetricReport.LOWER_IS_BETTER, + ) + zenbenchmark.record( + f"{node}_avg_lag", + sum_lag / num_transactions, + "s", + MetricReport.LOWER_IS_BETTER, + ) + + +@pytest.mark.remote_cluster +@pytest.mark.timeout(2 * 60 * 60) +def test_replication_start_stop( + pg_bin: PgBin, + test_output_dir: Path, + neon_api: NeonAPI, + pg_version: PgVersion, + zenbenchmark: NeonBenchmarker, +): + """ + Cycles through different configurations of read replicas being enabled disabled. The whole time, + there's a pgbench read/write workload going on the master. For each replica, we either turn it + on or off, and see how long it takes to catch up after some set amount of time of replicating + the pgbench. 
+ """ + + prefix = "pgbench_agg" + num_replicas = 2 + configuration_test_time_sec = 10 * 60 + pgbench_duration = f"-T{2 ** num_replicas * configuration_test_time_sec}" + error_occurred = False + + project = neon_api.create_project(pg_version) + project_id = project["project"]["id"] + neon_api.wait_for_operation_to_finish(project_id) + try: + branch_id = project["branch"]["id"] + master_connstr = project["connection_uris"][0]["connection_uri"] + master_env = connection_parameters_to_env( + project["connection_uris"][0]["connection_parameters"] + ) + + replicas = [] + for _ in range(num_replicas): + replicas.append( + neon_api.create_endpoint( + project_id, + branch_id, + endpoint_type="read_only", + settings={"pg_settings": {"hot_standby_feedback": "on"}}, + ) + ) + neon_api.wait_for_operation_to_finish(project_id) + + replica_connstr = [ + neon_api.get_connection_uri( + project_id, + endpoint_id=replicas[i]["endpoint"]["id"], + )["uri"] + for i in range(num_replicas) + ] + replica_env = [master_env.copy() for _ in range(num_replicas)] + for i in range(num_replicas): + replica_env[i]["PGHOST"] = replicas[i]["endpoint"]["host"] + + pg_bin.run_capture(["pgbench", "-i", "-s10"], env=master_env) + + # Sync replicas + with psycopg2.connect(master_connstr) as conn_master: + with conn_master.cursor() as cur_master: + for i in range(num_replicas): + conn_replica = psycopg2.connect(replica_connstr[i]) + measure_replication_lag(cur_master, conn_replica.cursor()) + + master_pgbench = pg_bin.run_nonblocking( + [ + "pgbench", + "-c10", + pgbench_duration, + "-Mprepared", + "--log", + f"--log-prefix={test_output_dir}/{prefix}_master", + f"--aggregate-interval={configuration_test_time_sec}", + ], + env=master_env, + ) + replica_pgbench: List[Optional[subprocess.Popen[Any]]] = [None for _ in range(num_replicas)] + + # Use the bits of iconfig to tell us which configuration we are on. 
For example + # a iconfig of 2 is 10 in binary, indicating replica 0 is suspended and replica 1 is + # alive. + for iconfig in range((1 << num_replicas) - 1, -1, -1): + + def replica_enabled(iconfig: int = iconfig): + return bool((iconfig >> 1) & 1) + + # Change configuration + for ireplica in range(num_replicas): + if replica_enabled() and replica_pgbench[ireplica] is None: + replica_pgbench[ireplica] = pg_bin.run_nonblocking( + [ + "pgbench", + "-c10", + "-S", + pgbench_duration, + "--log", + f"--log-prefix={test_output_dir}/{prefix}_replica_{ireplica}", + f"--aggregate-interval={configuration_test_time_sec}", + ], + env=replica_env[ireplica], + ) + elif not replica_enabled() and replica_pgbench[ireplica] is not None: + pgb = replica_pgbench[ireplica] + assert pgb is not None + pgb.terminate() + pgb.wait() + replica_pgbench[ireplica] = None + + neon_api.suspend_endpoint( + project_id, + replicas[ireplica]["endpoint"]["id"], + ) + neon_api.wait_for_operation_to_finish(project_id) + + time.sleep(configuration_test_time_sec) + + with psycopg2.connect(master_connstr) as conn_master: + with conn_master.cursor() as cur_master: + for ireplica in range(num_replicas): + replica_conn = psycopg2.connect(replica_connstr[ireplica]) + lag = measure_replication_lag(cur_master, replica_conn.cursor()) + zenbenchmark.record( + f"Replica {ireplica} lag", lag, "s", MetricReport.LOWER_IS_BETTER + ) + log.info( + f"Replica {ireplica} lagging behind master by {lag} seconds after configuration {iconfig:>b}" + ) + master_pgbench.terminate() + except Exception as e: + error_occurred = True + log.error(f"Caught exception {e}") + log.error(traceback.format_exc()) + finally: + assert not error_occurred + neon_api.delete_project(project_id) + # Only report results if we didn't error out + report_pgbench_aggregate_intervals(test_output_dir, prefix, zenbenchmark) diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py 
index cb013ae8c3..297aedfbed 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -1,18 +1,56 @@ import concurrent.futures import random import time +from collections import defaultdict import pytest from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log from fixtures.neon_fixtures import ( + NeonEnv, NeonEnvBuilder, + PageserverAvailability, + PageserverSchedulingPolicy, ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pg_version import PgVersion +def get_consistent_node_shard_counts(env: NeonEnv, total_shards) -> defaultdict[str, int]: + """ + Get the number of shards attached to each node. + This function takes into account the intersection of the intent and the observed state. + If they do not match, it asserts out. + """ + tenant_placement = env.storage_controller.get_tenants_placement() + log.info(f"{tenant_placement=}") + + matching = { + tid: tenant_placement[tid]["intent"]["attached"] + for tid in tenant_placement + if tenant_placement[tid]["intent"]["attached"] + == tenant_placement[tid]["observed"]["attached"] + } + assert len(matching) == total_shards + + attached_per_node: defaultdict[str, int] = defaultdict(int) + for node_id in matching.values(): + attached_per_node[node_id] += 1 + + return attached_per_node + + +def assert_consistent_balanced_attachments(env: NeonEnv, total_shards): + attached_per_node = get_consistent_node_shard_counts(env, total_shards) + + min_shard_count = min(attached_per_node.values()) + max_shard_count = max(attached_per_node.values()) + + flake_factor = 5 / 100 + assert max_shard_count - min_shard_count <= int(total_shards * flake_factor) + + @pytest.mark.timeout(3600) # super long running test: should go down as we optimize def test_storage_controller_many_tenants( neon_env_builder: NeonEnvBuilder, 
compute_reconfigure_listener: ComputeReconfigure @@ -35,7 +73,8 @@ def test_storage_controller_many_tenants( # Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts. # TODO: tune this down as restarts get faster (https://github.com/neondatabase/neon/pull/7553), to # guard against regressions in restart time. - "max_unavailable": "300s" + "max_offline": "30s", + "max_warming_up": "300s", } neon_env_builder.control_plane_compute_hook_api = ( compute_reconfigure_listener.control_plane_compute_hook_api @@ -44,21 +83,23 @@ def test_storage_controller_many_tenants( # A small sleep on each call into the notify hook, to simulate the latency of doing a database write compute_reconfigure_listener.register_on_notify(lambda body: time.sleep(0.01)) - env = neon_env_builder.init_start() + env = neon_env_builder.init_configs() + neon_env_builder.start() # We will intentionally stress reconciler concurrrency, which triggers a warning when lots # of shards are hitting the delayed path. - env.storage_controller.allowed_errors.append(".*Many shards are waiting to reconcile") + env.storage_controller.allowed_errors.extend( + [ + # We will intentionally stress reconciler concurrrency, which triggers a warning when lots + # of shards are hitting the delayed path. + ".*Many shards are waiting to reconcile", + # We will create many timelines concurrently, so they might get slow enough to trip the warning + # that timeline creation is holding a lock too long. + ".*Shared lock by TimelineCreate.*was held.*", + ] + ) for ps in env.pageservers: - # This can happen because when we do a loop over all pageservers and mark them offline/active, - # reconcilers might get cancelled, and the next reconcile can follow a not-so-elegant path of - # bumping generation before other attachments are detached. - # - # We could clean this up by making reconcilers respect the .observed of their predecessor, if - # we spawn with a wait for the predecessor. 
- ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - # Storage controller is allowed to drop pageserver requests when the cancellation token # for a Reconciler fires. ps.allowed_errors.append(".*request was dropped before completing.*") @@ -70,6 +111,8 @@ def test_storage_controller_many_tenants( shard_count = 2 stripe_size = 1024 + total_shards = tenant_count * shard_count + tenants = set(TenantId.generate() for _i in range(0, tenant_count)) virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) @@ -136,7 +179,11 @@ def test_storage_controller_many_tenants( # A reconciler operation: migrate a shard. shard_number = rng.randint(0, shard_count - 1) tenant_shard_id = TenantShardId(tenant_id, shard_number, shard_count) - dest_ps_id = rng.choice([ps.id for ps in env.pageservers]) + + # Migrate it to its secondary location + desc = env.storage_controller.tenant_describe(tenant_id) + dest_ps_id = desc["shards"][shard_number]["node_secondary"][0] + f = executor.submit( env.storage_controller.tenant_shard_migrate, tenant_shard_id, dest_ps_id ) @@ -150,7 +197,11 @@ def test_storage_controller_many_tenants( for f in futs: f.result() - # Consistency check is safe here: all the previous operations waited for reconcile before completing + # Some of the operations above (notably migrations) might leave the controller in a state where it has + # some work to do, for example optimizing shard placement after we do a random migration. Wait for the system + # to reach a quiescent state before doing following checks. 
+ env.storage_controller.reconcile_until_idle() + env.storage_controller.consistency_check() check_memory() @@ -186,10 +237,60 @@ def test_storage_controller_many_tenants( env.storage_controller.consistency_check() check_memory() - # Restart pageservers: this exercises the /re-attach API - for pageserver in env.pageservers: - pageserver.stop() - pageserver.start() + shard_counts = get_consistent_node_shard_counts(env, total_shards) + log.info(f"Shard counts before rolling restart: {shard_counts}") + + assert_consistent_balanced_attachments(env, total_shards) + + # Restart pageservers gracefully: this exercises the /re-attach pageserver API + # and the storage controller drain and fill API + for ps in env.pageservers: + env.storage_controller.retryable_node_operation( + lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2 + ) + + env.storage_controller.poll_node_status( + ps.id, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.PAUSE_FOR_RESTART, + max_attempts=24, + backoff=5, + ) + + shard_counts = get_consistent_node_shard_counts(env, total_shards) + log.info(f"Shard counts after draining node {ps.id}: {shard_counts}") + # Assert that we've drained the node + assert shard_counts[str(ps.id)] == 0 + # Assert that those shards actually went somewhere + assert sum(shard_counts.values()) == total_shards + + ps.restart() + env.storage_controller.poll_node_status( + ps.id, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.ACTIVE, + max_attempts=24, + backoff=1, + ) + + env.storage_controller.retryable_node_operation( + lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2 + ) + env.storage_controller.poll_node_status( + ps.id, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.ACTIVE, + max_attempts=24, + backoff=5, + ) + + shard_counts = get_consistent_node_shard_counts(env, total_shards) + log.info(f"Shard counts after filling node {ps.id}: {shard_counts}") + + 
assert_consistent_balanced_attachments(env, total_shards) + + env.storage_controller.reconcile_until_idle() + env.storage_controller.consistency_check() # Consistency check is safe here: restarting pageservers should not have caused any Reconcilers to spawn, # as they were not offline long enough to trigger any scheduling changes. diff --git a/test_runner/pg_clients/java/jdbc/Dockerfile b/test_runner/pg_clients/java/jdbc/Dockerfile index 7e074e07b8..7c2b1b40e0 100644 --- a/test_runner/pg_clients/java/jdbc/Dockerfile +++ b/test_runner/pg_clients/java/jdbc/Dockerfile @@ -1,4 +1,4 @@ -FROM openjdk:21 +FROM openjdk:22 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/python/pg8000/requirements.txt b/test_runner/pg_clients/python/pg8000/requirements.txt index e086a937e6..099a4ade2c 100644 --- a/test_runner/pg_clients/python/pg8000/requirements.txt +++ b/test_runner/pg_clients/python/pg8000/requirements.txt @@ -1,2 +1,2 @@ -pg8000==1.30.5 +pg8000==1.31.2 scramp>=1.4.3 diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock index a4a2426b97..354fc15745 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock @@ -4,9 +4,9 @@ version = 3 [[package]] name = "addr2line" -version = "0.21.0" +version = "0.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" +checksum = "6e4503c46a5c0c7844e948c9a4d6acd9f50cccb4de1c48eb9e291ea17470c678" dependencies = [ "gimli", ] @@ -19,9 +19,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "async-trait" -version = "0.1.77" +version = "0.1.80" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c980ee35e870bd1a4d2c8294d4c04d0499e67bca1e4b5cefcc693c2fa00caea9" +checksum = 
"c6fa2087f2753a7da8cc1c0dbfcf89579dd57458e36769de5ac750b4671737ca" dependencies = [ "proc-macro2", "quote", @@ -30,15 +30,15 @@ dependencies = [ [[package]] name = "autocfg" -version = "1.1.0" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" [[package]] name = "backtrace" -version = "0.3.69" +version = "0.3.73" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" +checksum = "5cc23269a4f8976d0a4d2e7109211a419fe30e8d88d677cd60b6bc79c5732e0a" dependencies = [ "addr2line", "cc", @@ -63,9 +63,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.4.2" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf" +checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" [[package]] name = "block-buffer" @@ -78,9 +78,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.15.3" +version = "3.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ea184aa71bb362a1157c896979544cc23974e08fd265f29ea96b59f0b4a555b" +checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" [[package]] name = "byteorder" @@ -90,15 +90,15 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.5.0" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" +checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9" [[package]] name = "cc" -version = "1.0.89" +version = "1.0.101" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0ba8f7aaa012f30d5b2861462f6708eccd49c3c39863fe083a308035f63d723" +checksum = "ac367972e516d45567c7eafc73d24e1c193dcf200a8d94e9db7b3d38b349572d" [[package]] name = "cfg-if" @@ -154,9 +154,9 @@ dependencies = [ [[package]] name = "errno" -version = "0.3.8" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" +checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" dependencies = [ "libc", "windows-sys 0.52.0", @@ -170,15 +170,9 @@ checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" [[package]] name = "fastrand" -version = "2.0.1" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" - -[[package]] -name = "finl_unicode" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fcfdc7a0362c9f4444381a9e697c79d435fe65b52a37466fc2c1184cee9edc6" +checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" [[package]] name = "foreign-types" @@ -296,9 +290,9 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.12" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" dependencies = [ "cfg-if", "libc", @@ -307,9 +301,9 @@ dependencies = [ [[package]] name = "gimli" -version = "0.28.1" +version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" +checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd" [[package]] name = "hmac" @@ -329,29 +323,23 @@ 
dependencies = [ "wasm-bindgen", ] -[[package]] -name = "lazy_static" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" - [[package]] name = "libc" -version = "0.2.153" +version = "0.2.155" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" +checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" [[package]] name = "linux-raw-sys" -version = "0.4.13" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" [[package]] name = "lock_api" -version = "0.4.11" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" dependencies = [ "autocfg", "scopeguard", @@ -375,15 +363,15 @@ dependencies = [ [[package]] name = "memchr" -version = "2.7.1" +version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" [[package]] name = "miniz_oxide" -version = "0.7.2" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" +checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08" dependencies = [ "adler", ] @@ -401,11 +389,10 @@ dependencies = [ [[package]] name = "native-tls" -version = "0.2.11" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e" +checksum = "a8614eb2c83d59d1c8cc974dd3f920198647674a0a035e1af1fa58707e317466" dependencies = [ - "lazy_static", "libc", "log", "openssl", @@ -419,9 +406,9 @@ dependencies = [ [[package]] name = "object" -version = "0.32.2" +version = "0.36.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" +checksum = "576dfe1fc8f9df304abb159d767a29d0476f7750fbf8aa7ad07816004a207434" dependencies = [ "memchr", ] @@ -434,11 +421,11 @@ checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "openssl" -version = "0.10.64" +version = "0.10.66" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95a0481286a310808298130d22dd1fef0fa571e05a8f44ec801801e84b216b1f" +checksum = "9529f4786b70a3e8c61e11179af17ab6188ad8d0ded78c5529441ed39d4bd9c1" dependencies = [ - "bitflags 2.4.2", + "bitflags 2.6.0", "cfg-if", "foreign-types", "libc", @@ -466,9 +453,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.101" +version = "0.9.103" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dda2b0f344e78efc2facf7d195d098df0dd72151b26ab98da807afc26c198dff" +checksum = "7f9e8deee91df40a943c71b917e5874b951d32a802526c85721ce3b776c929d6" dependencies = [ "cc", "libc", @@ -478,9 +465,9 @@ dependencies = [ [[package]] name = "parking_lot" -version = "0.12.1" +version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" dependencies = [ "lock_api", "parking_lot_core", @@ -488,15 +475,15 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.9" +version = "0.9.10" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" dependencies = [ "cfg-if", "libc", - "redox_syscall", + "redox_syscall 0.5.2", "smallvec", - "windows-targets 0.48.5", + "windows-targets 0.52.5", ] [[package]] @@ -525,9 +512,9 @@ dependencies = [ [[package]] name = "pin-project-lite" -version = "0.2.13" +version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58" +checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02" [[package]] name = "pin-utils" @@ -591,18 +578,18 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "proc-macro2" -version = "1.0.78" +version = "1.0.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" +checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.35" +version = "1.0.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" +checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" dependencies = [ "proc-macro2", ] @@ -646,6 +633,15 @@ dependencies = [ "bitflags 1.3.2", ] +[[package]] +name = "redox_syscall" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c82cf8cff14456045f55ec4241383baeff27af886adb72ffb2162f99911de0fd" +dependencies = [ + "bitflags 2.6.0", +] + [[package]] name = "rust-neon-example" version = "0.1.0" @@ -658,17 +654,17 @@ dependencies = [ [[package]] name = "rustc-demangle" -version = "0.1.23" +version = "0.1.24" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" [[package]] name = "rustix" -version = "0.38.31" +version = "0.38.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" +checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" dependencies = [ - "bitflags 2.4.2", + "bitflags 2.6.0", "errno", "libc", "linux-raw-sys", @@ -692,11 +688,11 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "security-framework" -version = "2.9.2" +version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05b64fb303737d99b81884b2c63433e9ae28abebe5eb5045dcdd175dc2ecf4de" +checksum = "c627723fd09706bacdb5cf41499e95098555af3c3c29d014dc3c458ef6be11c0" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.6.0", "core-foundation", "core-foundation-sys", "libc", @@ -705,9 +701,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.9.1" +version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e932934257d3b408ed8f30db49d85ea163bfe74961f017f405b025af298f0c7a" +checksum = "317936bbbd05227752583946b9e66d7ce3b489f84e11a94a510b4437fef407d7" dependencies = [ "core-foundation-sys", "libc", @@ -741,15 +737,15 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.13.1" +version = "1.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" +checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" [[package]] name = "socket2" -version = "0.5.6" +version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"05ffd9c0a93b7543e062e759284fcf5f5e3b098501104bfbdde4d404db792871" +checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c" dependencies = [ "libc", "windows-sys 0.52.0", @@ -757,26 +753,26 @@ dependencies = [ [[package]] name = "stringprep" -version = "0.1.4" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb41d74e231a107a1b4ee36bd1214b11285b77768d2e3824aedafa988fd36ee6" +checksum = "7b4df3d392d81bd458a8a621b8bffbd2302a12ffe288a9d931670948749463b1" dependencies = [ - "finl_unicode", "unicode-bidi", "unicode-normalization", + "unicode-properties", ] [[package]] name = "subtle" -version = "2.5.0" +version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" -version = "2.0.52" +version = "2.0.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" +checksum = "901fa70d88b9d6c98022e23b4136f9f3e54e4662c3bc1bd1d84a42a9a0f0c1e9" dependencies = [ "proc-macro2", "quote", @@ -797,9 +793,9 @@ dependencies = [ [[package]] name = "tinyvec" -version = "1.6.0" +version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +checksum = "c55115c6fbe2d2bef26eb09ad74bde02d8255476fc0c7b515ef09fbb35742d82" dependencies = [ "tinyvec_macros", ] @@ -812,9 +808,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.36.0" +version = "1.38.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931" +checksum = "ba4f4a02a7a80d6f274636f0aa95c7e383b912d41fe721a31f29e29698585a4a" 
dependencies = [ "backtrace", "bytes", @@ -828,9 +824,9 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "2.2.0" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" +checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" dependencies = [ "proc-macro2", "quote", @@ -875,35 +871,15 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.10" +version = "0.7.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5419f34732d9eb6ee4c3578b7989078579b7f039cbbb9ca2c4da015749371e15" +checksum = "9cf6b47b3771c49ac75ad09a6162f53ad4b8088b76ac60e8ec1455b31a189fe1" dependencies = [ "bytes", "futures-core", "futures-sink", "pin-project-lite", "tokio", - "tracing", -] - -[[package]] -name = "tracing" -version = "0.1.40" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" -dependencies = [ - "pin-project-lite", - "tracing-core", -] - -[[package]] -name = "tracing-core" -version = "0.1.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" -dependencies = [ - "once_cell", ] [[package]] @@ -933,6 +909,12 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-properties" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4259d9d4425d9f0661581b804cb85fe66a4c631cadd8f490d1c13a35d5d9291" + [[package]] name = "vcpkg" version = "0.2.15" @@ -1023,11 +1005,11 @@ dependencies = [ [[package]] name = "whoami" -version = "1.5.0" +version = "1.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fec781d48b41f8163426ed18e8fc2864c12937df9ce54c88ede7bd47270893e" +checksum = "a44ab49fad634e88f55bf8f9bb3abd2f27d7204172a112c7c9987e01c1c94ea9" 
dependencies = [ - "redox_syscall", + "redox_syscall 0.4.1", "wasite", "web-sys", ] @@ -1047,7 +1029,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.52.4", + "windows-targets 0.52.5", ] [[package]] @@ -1067,17 +1049,18 @@ dependencies = [ [[package]] name = "windows-targets" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" +checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb" dependencies = [ - "windows_aarch64_gnullvm 0.52.4", - "windows_aarch64_msvc 0.52.4", - "windows_i686_gnu 0.52.4", - "windows_i686_msvc 0.52.4", - "windows_x86_64_gnu 0.52.4", - "windows_x86_64_gnullvm 0.52.4", - "windows_x86_64_msvc 0.52.4", + "windows_aarch64_gnullvm 0.52.5", + "windows_aarch64_msvc 0.52.5", + "windows_i686_gnu 0.52.5", + "windows_i686_gnullvm", + "windows_i686_msvc 0.52.5", + "windows_x86_64_gnu 0.52.5", + "windows_x86_64_gnullvm 0.52.5", + "windows_x86_64_msvc 0.52.5", ] [[package]] @@ -1088,9 +1071,9 @@ checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" [[package]] name = "windows_aarch64_gnullvm" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" +checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263" [[package]] name = "windows_aarch64_msvc" @@ -1100,9 +1083,9 @@ checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" [[package]] name = "windows_aarch64_msvc" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" +checksum = 
"9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6" [[package]] name = "windows_i686_gnu" @@ -1112,9 +1095,15 @@ checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" [[package]] name = "windows_i686_gnu" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" +checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9" [[package]] name = "windows_i686_msvc" @@ -1124,9 +1113,9 @@ checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" [[package]] name = "windows_i686_msvc" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" +checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf" [[package]] name = "windows_x86_64_gnu" @@ -1136,9 +1125,9 @@ checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" [[package]] name = "windows_x86_64_gnu" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" +checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9" [[package]] name = "windows_x86_64_gnullvm" @@ -1148,9 +1137,9 @@ checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" [[package]] name = "windows_x86_64_gnullvm" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" +checksum = 
"852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596" [[package]] name = "windows_x86_64_msvc" @@ -1160,6 +1149,6 @@ checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" [[package]] name = "windows_x86_64_msvc" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" +checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0" diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml b/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml index 0f420e5b06..27d01810bd 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml @@ -7,9 +7,9 @@ publish = false # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -native-tls = "0.2.11" +native-tls = "0.2.12" postgres-native-tls = "0.5.0" -tokio = { version = "1.36", features=["rt", "macros"] } +tokio = { version = "1.38", features=["rt", "macros"] } tokio-postgres = "0.7.10" diff --git a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile index 8611e66cbb..3e214de785 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile +++ b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile @@ -1,4 +1,4 @@ -FROM rust:1.76 +FROM rust:1.79 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile index 0402838820..6006e61ee2 100644 --- a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile @@ -1,11 +1,11 @@ -FROM swift:5.9 AS build +FROM swift:5.10 AS build RUN apt-get -q update && apt-get -q install -y libssl-dev WORKDIR /source COPY . . 
RUN swift build --configuration release -FROM swift:5.9 +FROM swift:5.10 WORKDIR /app COPY --from=build /source/.build/release . CMD ["/app/PostgresClientKitExample"] diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved index 767443a9dd..6e8613095f 100644 --- a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved @@ -1,4 +1,5 @@ { + "originHash" : "8eff8c577ba246ce7824d3434839acefced2b1a1d2b1ad700554502538a50558", "pins" : [ { "identity" : "bluesocket", @@ -18,15 +19,6 @@ "version" : "2.0.2" } }, - { - "identity" : "openssl", - "kind" : "remoteSourceControl", - "location" : "https://github.com/Kitura/OpenSSL.git", - "state" : { - "revision" : "5dc8cb4f971135c17343e3c6df4f28904a0600e2", - "version" : "2.3.1" - } - }, { "identity" : "postgresclientkit", "kind" : "remoteSourceControl", @@ -37,5 +29,5 @@ } } ], - "version" : 2 + "version" : 3 } diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift index 48320dd023..a66d09c542 100644 --- a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift @@ -1,4 +1,4 @@ -// swift-tools-version:5.8 +// swift-tools-version:5.10 import PackageDescription let package = Package( diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile b/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile index 9130e0973f..d6815fbb5f 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile @@ -1,10 +1,10 @@ -FROM swift:5.9 AS build +FROM swift:5.10 AS build WORKDIR /source COPY . . 
RUN swift build --configuration release -FROM swift:5.9 +FROM swift:5.10 WORKDIR /app COPY --from=build /source/.build/release . CMD ["/app/PostgresNIOExample"] diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved b/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved index 023e03a7b1..0e5dfdafcb 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved @@ -1,12 +1,22 @@ { + "originHash" : "11b5dcece349a3e56a7a9a7d0af6d0f5b83dff321b43124a01b158ed7aac5302", "pins" : [ { "identity" : "postgres-nio", "kind" : "remoteSourceControl", "location" : "https://github.com/vapor/postgres-nio.git", "state" : { - "revision" : "69ccfdf4c80144d845e3b439961b7ec6cd7ae33f", - "version" : "1.20.2" + "revision" : "5c268768890b062803a49f1358becc478f954265", + "version" : "1.21.5" + } + }, + { + "identity" : "swift-async-algorithms", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-async-algorithms.git", + "state" : { + "revision" : "da4e36f86544cdf733a40d59b3a2267e3a7bbf36", + "version" : "1.0.0" } }, { @@ -81,6 +91,15 @@ "version" : "1.20.1" } }, + { + "identity" : "swift-service-lifecycle", + "kind" : "remoteSourceControl", + "location" : "https://github.com/swift-server/swift-service-lifecycle.git", + "state" : { + "revision" : "d58e6bf2b1ae2884cf204a8b5bcaaa7aae3c1ff0", + "version" : "2.6.0" + } + }, { "identity" : "swift-system", "kind" : "remoteSourceControl", @@ -91,5 +110,5 @@ } } ], - "version" : 2 + "version" : 3 } diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift b/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift index 637eb4bc9d..20bb10f76c 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift @@ -1,10 +1,10 @@ -// swift-tools-version:5.9 +// swift-tools-version:5.10 import PackageDescription 
let package = Package( name: "PostgresNIOExample", dependencies: [ - .package(url: "https://github.com/vapor/postgres-nio.git", from: "1.20.2") + .package(url: "https://github.com/vapor/postgres-nio.git", from: "1.21.5") ], targets: [ .executableTarget( diff --git a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile index 004b383749..45e8753f7e 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile +++ b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile @@ -1,4 +1,4 @@ -FROM node:21 +FROM node:22 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json index b4f8587eac..19311808b6 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json +++ b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json @@ -5,7 +5,7 @@ "packages": { "": { "dependencies": { - "postgresql-client": "2.10.5" + "postgresql-client": "2.11.0" } }, "node_modules/doublylinked": { @@ -42,9 +42,10 @@ } }, "node_modules/postgresql-client": { - "version": "2.10.5", - "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.10.5.tgz", - "integrity": "sha512-R3EC16pUdbgrzk1J2MQLj7jY2TepWurJHoK90nOeLZj1XTpL/+wL1VCneTmclRVKDuKVjFHr+FASV47KrLpAbw==", + "version": "2.11.0", + "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.11.0.tgz", + "integrity": "sha512-QSPHcWVaiBG+JyASaDojOXvhRmsc2n8j2COdIjUDENFAtFls16Zy240asY2ENzZRQJUMAA8vpR8w4SAdI8jdbw==", + "license": "MIT", "dependencies": { "doublylinked": "^2.5.4", "lightning-pool": "^4.2.2", @@ -55,8 +56,7 @@ "putil-varhelpers": "^1.6.5" }, "engines": { - "node": ">=16.0", - "npm": ">=7.0.0" + "node": ">=16.0" } }, "node_modules/power-tasks": { diff --git a/test_runner/pg_clients/typescript/postgresql-client/package.json 
b/test_runner/pg_clients/typescript/postgresql-client/package.json index 07ec100d0d..d2bba23d29 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/package.json +++ b/test_runner/pg_clients/typescript/postgresql-client/package.json @@ -1,6 +1,6 @@ { "type": "module", "dependencies": { - "postgresql-client": "2.10.5" + "postgresql-client": "2.11.0" } } diff --git a/test_runner/pg_clients/typescript/serverless-driver/Dockerfile b/test_runner/pg_clients/typescript/serverless-driver/Dockerfile index 004b383749..45e8753f7e 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/Dockerfile +++ b/test_runner/pg_clients/typescript/serverless-driver/Dockerfile @@ -1,4 +1,4 @@ -FROM node:21 +FROM node:22 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/typescript/serverless-driver/package-lock.json b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json index f3b456f1ed..7f3f7f2e84 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/package-lock.json +++ b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json @@ -5,96 +5,138 @@ "packages": { "": { "dependencies": { - "@neondatabase/serverless": "0.9.0", + "@neondatabase/serverless": "0.9.4", "ws": "8.17.1" } }, "node_modules/@neondatabase/serverless": { - "version": "0.9.0", - "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.9.0.tgz", - "integrity": "sha512-mmJnUAzlzvxNSZuuhI6kgJjH+JgFdBMYUWxihtq/nj0Tjt+Y5UU3W+SvRFoucnd5NObYkuLYQzk+zV5DGFKGJg==", + "version": "0.9.4", + "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.9.4.tgz", + "integrity": "sha512-D0AXgJh6xkf+XTlsO7iwE2Q1w8981E1cLCPAALMU2YKtkF/1SF6BiAzYARZFYo175ON+b1RNIy9TdSFHm5nteg==", + "license": "MIT", "dependencies": { - "@types/pg": "8.6.6" + "@types/pg": "8.11.6" } }, "node_modules/@types/node": { - "version": "18.16.3", - "resolved": "https://registry.npmjs.org/@types/node/-/node-18.16.3.tgz", - "integrity": 
"sha512-OPs5WnnT1xkCBiuQrZA4+YAV4HEJejmHneyraIaxsbev5yCEr6KMwINNFP9wQeFIw8FWcoTqF3vQsa5CDaI+8Q==" + "version": "20.14.9", + "resolved": "https://registry.npmjs.org/@types/node/-/node-20.14.9.tgz", + "integrity": "sha512-06OCtnTXtWOZBJlRApleWndH4JsRVs1pDCc8dLSQp+7PpUpX3ePdHyeNSFTeSe7FtKyQkrlPvHwJOW3SLd8Oyg==", + "license": "MIT", + "dependencies": { + "undici-types": "~5.26.4" + } }, "node_modules/@types/pg": { - "version": "8.6.6", - "resolved": "https://registry.npmjs.org/@types/pg/-/pg-8.6.6.tgz", - "integrity": "sha512-O2xNmXebtwVekJDD+02udOncjVcMZQuTEQEMpKJ0ZRf5E7/9JJX3izhKUcUifBkyKpljyUM6BTgy2trmviKlpw==", + "version": "8.11.6", + "resolved": "https://registry.npmjs.org/@types/pg/-/pg-8.11.6.tgz", + "integrity": "sha512-/2WmmBXHLsfRqzfHW7BNZ8SbYzE8OSk7i3WjFYvfgRHj7S1xj+16Je5fUKv3lVdVzk/zn9TXOqf+avFCFIE0yQ==", + "license": "MIT", "dependencies": { "@types/node": "*", "pg-protocol": "*", - "pg-types": "^2.2.0" + "pg-types": "^4.0.1" } }, + "node_modules/obuf": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/obuf/-/obuf-1.1.2.tgz", + "integrity": "sha512-PX1wu0AmAdPqOL1mWhqmlOd8kOIZQwGZw6rh7uby9fTc5lhaOWFLX3I6R1hrF9k3zUY40e6igsLGkDXK92LJNg==", + "license": "MIT" + }, "node_modules/pg-int8": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/pg-int8/-/pg-int8-1.0.1.tgz", "integrity": "sha512-WCtabS6t3c8SkpDBUlb1kjOs7l66xsGdKpIPZsg4wR+B3+u9UAum2odSsF9tnvxg80h4ZxLWMy4pRjOsFIqQpw==", + "license": "ISC", "engines": { "node": ">=4.0.0" } }, - "node_modules/pg-protocol": { - "version": "1.6.0", - "resolved": "https://registry.npmjs.org/pg-protocol/-/pg-protocol-1.6.0.tgz", - "integrity": "sha512-M+PDm637OY5WM307051+bsDia5Xej6d9IR4GwJse1qA1DIhiKlksvrneZOYQq42OM+spubpcNYEo2FcKQrDk+Q==" - }, - "node_modules/pg-types": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/pg-types/-/pg-types-2.2.0.tgz", - "integrity": "sha512-qTAAlrEsl8s4OiEQY69wDvcMIdQN6wdz5ojQiOy6YRMuynxenON0O5oCpJI6lshc6scgAY8qvJ2On/p+CXY0GA==", - "dependencies": 
{ - "pg-int8": "1.0.1", - "postgres-array": "~2.0.0", - "postgres-bytea": "~1.0.0", - "postgres-date": "~1.0.4", - "postgres-interval": "^1.1.0" - }, + "node_modules/pg-numeric": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/pg-numeric/-/pg-numeric-1.0.2.tgz", + "integrity": "sha512-BM/Thnrw5jm2kKLE5uJkXqqExRUY/toLHda65XgFTBTFYZyopbKjBe29Ii3RbkvlsMoFwD+tHeGaCjjv0gHlyw==", + "license": "ISC", "engines": { "node": ">=4" } }, + "node_modules/pg-protocol": { + "version": "1.6.1", + "resolved": "https://registry.npmjs.org/pg-protocol/-/pg-protocol-1.6.1.tgz", + "integrity": "sha512-jPIlvgoD63hrEuihvIg+tJhoGjUsLPn6poJY9N5CnlPd91c2T18T/9zBtLxZSb1EhYxBRoZJtzScCaWlYLtktg==", + "license": "MIT" + }, + "node_modules/pg-types": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/pg-types/-/pg-types-4.0.2.tgz", + "integrity": "sha512-cRL3JpS3lKMGsKaWndugWQoLOCoP+Cic8oseVcbr0qhPzYD5DWXK+RZ9LY9wxRf7RQia4SCwQlXk0q6FCPrVng==", + "license": "MIT", + "dependencies": { + "pg-int8": "1.0.1", + "pg-numeric": "1.0.2", + "postgres-array": "~3.0.1", + "postgres-bytea": "~3.0.0", + "postgres-date": "~2.1.0", + "postgres-interval": "^3.0.0", + "postgres-range": "^1.1.1" + }, + "engines": { + "node": ">=10" + } + }, "node_modules/postgres-array": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/postgres-array/-/postgres-array-2.0.0.tgz", - "integrity": "sha512-VpZrUqU5A69eQyW2c5CA1jtLecCsN2U/bD6VilrFDWq5+5UIEVO7nazS3TEcHf1zuPYO/sqGvUvW62g86RXZuA==", + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/postgres-array/-/postgres-array-3.0.2.tgz", + "integrity": "sha512-6faShkdFugNQCLwucjPcY5ARoW1SlbnrZjmGl0IrrqewpvxvhSLHimCVzqeuULCbG0fQv7Dtk1yDbG3xv7Veog==", + "license": "MIT", "engines": { - "node": ">=4" + "node": ">=12" } }, "node_modules/postgres-bytea": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/postgres-bytea/-/postgres-bytea-1.0.0.tgz", - "integrity": 
"sha512-xy3pmLuQqRBZBXDULy7KbaitYqLcmxigw14Q5sj8QBVLqEwXfeybIKVWiqAXTlcvdvb0+xkOtDbfQMOf4lST1w==", + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/postgres-bytea/-/postgres-bytea-3.0.0.tgz", + "integrity": "sha512-CNd4jim9RFPkObHSjVHlVrxoVQXz7quwNFpz7RY1okNNme49+sVyiTvTRobiLV548Hx/hb1BG+iE7h9493WzFw==", + "license": "MIT", + "dependencies": { + "obuf": "~1.1.2" + }, "engines": { - "node": ">=0.10.0" + "node": ">= 6" } }, "node_modules/postgres-date": { - "version": "1.0.7", - "resolved": "https://registry.npmjs.org/postgres-date/-/postgres-date-1.0.7.tgz", - "integrity": "sha512-suDmjLVQg78nMK2UZ454hAG+OAW+HQPZ6n++TNDUX+L0+uUlLywnoxJKDou51Zm+zTCjrCl0Nq6J9C5hP9vK/Q==", + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/postgres-date/-/postgres-date-2.1.0.tgz", + "integrity": "sha512-K7Juri8gtgXVcDfZttFKVmhglp7epKb1K4pgrkLxehjqkrgPhfG6OO8LHLkfaqkbpjNRnra018XwAr1yQFWGcA==", + "license": "MIT", "engines": { - "node": ">=0.10.0" + "node": ">=12" } }, "node_modules/postgres-interval": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/postgres-interval/-/postgres-interval-1.2.0.tgz", - "integrity": "sha512-9ZhXKM/rw350N1ovuWHbGxnGh/SNJ4cnxHiM0rxE4VN41wsg8P8zWn9hv/buK00RP4WvlOyr/RBDiptyxVbkZQ==", - "dependencies": { - "xtend": "^4.0.0" - }, + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/postgres-interval/-/postgres-interval-3.0.0.tgz", + "integrity": "sha512-BSNDnbyZCXSxgA+1f5UU2GmwhoI0aU5yMxRGO8CdFEcY2BQF9xm/7MqKnYoM1nJDk8nONNWDk9WeSmePFhQdlw==", + "license": "MIT", "engines": { - "node": ">=0.10.0" + "node": ">=12" } }, + "node_modules/postgres-range": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/postgres-range/-/postgres-range-1.1.4.tgz", + "integrity": "sha512-i/hbxIE9803Alj/6ytL7UHQxRvZkI9O4Sy+J3HGc4F4oo/2eQAjTSNJ0bfxyse3bH0nuVesCk+3IRLaMtG3H6w==", + "license": "MIT" + }, + "node_modules/undici-types": { + "version": "5.26.5", + "resolved": 
"https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", + "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==", + "license": "MIT" + }, "node_modules/ws": { "version": "8.17.1", "resolved": "https://registry.npmjs.org/ws/-/ws-8.17.1.tgz", @@ -114,14 +156,6 @@ "optional": true } } - }, - "node_modules/xtend": { - "version": "4.0.2", - "resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.2.tgz", - "integrity": "sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ==", - "engines": { - "node": ">=0.4" - } } } } diff --git a/test_runner/pg_clients/typescript/serverless-driver/package.json b/test_runner/pg_clients/typescript/serverless-driver/package.json index 3ae7a8a6cf..f791d184c5 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/package.json +++ b/test_runner/pg_clients/typescript/serverless-driver/package.json @@ -1,7 +1,7 @@ { "type": "module", "dependencies": { - "@neondatabase/serverless": "0.9.0", + "@neondatabase/serverless": "0.9.4", "ws": "8.17.1" } } diff --git a/test_runner/regress/test_ancestor_branch.py b/test_runner/regress/test_ancestor_branch.py index 7e40081aa2..f83b44a7ad 100644 --- a/test_runner/regress/test_ancestor_branch.py +++ b/test_runner/regress/test_ancestor_branch.py @@ -20,7 +20,9 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): } ) - pageserver_http.configure_failpoints(("flush-frozen-pausable", "sleep(10000)")) + failpoint = "flush-frozen-pausable" + + pageserver_http.configure_failpoints((failpoint, "sleep(10000)")) endpoint_branch0 = env.endpoints.create_start("main", tenant_id=tenant) branch0_cur = endpoint_branch0.connect().cursor() @@ -96,3 +98,5 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): assert query_scalar(branch1_cur, "SELECT count(*) FROM foo") == 200000 assert query_scalar(branch2_cur, "SELECT count(*) FROM foo") == 300000 + + 
pageserver_http.configure_failpoints((failpoint, "off")) diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index f4667a82dc..a7eda73d4c 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -7,7 +7,7 @@ from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, ) -from fixtures.pageserver.http import PageserverApiException, TenantConfig +from fixtures.pageserver.http import TenantConfig from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind from fixtures.utils import wait_until @@ -21,8 +21,6 @@ def positive_env(neon_env_builder: NeonEnvBuilder) -> NeonEnv: [ # eviction might be the first one after an attach to access the layers ".*unexpectedly on-demand downloading remote layer .* for task kind Eviction", - # detach can happen before we get to validate the generation number - ".*deletion backend: Dropped remote consistent LSN updates for tenant.*", ] ) assert isinstance(env.pageserver_remote_storage, LocalFsStorage) @@ -58,10 +56,6 @@ def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, N env.pageserver.allowed_errors.extend( [ - # This fixture detaches the tenant, and tests using it will tend to re-attach it - # shortly after. 
There may be un-processed deletion_queue validations from the - # initial attachment - ".*Dropped remote consistent LSN updates.*", # This fixture is for tests that will intentionally generate 400 responses ".*Error processing HTTP request: Bad request", ] @@ -82,8 +76,8 @@ def test_null_body(negative_env: NegativeTests): tenant_id = negative_env.tenant_id ps_http = env.pageserver.http_client() - res = ps_http.post( - f"{ps_http.base_url}/v1/tenant/{tenant_id}/attach", + res = ps_http.put( + f"{ps_http.base_url}/v1/tenant/{tenant_id}/location_config", data=b"null", headers={"Content-Type": "application/json"}, ) @@ -99,35 +93,16 @@ def test_null_config(negative_env: NegativeTests): tenant_id = negative_env.tenant_id ps_http = env.pageserver.http_client() - res = ps_http.post( - f"{ps_http.base_url}/v1/tenant/{tenant_id}/attach", - data=b'{"config": null}', + res = ps_http.put( + f"{ps_http.base_url}/v1/tenant/{tenant_id}/location_config", + json={"mode": "AttachedSingle", "generation": 1, "tenant_conf": None}, headers={"Content-Type": "application/json"}, ) assert res.status_code == 400 -def test_config_with_unknown_keys_is_bad_request(negative_env: NegativeTests): - """ - If we send a config with unknown keys, the request should be rejected with status 400. 
- """ - - env = negative_env.neon_env - tenant_id = negative_env.tenant_id - - config_with_unknown_keys = { - "compaction_period": "1h", - "this_key_does_not_exist": "some value", - } - - with pytest.raises(PageserverApiException) as e: - env.pageserver.tenant_attach(tenant_id, config=config_with_unknown_keys) - assert e.type == PageserverApiException - assert e.value.status_code == 400 - - @pytest.mark.parametrize("content_type", [None, "application/json"]) -def test_no_config(positive_env: NeonEnv, content_type: Optional[str]): +def test_empty_config(positive_env: NeonEnv, content_type: Optional[str]): """ When the 'config' body attribute is omitted, the request should be accepted and the tenant should use the default configuration @@ -141,11 +116,13 @@ def test_no_config(positive_env: NeonEnv, content_type: Optional[str]): ps_http.tenant_detach(tenant_id) assert tenant_id not in [TenantId(t["id"]) for t in ps_http.tenant_list()] - body = {"generation": env.storage_controller.attach_hook_issue(tenant_id, env.pageserver.id)} - - ps_http.post( - f"{ps_http.base_url}/v1/tenant/{tenant_id}/attach", - json=body, + ps_http.put( + f"{ps_http.base_url}/v1/tenant/{tenant_id}/location_config", + json={ + "mode": "AttachedSingle", + "generation": env.storage_controller.attach_hook_issue(tenant_id, env.pageserver.id), + "tenant_conf": {}, + }, headers=None if content_type else {"Content-Type": "application/json"}, ).raise_for_status() @@ -191,7 +168,6 @@ def test_fully_custom_config(positive_env: NeonEnv): "refill_amount": 1000, "max": 1000, }, - "trace_read_requests": True, "walreceiver_connect_timeout": "13m", "image_layer_creation_check_threshold": 1, "switch_aux_file_policy": "cross-validation", diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index 035ab2796f..7cb85e3dd1 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -211,7 +211,7 @@ def test_auth_failures(neon_env_builder: NeonEnvBuilder, 
auth_enabled: bool): def check_pageserver(expect_success: bool, **conn_kwargs): check_connection( env.pageserver, - f"get_last_record_rlsn {env.initial_tenant} {timeline_id}", + f"pagestream {env.initial_tenant} {env.initial_timeline}", expect_success, **conn_kwargs, ) diff --git a/test_runner/regress/test_bad_connection.py b/test_runner/regress/test_bad_connection.py index 82a3a05c2b..392b73c1f7 100644 --- a/test_runner/regress/test_bad_connection.py +++ b/test_runner/regress/test_bad_connection.py @@ -10,7 +10,12 @@ from fixtures.neon_fixtures import NeonEnvBuilder @pytest.mark.timeout(600) def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() - env.pageserver.allowed_errors.append(".*simulated connection error.*") + env.pageserver.allowed_errors.append(".*simulated connection error.*") # this is never hit + + # the real reason (Simulated Connection Error) is on the next line, and we cannot filter this out. + env.pageserver.allowed_errors.append( + ".*ERROR error in page_service connection task: Postgres query error" + ) # Enable failpoint before starting everything else up so that we exercise the retry # on fetching basebackup @@ -69,3 +74,7 @@ def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): cur.fetchall() times_executed += 1 log.info(f"Workload executed {times_executed} times") + + # do a graceful shutdown which would had caught the allowed_errors before + # https://github.com/neondatabase/neon/pull/8632 + env.pageserver.stop() diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index eb503ddbfa..f2e3855c12 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -65,8 +65,8 @@ def test_branch_and_gc(neon_simple_env: NeonEnv, build_type: str): "compaction_period": "1 s", "compaction_threshold": "2", "image_creation_threshold": "1", - # set PITR interval to be small, so we 
can do GC - "pitr_interval": "1 s", + # Disable PITR, this test will set an explicit space-based GC limit + "pitr_interval": "0 s", } ) diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 03d6946c15..fc74707639 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -18,7 +18,6 @@ from fixtures.pageserver.utils import wait_until_tenant_active from fixtures.utils import query_scalar from performance.test_perf_pgbench import get_scales_matrix from requests import RequestException -from requests.exceptions import RetryError # Test branch creation @@ -151,7 +150,7 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE env.pageserver.allowed_errors.extend( [ ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*", - ".*page_service_conn_main.*: query handler for 'basebackup .* is not active, state: Loading", + ".*page_service_conn_main.*: query handler for 'basebackup .* ERROR: Not found: Timeline", ] ) ps_http = env.pageserver.http_client() @@ -176,10 +175,12 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE env.neon_cli.map_branch(initial_branch, env.initial_tenant, env.initial_timeline) - with pytest.raises(RuntimeError, match="is not active, state: Loading"): - env.endpoints.create_start(initial_branch, tenant_id=env.initial_tenant) + with pytest.raises(RuntimeError, match="ERROR: Not found: Timeline"): + env.endpoints.create_start( + initial_branch, tenant_id=env.initial_tenant, basebackup_request_tries=2 + ) + ps_http.configure_failpoints(("before-upload-index-pausable", "off")) finally: - # FIXME: paused uploads bother shutdown env.pageserver.stop(immediate=True) t.join() @@ -193,8 +194,11 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder env = neon_env_builder.init_configs() env.start() - env.pageserver.allowed_errors.append( - 
".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*" + env.pageserver.allowed_errors.extend( + [ + ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*", + ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: .*Cannot branch off the timeline that's not present in pageserver.*", + ] ) ps_http = env.pageserver.http_client() @@ -216,7 +220,10 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder branch_id = TimelineId.generate() - with pytest.raises(RetryError, match="too many 503 error responses"): + with pytest.raises( + PageserverApiException, + match="Cannot branch off the timeline that's not present in pageserver", + ): ps_http.timeline_create( env.pg_version, env.initial_tenant, @@ -389,6 +396,11 @@ def test_duplicate_creation(neon_env_builder: NeonEnvBuilder): repeat_result = ps_http.timeline_create( env.pg_version, env.initial_tenant, success_timeline, timeout=60 ) + # remote_consistent_lsn_visible will be published only after we've + # confirmed the generation, which is not part of what we await during + # timeline creation (uploads). mask it out here to avoid flakyness. + del success_result["remote_consistent_lsn_visible"] + del repeat_result["remote_consistent_lsn_visible"] assert repeat_result == success_result finally: env.pageserver.stop(immediate=True) diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 61afd820ca..5ec9a22ba1 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -17,22 +17,17 @@ from fixtures.pg_version import PgVersion # Test restarting page server, while safekeeper and compute node keep # running. 
def test_local_corruption(neon_env_builder: NeonEnvBuilder): - if neon_env_builder.pageserver_get_impl == "vectored": - reconstruct_function_name = "get_values_reconstruct_data" - else: - reconstruct_function_name = "get_value_reconstruct_data" - env = neon_env_builder.init_start() env.pageserver.allowed_errors.extend( [ - f".*{reconstruct_function_name} for layer .*", + ".*get_values_reconstruct_data for layer .*", ".*could not find data for key.*", ".*is not active. Current state: Broken.*", ".*will not become active. Current state: Broken.*", ".*failed to load metadata.*", ".*load failed.*load local timeline.*", - ".*layer loading failed permanently: load layer: .*", + ".*: layer load failed, assuming permanent failure:.*", ] ) @@ -79,7 +74,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): # (We don't check layer file contents on startup, when loading the timeline) # # This will change when we implement checksums for layers - with pytest.raises(Exception, match=f"{reconstruct_function_name} for layer ") as err: + with pytest.raises(Exception, match="get_values_reconstruct_data for layer ") as err: pg1.start() log.info( f"As expected, compute startup failed for timeline {tenant1}/{timeline1} with corrupt layers: {err}" diff --git a/test_runner/regress/test_change_pageserver.py b/test_runner/regress/test_change_pageserver.py index 97ab69049d..34791e5988 100644 --- a/test_runner/regress/test_change_pageserver.py +++ b/test_runner/regress/test_change_pageserver.py @@ -3,9 +3,16 @@ import asyncio from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.remote_storage import RemoteStorageKind +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response -def test_change_pageserver(neon_env_builder: NeonEnvBuilder): +def test_change_pageserver(neon_env_builder: NeonEnvBuilder, make_httpserver): + """ + A relatively low level test of reconfiguring a compute's pageserver at 
runtime. Usually this + is all done via the storage controller, but this test will disable the storage controller's compute + notifications, and instead update endpoints directly. + """ num_connections = 3 neon_env_builder.num_pageservers = 2 @@ -14,14 +21,24 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): ) env = neon_env_builder.init_start() - for pageserver in env.pageservers: - # This test dual-attaches a tenant, one of the pageservers will therefore - # be running with a stale generation. - pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") + neon_env_builder.control_plane_compute_hook_api = ( + f"http://{make_httpserver.host}:{make_httpserver.port}/notify-attach" + ) + + def ignore_notify(request: Request): + # This test does direct updates to compute configuration: disable the storage controller's notification + log.info(f"Ignoring storage controller compute notification: {request.json}") + return Response(status=200) + + make_httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler( + ignore_notify + ) env.neon_cli.create_branch("test_change_pageserver") endpoint = env.endpoints.create_start("test_change_pageserver") + # Put this tenant into a dual-attached state + assert env.get_tenant_pageserver(env.initial_tenant) == env.pageservers[0] alt_pageserver_id = env.pageservers[1].id env.pageservers[1].tenant_attach(env.initial_tenant) @@ -77,6 +94,7 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): env.pageservers[ 0 ].stop() # Stop the old pageserver just to make sure we're reading from the new one + env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"}) execute("SELECT count(*) FROM foo") assert fetchone() == (100000,) @@ -87,9 +105,10 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): # # Since we're dual-attached, need to tip-off storage controller to treat the one we're # about to start as the attached pageserver - 
env.storage_controller.attach_hook_issue(env.initial_tenant, env.pageservers[0].id) env.pageservers[0].start() env.pageservers[1].stop() + env.storage_controller.node_configure(env.pageservers[1].id, {"availability": "Offline"}) + env.storage_controller.reconcile_until_idle() endpoint.reconfigure(pageserver_id=env.pageservers[0].id) @@ -97,10 +116,9 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): assert fetchone() == (100000,) env.pageservers[0].stop() - # Since we're dual-attached, need to tip-off storage controller to treat the one we're - # about to start as the attached pageserver - env.storage_controller.attach_hook_issue(env.initial_tenant, env.pageservers[1].id) env.pageservers[1].start() + env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"}) + env.storage_controller.reconcile_until_idle() # Test a (former) bug where a child process spins without updating its connection string # by executing a query separately. This query will hang until we issue the reconfigure. 
diff --git a/test_runner/regress/test_combocid.py b/test_runner/regress/test_combocid.py new file mode 100644 index 0000000000..41907b1f20 --- /dev/null +++ b/test_runner/regress/test_combocid.py @@ -0,0 +1,153 @@ +from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver + + +def do_combocid_op(neon_env_builder: NeonEnvBuilder, op): + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + "shared_buffers='1MB'", + ], + ) + + conn = endpoint.connect() + cur = conn.cursor() + n_records = 1000 + + cur.execute("CREATE EXTENSION neon_test_utils") + + cur.execute("create table t(id integer, val integer)") + + cur.execute("begin") + cur.execute("insert into t values (1, 0)") + cur.execute("insert into t values (2, 0)") + cur.execute(f"insert into t select g, 0 from generate_series(3,{n_records}) g") + + # Open a cursor that scroll it halfway through + cur.execute("DECLARE c1 NO SCROLL CURSOR WITHOUT HOLD FOR SELECT * FROM t") + cur.execute("fetch 500 from c1") + rows = cur.fetchall() + assert len(rows) == 500 + + # Perform specified operation + cur.execute(op) + + # Clear the cache, so that we exercise reconstructing the pages + # from WAL + endpoint.clear_shared_buffers() + + # Check that the cursor opened earlier still works. If the + # combocids are not restored correctly, it won't. 
+ cur.execute("fetch all from c1") + rows = cur.fetchall() + assert len(rows) == 500 + + cur.execute("rollback") + flush_ep_to_pageserver(env, endpoint, env.initial_tenant, env.initial_timeline) + env.pageserver.http_client().timeline_checkpoint( + env.initial_tenant, env.initial_timeline, compact=False, wait_until_uploaded=True + ) + + +def test_combocid_delete(neon_env_builder: NeonEnvBuilder): + do_combocid_op(neon_env_builder, "delete from t") + + +def test_combocid_update(neon_env_builder: NeonEnvBuilder): + do_combocid_op(neon_env_builder, "update t set val=val+1") + + +def test_combocid_lock(neon_env_builder: NeonEnvBuilder): + do_combocid_op(neon_env_builder, "select * from t for update") + + +def test_combocid_multi_insert(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + "shared_buffers='1MB'", + ], + ) + + conn = endpoint.connect() + cur = conn.cursor() + n_records = 1000 + + cur.execute("CREATE EXTENSION neon_test_utils") + + cur.execute("create table t(id integer, val integer)") + file_path = f"{endpoint.pg_data_dir_path()}/t.csv" + cur.execute(f"insert into t select g, 0 from generate_series(1,{n_records}) g") + cur.execute(f"copy t to '{file_path}'") + cur.execute("truncate table t") + + cur.execute("begin") + cur.execute(f"copy t from '{file_path}'") + + # Open a cursor that scroll it halfway through + cur.execute("DECLARE c1 NO SCROLL CURSOR WITHOUT HOLD FOR SELECT * FROM t") + cur.execute("fetch 500 from c1") + rows = cur.fetchall() + assert len(rows) == 500 + + # Delete all the rows. Because all of the rows were inserted earlier in the + # same transaction, all the rows will get a combocid. + cur.execute("delete from t") + # Clear the cache, so that we exercise reconstructing the pages + # from WAL + endpoint.clear_shared_buffers() + + # Check that the cursor opened earlier still works. If the + # combocids are not restored correctly, it won't. 
+ cur.execute("fetch all from c1") + rows = cur.fetchall() + assert len(rows) == 500 + + cur.execute("rollback") + + flush_ep_to_pageserver(env, endpoint, env.initial_tenant, env.initial_timeline) + env.pageserver.http_client().timeline_checkpoint( + env.initial_tenant, env.initial_timeline, compact=False, wait_until_uploaded=True + ) + + +def test_combocid(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start("main") + + conn = endpoint.connect() + cur = conn.cursor() + n_records = 100000 + + cur.execute("create table t(id integer, val integer)") + cur.execute(f"insert into t values (generate_series(1,{n_records}), 0)") + + cur.execute("begin") + + cur.execute("update t set val=val+1") + assert cur.rowcount == n_records + cur.execute("update t set val=val+1") + assert cur.rowcount == n_records + cur.execute("update t set val=val+1") + assert cur.rowcount == n_records + + cur.execute("delete from t") + assert cur.rowcount == n_records + cur.execute("delete from t") + assert cur.rowcount == 0 + + cur.execute(f"insert into t values (generate_series(1,{n_records}), 0)") + cur.execute("update t set val=val+1") + assert cur.rowcount == n_records + cur.execute("update t set val=val+1") + assert cur.rowcount == n_records + cur.execute("update t set val=val+1") + assert cur.rowcount == n_records + + cur.execute("rollback") + + flush_ep_to_pageserver(env, endpoint, env.initial_tenant, env.initial_timeline) + env.pageserver.http_client().timeline_checkpoint( + env.initial_tenant, env.initial_timeline, compact=False, wait_until_uploaded=True + ) diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 49dcb9b86a..be787e0642 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -1,12 +1,17 @@ import enum import json import os +import time from typing import Optional import pytest from fixtures.log_helper import log -from 
fixtures.neon_fixtures import NeonEnvBuilder, generate_uploads_and_deletions +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + generate_uploads_and_deletions, +) from fixtures.pageserver.http import PageserverApiException +from fixtures.utils import wait_until from fixtures.workload import Workload AGGRESIVE_COMPACTION_TENANT_CONF = { @@ -140,6 +145,10 @@ def test_sharding_compaction( "image_layer_creation_check_threshold": 0, } + # Disable compression, as we can't estimate the size of layers with compression enabled + # TODO: implement eager layer cutting during compaction + neon_env_builder.pageserver_config_override = "image_compression='disabled'" + neon_env_builder.num_pageservers = 1 if shard_count is None else shard_count env = neon_env_builder.init_start( initial_tenant_conf=TENANT_CONF, @@ -257,3 +266,148 @@ def test_uploads_and_deletions( found_allowed_error = any(env.pageserver.log_contains(e) for e in allowed_errors) if not found_allowed_error: raise Exception("None of the allowed_errors occured in the log") + + +def test_pageserver_compaction_circuit_breaker(neon_env_builder: NeonEnvBuilder): + """ + Check that repeated failures in compaction result in a circuit breaker breaking + """ + TENANT_CONF = { + # Very frequent runs to rack up failures quickly + "compaction_period": "100ms", + # Small checkpoint distance to create many layers + "checkpoint_distance": 1024 * 128, + # Compact small layers + "compaction_target_size": 1024 * 128, + "image_creation_threshold": 1, + } + + FAILPOINT = "delta-layer-writer-fail-before-finish" + BROKEN_LOG = ".*Circuit breaker broken!.*" + + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) + + workload = Workload(env, env.initial_tenant, env.initial_timeline) + workload.init() + + # Set a failpoint that will prevent compaction succeeding + env.pageserver.http_client().configure_failpoints((FAILPOINT, "return")) + + # Write some data to trigger compaction + workload.write_rows(1024, 
upload=False) + workload.write_rows(1024, upload=False) + workload.write_rows(1024, upload=False) + + def assert_broken(): + env.pageserver.assert_log_contains(BROKEN_LOG) + assert ( + env.pageserver.http_client().get_metric_value("pageserver_circuit_breaker_broken_total") + or 0 + ) == 1 + assert ( + env.pageserver.http_client().get_metric_value( + "pageserver_circuit_breaker_unbroken_total" + ) + or 0 + ) == 0 + + # Wait for enough failures to break the circuit breaker + # This wait is fairly long because we back off on compaction failures, so 5 retries takes ~30s + wait_until(60, 1, assert_broken) + + # Sleep for a while, during which time we expect that compaction will _not_ be retried + time.sleep(10) + + assert ( + env.pageserver.http_client().get_metric_value("pageserver_circuit_breaker_broken_total") + or 0 + ) == 1 + assert ( + env.pageserver.http_client().get_metric_value("pageserver_circuit_breaker_unbroken_total") + or 0 + ) == 0 + assert not env.pageserver.log_contains(".*Circuit breaker failure ended.*") + + +@pytest.mark.parametrize("enabled", [True, False]) +def test_image_layer_compression(neon_env_builder: NeonEnvBuilder, enabled: bool): + tenant_conf = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{128 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{128 * 1024}", + # no PITR horizon, we specify the horizon when we request on-demand GC + "pitr_interval": "0s", + # disable background compaction and GC. We invoke it manually when we want it to happen. 
+ "gc_period": "0s", + "compaction_period": "0s", + # create image layers as eagerly as possible + "image_creation_threshold": "1", + "image_layer_creation_check_threshold": "0", + } + + # Explicitly enable/disable compression, rather than using default + if enabled: + neon_env_builder.pageserver_config_override = "image_compression='zstd'" + else: + neon_env_builder.pageserver_config_override = "image_compression='disabled'" + + env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + pageserver = env.pageserver + ps_http = env.pageserver.http_client() + with env.endpoints.create_start( + "main", tenant_id=tenant_id, pageserver_id=pageserver.id + ) as endpoint: + endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)") + # Generate around 800k worth of easily compressible data to store + for v in range(100): + endpoint.safe_psql( + f"INSERT INTO foo (id, val) VALUES ({v}, repeat('abcde{v:0>3}', 500))" + ) + # run compaction to create image layers + ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True) + + layer_map = ps_http.layer_map_info(tenant_id, timeline_id) + image_layer_count = 0 + delta_layer_count = 0 + for layer in layer_map.historic_layers: + if layer.kind == "Image": + image_layer_count += 1 + elif layer.kind == "Delta": + delta_layer_count += 1 + assert image_layer_count > 0 + assert delta_layer_count > 0 + + log.info(f"images: {image_layer_count}, deltas: {delta_layer_count}") + + bytes_in = pageserver.http_client().get_metric_value( + "pageserver_compression_image_in_bytes_total" + ) + bytes_out = pageserver.http_client().get_metric_value( + "pageserver_compression_image_out_bytes_total" + ) + assert bytes_in is not None + assert bytes_out is not None + log.info(f"Compression ratio: {bytes_out/bytes_in} ({bytes_out} in, {bytes_out} out)") + + if enabled: + # We are writing high compressible repetitive plain text, expect 
excellent compression + EXPECT_RATIO = 0.2 + assert bytes_out / bytes_in < EXPECT_RATIO + else: + # Nothing should be compressed if we disabled it. + assert bytes_out >= bytes_in + + # Destroy the endpoint and create a new one to resetthe caches + with env.endpoints.create_start( + "main", tenant_id=tenant_id, pageserver_id=pageserver.id + ) as endpoint: + for v in range(100): + res = endpoint.safe_psql( + f"SELECT count(*) FROM foo WHERE id={v} and val=repeat('abcde{v:0>3}', 500)" + ) + assert res[0][0] == 1 diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 65649e0c0a..c361efe90a 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -3,26 +3,27 @@ import re import shutil import subprocess import tempfile +from dataclasses import dataclass from pathlib import Path from typing import List, Optional import pytest import toml -from fixtures.common_types import Lsn +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, PgBin, + flush_ep_to_pageserver, ) from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( timeline_delete_wait_completed, - wait_for_last_record_lsn, - wait_for_upload, ) from fixtures.pg_version import PgVersion -from fixtures.remote_storage import RemoteStorageKind +from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage +from fixtures.workload import Workload # # A test suite that help to prevent unintentionally breaking backward or forward compatibility between Neon releases. 
@@ -41,7 +42,7 @@ from fixtures.remote_storage import RemoteStorageKind # # How to run `test_backward_compatibility` locally: # -# export DEFAULT_PG_VERSION=15 +# export DEFAULT_PG_VERSION=16 # export BUILD_TYPE=release # export CHECK_ONDISK_DATA_COMPATIBILITY=true # export COMPATIBILITY_SNAPSHOT_DIR=test_output/compatibility_snapshot_pgv${DEFAULT_PG_VERSION} @@ -63,7 +64,7 @@ from fixtures.remote_storage import RemoteStorageKind # # How to run `test_forward_compatibility` locally: # -# export DEFAULT_PG_VERSION=15 +# export DEFAULT_PG_VERSION=16 # export BUILD_TYPE=release # export CHECK_ONDISK_DATA_COMPATIBILITY=true # export COMPATIBILITY_NEON_BIN=neon_previous/target/${BUILD_TYPE} @@ -124,11 +125,9 @@ def test_create_snapshot( timeline_id = dict(snapshot_config["branch_name_mappings"]["main"])[tenant_id] pageserver_http = env.pageserver.http_client() - lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) - wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, lsn) - pageserver_http.timeline_checkpoint(tenant_id, timeline_id) - wait_for_upload(pageserver_http, tenant_id, timeline_id, lsn) + flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id) + pageserver_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True) env.endpoints.stop_all() for sk in env.safekeepers: @@ -227,12 +226,6 @@ def test_forward_compatibility( ) try: - # Previous version neon_local and pageserver are not aware - # of the new config. - # TODO: remove these once the previous version of neon local supports them - neon_env_builder.pageserver_get_impl = None - neon_env_builder.pageserver_validate_vectored_get = None - neon_env_builder.num_safekeepers = 3 # Use previous version's production binaries (pageserver, safekeeper, pg_distrib_dir, etc.). 
@@ -308,7 +301,7 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r pg_version = env.pg_version # Stop endpoint while we recreate timeline - ep.stop() + flush_ep_to_pageserver(env, ep, tenant_id, timeline_id) try: pageserver_http.timeline_preserve_initdb_archive(tenant_id, timeline_id) @@ -356,6 +349,11 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r assert not dump_from_wal_differs, "dump from WAL differs" assert not initial_dump_differs, "initial dump differs" + flush_ep_to_pageserver(env, ep, tenant_id, timeline_id) + pageserver_http.timeline_checkpoint( + tenant_id, timeline_id, compact=False, wait_until_uploaded=True + ) + def dump_differs( first: Path, second: Path, output: Path, allowed_diffs: Optional[List[str]] = None @@ -415,3 +413,132 @@ def dump_differs( break return differs + + +@dataclass +class HistoricDataSet: + name: str + tenant_id: TenantId + pg_version: PgVersion + url: str + + def __str__(self): + return self.name + + +HISTORIC_DATA_SETS = [ + # From before we enabled image layer compression. + # - IndexPart::LATEST_VERSION 7 + # - STORAGE_FORMAT_VERSION 3 + HistoricDataSet( + "2024-07-18", + TenantId("17bf64a53509714687664b3a84e9b3ba"), + PgVersion.V16, + "https://neon-github-public-dev.s3.eu-central-1.amazonaws.com/compatibility-data-snapshots/2024-07-18-pgv16.tar.zst", + ), +] + + +@pytest.mark.parametrize("dataset", HISTORIC_DATA_SETS) +@pytest.mark.xdist_group("compatibility") +def test_historic_storage_formats( + neon_env_builder: NeonEnvBuilder, + test_output_dir: Path, + pg_version: PgVersion, + dataset: HistoricDataSet, +): + """ + This test is like test_backward_compatibility, but it looks back further to examples of our storage format from long ago. 
+ """ + + ARTIFACT_CACHE_DIR = "./artifact_cache" + + import tarfile + from contextlib import closing + + import requests + import zstandard + + artifact_unpack_path = ARTIFACT_CACHE_DIR / Path("unpacked") / Path(dataset.name) + + # Note: we assume that when running across a matrix of PG versions, the matrix includes all the versions needed by + # HISTORIC_DATA_SETS. If we ever remove a PG version from the matrix, then historic datasets built using that version + # will no longer be covered by this test. + if pg_version != dataset.pg_version: + pytest.skip(f"Dataset {dataset} is for different PG version, skipping") + + with closing(requests.get(dataset.url, stream=True)) as r: + unzstd = zstandard.ZstdDecompressor() + with unzstd.stream_reader(r.raw) as stream: + with tarfile.open(mode="r|", fileobj=stream) as tf: + tf.extractall(artifact_unpack_path) + + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.pg_version = dataset.pg_version + env = neon_env_builder.init_configs() + env.start() + assert isinstance(env.pageserver_remote_storage, S3Storage) + + # Link artifact data into test's remote storage. We don't want the whole repo dir, just the remote storage part: we are not testing + # compat of local disk data across releases (test_backward_compat does that), we're testing really long-lived data in S3 like layer files and indices. + # + # The code generating the snapshot uses local_fs, but this test uses S3Storage, so we are copying a tree of files into a bucket. 
We use + # S3Storage so that the scrubber can run (the scrubber doesn't speak local_fs) + artifact_pageserver_path = ( + artifact_unpack_path / Path("repo") / Path("local_fs_remote_storage") / Path("pageserver") + ) + for root, _dirs, files in os.walk(artifact_pageserver_path): + for file in files: + local_path = os.path.join(root, file) + remote_key = ( + env.pageserver_remote_storage.prefix_in_bucket + + str(local_path)[len(str(artifact_pageserver_path)) :] + ) + log.info(f"Uploading {local_path} -> {remote_key}") + env.pageserver_remote_storage.client.upload_file( + local_path, env.pageserver_remote_storage.bucket_name, remote_key + ) + + # Check the scrubber handles this old data correctly (can read it and doesn't consider it corrupt) + # + # Do this _before_ importing to the pageserver, as that import may start writing immediately + healthy, metadata_summary = env.storage_scrubber.scan_metadata() + assert healthy + assert metadata_summary["tenant_count"] >= 1 + assert metadata_summary["timeline_count"] >= 1 + + env.neon_cli.import_tenant(dataset.tenant_id) + + # Discover timelines + timelines = env.pageserver.http_client().timeline_list(dataset.tenant_id) + # All our artifacts should contain at least one timeline + assert len(timelines) > 0 + + # TODO: ensure that the snapshots we're importing contain a sensible variety of content, at the very + # least they should include a mixture of deltas and image layers. Preferably they should also + # contain some "exotic" stuff like aux files from logical replication. 
+ + # Check we can start an endpoint and read the SQL that the artifact is meant to contain + reference_sql_dump = artifact_unpack_path / Path("dump.sql") + ep = env.endpoints.create_start("main", tenant_id=dataset.tenant_id) + pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) + pg_bin.run_capture( + ["pg_dumpall", f"--dbname={ep.connstr()}", f"--file={test_output_dir / 'dump.sql'}"] + ) + assert not dump_differs( + reference_sql_dump, + test_output_dir / "dump.sql", + test_output_dir / "dump.filediff", + ) + ep.stop() + + # Check we can also do writes to the database + existing_timeline_id = TimelineId(timelines[0]["timeline_id"]) + workload = Workload(env, dataset.tenant_id, existing_timeline_id) + workload.init() + workload.write_rows(100) + + # Check that compaction works + env.pageserver.http_client().timeline_compact( + dataset.tenant_id, existing_timeline_id, force_image_layer_creation=True + ) diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 7722828c79..85616c3fe2 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -21,6 +21,10 @@ from fixtures.utils import human_bytes, wait_until GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy" +# access times in the pageserver are stored at a very low resolution: to generate meaningfully different +# values, tests must inject sleeps +ATIME_RESOLUTION = 2 + @pytest.mark.parametrize("config_level_override", [None, 400]) def test_min_resident_size_override_handling( @@ -67,14 +71,11 @@ def test_min_resident_size_override_handling( @enum.unique class EvictionOrder(str, enum.Enum): - ABSOLUTE_ORDER = "absolute" RELATIVE_ORDER_EQUAL = "relative_equal" RELATIVE_ORDER_SPARE = "relative_spare" def config(self) -> Dict[str, Any]: - if self == EvictionOrder.ABSOLUTE_ORDER: - return {"type": 
"AbsoluteAccessed"} - elif self == EvictionOrder.RELATIVE_ORDER_EQUAL: + if self == EvictionOrder.RELATIVE_ORDER_EQUAL: return { "type": "RelativeAccessed", "args": {"highest_layer_count_loses_first": False}, @@ -230,6 +231,9 @@ def _eviction_env( neon_env_builder.num_pageservers = num_pageservers neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + # Disable compression support for EvictionEnv to get larger layer sizes + neon_env_builder.pageserver_config_override = "image_compression='disabled'" + # initial tenant will not be present on this pageserver env = neon_env_builder.init_configs() env.start() @@ -381,7 +385,7 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv): @pytest.mark.parametrize( "order", - [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL], + [EvictionOrder.RELATIVE_ORDER_EQUAL], ) def test_pageserver_evicts_until_pressure_is_relieved( eviction_env: EvictionEnv, order: EvictionOrder @@ -415,7 +419,7 @@ def test_pageserver_evicts_until_pressure_is_relieved( @pytest.mark.parametrize( "order", - [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL], + [EvictionOrder.RELATIVE_ORDER_EQUAL], ) def test_pageserver_respects_overridden_resident_size( eviction_env: EvictionEnv, order: EvictionOrder @@ -492,7 +496,7 @@ def test_pageserver_respects_overridden_resident_size( @pytest.mark.parametrize( "order", - [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL], + [EvictionOrder.RELATIVE_ORDER_EQUAL], ) def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: EvictionOrder): """ @@ -523,7 +527,6 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: E @pytest.mark.parametrize( "order", [ - EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL, EvictionOrder.RELATIVE_ORDER_SPARE, ], @@ -547,6 +550,7 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder): (tenant_id, timeline_id) = warm # 
make picked tenant more recently used than the other one + time.sleep(ATIME_RESOLUTION) env.warm_up_tenant(tenant_id) # Build up enough pressure to require evictions from both tenants, @@ -569,63 +573,38 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder): later_tenant_usage < du_by_timeline[tenant] ), "all tenants should have lost some layers" - warm_size = later_du_by_timeline[warm] - cold_size = later_du_by_timeline[cold] + # with relative order what matters is the amount of layers, with a + # fudge factor of whether the eviction bothers tenants with highest + # layer count the most. last accessed times between tenants does not + # matter. + assert order in [EvictionOrder.RELATIVE_ORDER_EQUAL, EvictionOrder.RELATIVE_ORDER_SPARE] + layers_now = env.count_layers_per_tenant(env.pageserver) - if order == EvictionOrder.ABSOLUTE_ORDER: - # bounds for warmed_size - warm_lower = 0.5 * du_by_timeline[warm] + expected_ratio = later_total_on_disk / total_on_disk + log.info( + f"freed up {100 * expected_ratio}%, expecting the layer counts to decrease in similar ratio" + ) - # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room. - # So, check for up to 3 here. 
- warm_upper = warm_lower + 3 * env.layer_size + for tenant_id, original_count in tenant_layers.items(): + count_now = layers_now[tenant_id] + ratio = count_now / original_count + abs_diff = abs(ratio - expected_ratio) + assert original_count > count_now - cold_upper = 2 * env.layer_size - log.info(f"tenants: warm={warm[0]}, cold={cold[0]}") + expectation = 0.065 log.info( - f"expecting for warm tenant: {human_bytes(warm_lower)} < {human_bytes(warm_size)} < {human_bytes(warm_upper)}" + f"tenant {tenant_id} layer count {original_count} -> {count_now}, ratio: {ratio}, expecting {abs_diff} < {expectation}" ) - log.info(f"expecting for cold tenant: {human_bytes(cold_size)} < {human_bytes(cold_upper)}") - - assert warm_size > warm_lower, "warmed up tenant should be at about half size (lower)" - assert warm_size < warm_upper, "warmed up tenant should be at about half size (upper)" - - assert ( - cold_size < cold_upper - ), "the cold tenant should be evicted to its min_resident_size, i.e., max layer file size" - else: - # with relative order what matters is the amount of layers, with a - # fudge factor of whether the eviction bothers tenants with highest - # layer count the most. last accessed times between tenants does not - # matter. 
- layers_now = env.count_layers_per_tenant(env.pageserver) - - expected_ratio = later_total_on_disk / total_on_disk - log.info( - f"freed up {100 * expected_ratio}%, expecting the layer counts to decrease in similar ratio" - ) - - for tenant_id, original_count in tenant_layers.items(): - count_now = layers_now[tenant_id] - ratio = count_now / original_count - abs_diff = abs(ratio - expected_ratio) - assert original_count > count_now - - expectation = 0.06 - log.info( - f"tenant {tenant_id} layer count {original_count} -> {count_now}, ratio: {ratio}, expecting {abs_diff} < {expectation}" - ) - # in this test case both relative_spare and relative_equal produce - # the same outcomes; this must be a quantization effect of similar - # sizes (-s4 and -s6) and small (5MB) layer size. - # for pg15 and pg16 the absdiff is < 0.01, for pg14 it is closer to 0.02 - assert abs_diff < expectation + # in this test case both relative_spare and relative_equal produce + # the same outcomes; this must be a quantization effect of similar + # sizes (-s4 and -s6) and small (5MB) layer size. + # for pg15 and pg16 the absdiff is < 0.01, for pg14 it is closer to 0.02 + assert abs_diff < expectation @pytest.mark.parametrize( "order", [ - EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL, EvictionOrder.RELATIVE_ORDER_SPARE, ], @@ -648,6 +627,10 @@ def test_fast_growing_tenant(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, or for scale in [1, 1, 1, 4]: timelines.append((pgbench_init_tenant(layer_size, scale, env, pg_bin), scale)) + # Eviction times are stored at a low resolution. We must ensure that the time between + # tenants is long enough for the pageserver to distinguish them. 
+ time.sleep(ATIME_RESOLUTION) + env.neon_cli.safekeeper_stop() for (tenant_id, timeline_id), scale in timelines: @@ -677,14 +660,7 @@ def test_fast_growing_tenant(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, or ), "rest of the assertions expect 3 + 1 timelines, ratios, scales, all in order" log.info(f"{ratios}") - if order == EvictionOrder.ABSOLUTE_ORDER: - # first tenant loses most - assert ratios[0] <= ratios[1], "first should lose the most" - assert ratios[1] < ratios[2], "second should lose some" - assert ratios[1] < 1.0 - assert ratios[2] <= ratios[3], "third might not lose" - assert ratios[3] == 1.0, "tenant created last does not lose" - elif order == EvictionOrder.RELATIVE_ORDER_EQUAL: + if order == EvictionOrder.RELATIVE_ORDER_EQUAL: assert all([x for x in ratios if x < 1.0]), "all tenants lose layers" elif order == EvictionOrder.RELATIVE_ORDER_SPARE: # with different layer sizes and pg versions, there are different combinations @@ -747,7 +723,7 @@ def test_statvfs_error_handling(eviction_env: EvictionEnv): "type": "Failure", "mocked_error": "EIO", }, - eviction_order=EvictionOrder.ABSOLUTE_ORDER, + eviction_order=EvictionOrder.RELATIVE_ORDER_SPARE, ) env.neon_env.pageserver.assert_log_contains(".*statvfs failed.*EIO") @@ -781,7 +757,7 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv): # This avoids accounting for metadata files & tenant conf in the tests. "name_filter": ".*__.*", }, - eviction_order=EvictionOrder.ABSOLUTE_ORDER, + eviction_order=EvictionOrder.RELATIVE_ORDER_SPARE, ) wait_until( @@ -794,6 +770,16 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv): wait_until(2, 2, less_than_max_usage_pct) + # Disk usage candidate collection only takes into account active tenants. + # However, the statvfs call takes into account the entire tenants directory, + # which includes tenants which haven't become active yet. 
+ # + # After re-starting the pageserver, disk usage eviction may kick in *before* + # both tenants have become active. Hence, the logic will try to satisfy the + # disk usage requirements by evicting everything belonging to the active tenant, + # and hence violating the tenant minimum resident size. + env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE) + def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): """ @@ -824,7 +810,7 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): # This avoids accounting for metadata files & tenant conf in the tests. "name_filter": ".*__.*", }, - eviction_order=EvictionOrder.ABSOLUTE_ORDER, + eviction_order=EvictionOrder.RELATIVE_ORDER_SPARE, ) wait_until( diff --git a/test_runner/regress/test_endpoint_crash.py b/test_runner/regress/test_endpoint_crash.py new file mode 100644 index 0000000000..ae3dded437 --- /dev/null +++ b/test_runner/regress/test_endpoint_crash.py @@ -0,0 +1,23 @@ +import pytest +from fixtures.neon_fixtures import NeonEnvBuilder + + +@pytest.mark.parametrize( + "sql_func", + [ + "trigger_panic", + "trigger_segfault", + "💣", # calls `trigger_segfault` internally + ], +) +def test_endpoint_crash(neon_env_builder: NeonEnvBuilder, sql_func: str): + """ + Test that triggering crash from neon_test_utils crashes the endpoint + """ + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_endpoint_crash") + endpoint = env.endpoints.create_start("test_endpoint_crash") + + endpoint.safe_psql("CREATE EXTENSION neon_test_utils;") + with pytest.raises(Exception, match="This probably means the server terminated abnormally"): + endpoint.safe_psql(f"SELECT {sql_func}();") diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index 8edc8c554c..ae63136abb 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -168,7 +168,7 @@ def test_hot_standby_gc(neon_env_builder: 
NeonEnvBuilder, pause_apply: bool): # re-execute the query, it will make GetPage # requests. This does not clear the last-written LSN cache # so we still remember the LSNs of the pages. - s_cur.execute("SELECT clear_buffer_cache()") + secondary.clear_shared_buffers(cursor=s_cur) if pause_apply: s_cur.execute("SELECT pg_wal_replay_pause()") @@ -332,6 +332,7 @@ def test_replica_query_race(neon_simple_env: NeonEnv): log.info(f"read {reads}: counter {readcounter}, last update {writecounter}") reads += 1 + # FIXME: what about LFC clearing? await conn.execute("SELECT clear_buffer_cache()") async def both(): diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index ac27a4cf36..4385cfca76 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -18,7 +18,6 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.utils import ( timeline_delete_wait_completed, wait_for_last_record_lsn, - wait_for_upload, ) from fixtures.remote_storage import RemoteStorageKind from fixtures.utils import assert_pageserver_backups_equal, subprocess_capture @@ -76,7 +75,7 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build start_lsn = manifest["WAL-Ranges"][0]["Start-LSN"] end_lsn = manifest["WAL-Ranges"][0]["End-LSN"] - endpoint_id = "ep-import_from_vanilla" + branch_name = "import_from_vanilla" tenant = TenantId.generate() timeline = TimelineId.generate() @@ -88,7 +87,8 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build env.pageserver.allowed_errors.extend( [ - ".*error importing base backup .*", + ".*Failed to import basebackup.*", + ".*unexpected non-zero bytes after the tar archive.*", ".*Timeline got dropped without initializing, cleaning its files.*", ".*InternalServerError.*timeline not found.*", ".*InternalServerError.*Tenant .* not found.*", @@ -106,8 +106,8 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build 
str(tenant), "--timeline-id", str(timeline), - "--node-name", - endpoint_id, + "--branch-name", + branch_name, "--base-lsn", start_lsn, "--base-tarfile", @@ -143,10 +143,10 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build # Wait for data to land in s3 wait_for_last_record_lsn(client, tenant, timeline, Lsn(end_lsn)) - wait_for_upload(client, tenant, timeline, Lsn(end_lsn)) + client.timeline_checkpoint(tenant, timeline, compact=False, wait_until_uploaded=True) # Check it worked - endpoint = env.endpoints.create_start(endpoint_id, tenant_id=tenant) + endpoint = env.endpoints.create_start(branch_name, tenant_id=tenant) assert endpoint.safe_psql("select count(*) from t") == [(300000,)] vanilla_pg.stop() @@ -265,7 +265,7 @@ def _import( tenant = TenantId.generate() # Import to pageserver - endpoint_id = "ep-import_from_pageserver" + branch_name = "import_from_pageserver" client = env.pageserver.http_client() env.pageserver.tenant_create(tenant) env.neon_cli.raw_cli( @@ -276,8 +276,8 @@ def _import( str(tenant), "--timeline-id", str(timeline), - "--node-name", - endpoint_id, + "--branch-name", + branch_name, "--base-lsn", str(lsn), "--base-tarfile", @@ -289,10 +289,10 @@ def _import( # Wait for data to land in s3 wait_for_last_record_lsn(client, tenant, timeline, lsn) - wait_for_upload(client, tenant, timeline, lsn) + client.timeline_checkpoint(tenant, timeline, compact=False, wait_until_uploaded=True) # Check it worked - endpoint = env.endpoints.create_start(endpoint_id, tenant_id=tenant, lsn=lsn) + endpoint = env.endpoints.create_start(branch_name, tenant_id=tenant, lsn=lsn) assert endpoint.safe_psql("select count(*) from tbl") == [(expected_num_rows,)] # Take another fullbackup diff --git a/test_runner/regress/test_layer_bloating.py b/test_runner/regress/test_layer_bloating.py index 77dc8a35b5..b8126395fd 100644 --- a/test_runner/regress/test_layer_bloating.py +++ b/test_runner/regress/test_layer_bloating.py @@ -1,27 +1,31 @@ import 
os -import time import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( - NeonEnv, + NeonEnvBuilder, logical_replication_sync, wait_for_last_flush_lsn, ) from fixtures.pg_version import PgVersion -def test_layer_bloating(neon_simple_env: NeonEnv, vanilla_pg): - env = neon_simple_env - - if env.pg_version != PgVersion.V16: +def test_layer_bloating(neon_env_builder: NeonEnvBuilder, vanilla_pg): + if neon_env_builder.pg_version != PgVersion.V16: pytest.skip("pg_log_standby_snapshot() function is available only in PG16") - timeline = env.neon_cli.create_branch("test_logical_replication", "empty") - endpoint = env.endpoints.create_start( - "test_logical_replication", config_lines=["log_statement=all"] + env = neon_env_builder.init_start( + initial_tenant_conf={ + "gc_period": "0s", + "compaction_period": "0s", + "compaction_threshold": 99999, + "image_creation_threshold": 99999, + } ) + timeline = env.initial_timeline + endpoint = env.endpoints.create_start("main", config_lines=["log_statement=all"]) + pg_conn = endpoint.connect() cur = pg_conn.cursor() @@ -54,7 +58,7 @@ def test_layer_bloating(neon_simple_env: NeonEnv, vanilla_pg): # Wait logical replication to sync logical_replication_sync(vanilla_pg, endpoint) wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, timeline) - time.sleep(10) + env.pageserver.http_client().timeline_checkpoint(env.initial_tenant, timeline, compact=False) # Check layer file sizes timeline_path = f"{env.pageserver.workdir}/tenants/{env.initial_tenant}/timelines/{timeline}/" @@ -63,3 +67,5 @@ def test_layer_bloating(neon_simple_env: NeonEnv, vanilla_pg): if filename.startswith("00000"): log.info(f"layer {filename} size is {os.path.getsize(timeline_path + filename)}") assert os.path.getsize(timeline_path + filename) < 512_000_000 + + env.stop(immediate=True) diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index 54d3b2d515..3b2218dd9b 100644 --- 
a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -39,9 +39,6 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_configs() env.start() - env.pageserver.allowed_errors.extend( - [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] - ) ps_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_lfc_resize.py b/test_runner/regress/test_lfc_resize.py index 2a3442448a..1b2c7f808f 100644 --- a/test_runner/regress/test_lfc_resize.py +++ b/test_runner/regress/test_lfc_resize.py @@ -1,3 +1,7 @@ +import os +import random +import re +import subprocess import threading import time @@ -17,17 +21,17 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): "test_lfc_resize", config_lines=[ "neon.file_cache_path='file.cache'", - "neon.max_file_cache_size=1GB", - "neon.file_cache_size_limit=1GB", + "neon.max_file_cache_size=512MB", + "neon.file_cache_size_limit=512MB", ], ) n_resize = 10 - scale = 10 + scale = 100 def run_pgbench(connstr: str): log.info(f"Start a pgbench workload on pg {connstr}") pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr]) - pg_bin.run_capture(["pgbench", "-c4", f"-T{n_resize}", "-Mprepared", connstr]) + pg_bin.run_capture(["pgbench", "-c10", f"-T{n_resize}", "-Mprepared", "-S", connstr]) thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True) thread.start() @@ -35,9 +39,21 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): conn = endpoint.connect() cur = conn.cursor() - for i in range(n_resize): - cur.execute(f"alter system set neon.file_cache_size_limit='{i*10}MB'") + for _ in range(n_resize): + size = random.randint(1, 512) + cur.execute(f"alter system set neon.file_cache_size_limit='{size}MB'") cur.execute("select pg_reload_conf()") time.sleep(1) + cur.execute("alter system set neon.file_cache_size_limit='100MB'") + cur.execute("select pg_reload_conf()") + 
thread.join() + + lfc_file_path = f"{endpoint.pg_data_dir_path()}/file.cache" + lfc_file_size = os.path.getsize(lfc_file_path) + res = subprocess.run(["ls", "-sk", lfc_file_path], check=True, text=True, capture_output=True) + lfc_file_blocks = re.findall("([0-9A-F]+)", res.stdout)[0] + log.info(f"Size of LFC file {lfc_file_size}, blocks {lfc_file_blocks}") + assert lfc_file_size <= 512 * 1024 * 1024 + assert int(lfc_file_blocks) <= 128 * 1024 diff --git a/test_runner/regress/test_lfc_working_set_approximation.py b/test_runner/regress/test_lfc_working_set_approximation.py index a6f05fe0f7..4c53e4e2fd 100644 --- a/test_runner/regress/test_lfc_working_set_approximation.py +++ b/test_runner/regress/test_lfc_working_set_approximation.py @@ -1,3 +1,4 @@ +import time from pathlib import Path from fixtures.log_helper import log @@ -72,3 +73,46 @@ WITH (fillfactor='100'); blocks = query_scalar(cur, "select approximate_working_set_size(true)") log.info(f"working set size after some index access of a few select pages only {blocks}") assert blocks < 10 + + +def test_sliding_working_set_approximation(neon_simple_env: NeonEnv): + env = neon_simple_env + + endpoint = env.endpoints.create_start( + branch_name="main", + config_lines=[ + "autovacuum = off", + "shared_buffers=1MB", + "neon.max_file_cache_size=256MB", + "neon.file_cache_size_limit=245MB", + ], + ) + conn = endpoint.connect() + cur = conn.cursor() + cur.execute("create extension neon") + cur.execute( + "create table t(pk integer primary key, count integer default 0, payload text default repeat('?', 128))" + ) + cur.execute("insert into t (pk) values (generate_series(1,1000000))") + time.sleep(2) + before_10k = time.monotonic() + cur.execute("select sum(count) from t where pk between 10000 and 20000") + time.sleep(2) + before_1k = time.monotonic() + cur.execute("select sum(count) from t where pk between 1000 and 2000") + after = time.monotonic() + + cur.execute(f"select approximate_working_set_size_seconds({int(after - 
before_1k + 1)})") + estimation_1k = cur.fetchall()[0][0] + log.info(f"Working set size for selecting 1k records {estimation_1k}") + + cur.execute(f"select approximate_working_set_size_seconds({int(after - before_10k + 1)})") + estimation_10k = cur.fetchall()[0][0] + log.info(f"Working set size for selecting 10k records {estimation_10k}") + + cur.execute("select pg_table_size('t')") + size = cur.fetchall()[0][0] // 8192 + log.info(f"Table size {size} blocks") + + assert estimation_1k >= 20 and estimation_1k <= 40 + assert estimation_10k >= 200 and estimation_10k <= 400 diff --git a/test_runner/regress/test_local_file_cache.py b/test_runner/regress/test_local_file_cache.py index 76c6581448..3c404c3b23 100644 --- a/test_runner/regress/test_local_file_cache.py +++ b/test_runner/regress/test_local_file_cache.py @@ -1,4 +1,5 @@ import os +import queue import random import threading import time @@ -8,11 +9,7 @@ from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder from fixtures.utils import query_scalar -def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder, build_type: str): - if build_type == "debug": - # Disable vectored read path cross validation since it makes the test time out. 
- neon_env_builder.pageserver_config_override = "validate_vectored_get=false" - +def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() cache_dir = os.path.join(env.repo_dir, "file_cache") @@ -33,11 +30,10 @@ def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder, build_type: s cur = endpoint.connect().cursor() + stop = threading.Event() n_rows = 100000 n_threads = 20 - n_updates_per_thread = 10000 n_updates_per_connection = 1000 - n_total_updates = n_threads * n_updates_per_thread cur.execute("CREATE TABLE lfctest (id int4 PRIMARY KEY, n int) WITH (fillfactor=10)") cur.execute(f"INSERT INTO lfctest SELECT g, 1 FROM generate_series(1, {n_rows}) g") @@ -48,11 +44,11 @@ def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder, build_type: s # performed (plus the initial 1 on each row). # # Furthermore, each thread will reconnect between every 1000 updates. - def run_updates(): + def run_updates(n_updates_performed_q: queue.Queue[int]): n_updates_performed = 0 conn = endpoint.connect() cur = conn.cursor() - for _ in range(n_updates_per_thread): + while not stop.is_set(): id = random.randint(1, n_rows) cur.execute(f"UPDATE lfctest SET n = n + 1 WHERE id = {id}") n_updates_performed += 1 @@ -61,19 +57,28 @@ def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder, build_type: s conn.close() conn = endpoint.connect() cur = conn.cursor() + n_updates_performed_q.put(n_updates_performed) + n_updates_performed_q: queue.Queue[int] = queue.Queue() threads: List[threading.Thread] = [] for _i in range(n_threads): - thread = threading.Thread(target=run_updates, args=(), daemon=True) + thread = threading.Thread(target=run_updates, args=(n_updates_performed_q,), daemon=True) thread.start() threads.append(thread) time.sleep(5) + # unlink, this is what we're actually testing new_cache_dir = os.path.join(env.repo_dir, "file_cache_new") os.rename(cache_dir, new_cache_dir) + time.sleep(10) + + stop.set() + + 
n_updates_performed = 0 for thread in threads: thread.join() + n_updates_performed += n_updates_performed_q.get() - assert query_scalar(cur, "SELECT SUM(n) FROM lfctest") == n_total_updates + n_rows + assert query_scalar(cur, "SELECT SUM(n) FROM lfctest") == n_rows + n_updates_performed diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index ca3c81d6e5..0d18aa43b7 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -10,10 +10,11 @@ from fixtures.neon_fixtures import ( AuxFileStore, NeonEnv, NeonEnvBuilder, + PgProtocol, logical_replication_sync, wait_for_last_flush_lsn, ) -from fixtures.utils import query_scalar, wait_until +from fixtures.utils import wait_until def random_string(n: int): @@ -248,6 +249,27 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of cur.execute( "SELECT * FROM pg_logical_slot_peek_binary_changes('slotty_mcslotface', NULL, NULL, 'include-xids', '0')" ) + # do the peek second time: we've had a bug using wrong memory context + # for NeonWALReader leading to the crash in this case. 
+ log.info("peek_changes again") + cur.execute( + "SELECT * FROM pg_logical_slot_peek_binary_changes('slotty_mcslotface', NULL, NULL, 'include-xids', '0')" + ) + cur.execute( + """ +INSERT INTO wal_generator (data) +SELECT repeat('A', 1024) -- Generates a kilobyte of data per row +FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of data +""" + ) + + endpoint.stop_and_destroy() + endpoint = env.endpoints.create_start("init") + with endpoint.connect().cursor() as cur: + log.info("advance slot") + cur.execute( + "SELECT * from pg_replication_slot_advance('slotty_mcslotface', pg_current_wal_lsn())" + ) # Tests that walsender correctly blocks until WAL is downloaded from safekeepers @@ -326,12 +348,17 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of assert "could not receive data from WAL stream" not in logs -# Test compute start at LSN page of which starts with contrecord -# https://github.com/neondatabase/neon/issues/5749 +# Test replication of WAL record spanning page boundary (with contrecord) after +# compute restart and WAL write of the page. +# +# See https://github.com/neondatabase/neon/issues/5749 +# +# Most pages start with a contrecord, so we don't do anything special +# to ensure that. @pytest.mark.parametrize( "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation] ) -def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg): +def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env env.neon_cli.create_branch("init") @@ -356,52 +383,6 @@ def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg): logical_replication_sync(vanilla_pg, endpoint) vanilla_pg.stop() - with endpoint.cursor() as cur: - # measure how much space logical message takes. Sometimes first attempt - # creates huge message and then it stabilizes, have no idea why. 
- for _ in range(3): - lsn_before = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - log.info(f"current_lsn={lsn_before}") - # Non-transactional logical message doesn't write WAL, only XLogInsert's - # it, so use transactional. Which is a bit problematic as transactional - # necessitates commit record. Alternatively we can do smth like - # select neon_xlogflush(pg_current_wal_insert_lsn()); - # but isn't much better + that particular call complains on 'xlog flush - # request 0/282C018 is not satisfied' as pg_current_wal_insert_lsn skips - # page headers. - payload = "blahblah" - cur.execute(f"select pg_logical_emit_message(true, 'pref', '{payload}')") - lsn_after_by_curr_wal_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - lsn_diff = lsn_after_by_curr_wal_lsn - lsn_before - logical_message_base = lsn_after_by_curr_wal_lsn - lsn_before - len(payload) - log.info( - f"before {lsn_before}, after {lsn_after_by_curr_wal_lsn}, lsn diff is {lsn_diff}, base {logical_message_base}" - ) - - # and write logical message spanning exactly as we want - lsn_before = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - log.info(f"current_lsn={lsn_before}") - curr_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - offs = int(curr_lsn) % 8192 - till_page = 8192 - offs - payload_len = ( - till_page - logical_message_base - 8 - ) # not sure why 8 is here, it is deduced from experiments - log.info(f"current_lsn={curr_lsn}, offs {offs}, till_page {till_page}") - - # payload_len above would go exactly till the page boundary; but we want contrecord, so make it slightly longer - payload_len += 8 - - cur.execute(f"select pg_logical_emit_message(true, 'pref', 'f{'a' * payload_len}')") - supposedly_contrecord_end = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - log.info(f"supposedly_page_boundary={supposedly_contrecord_end}") - # The calculations to hit the page boundary are very fuzzy, so just - # ignore test if we fail to reach it. 
- if not (int(supposedly_contrecord_end) % 8192 == 32): - pytest.skip("missed page boundary, bad luck") - - cur.execute("insert into replication_example values (2, 3)") - wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) endpoint.stop().start() @@ -560,3 +541,90 @@ def test_replication_shutdown(neon_simple_env: NeonEnv): assert [r[0] for r in res] == [10, 20, 30, 40] wait_until(10, 0.5, check_that_changes_propagated) + + +def logical_replication_wait_flush_lsn_sync(publisher: PgProtocol) -> Lsn: + """ + Wait for logical replication subscriber reported flush_lsn to reach + pg_current_wal_flush_lsn on publisher. Note that this is somewhat unreliable + because for some WAL records like vacuum subscriber won't get any data at + all. + """ + publisher_flush_lsn = Lsn(publisher.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + + def check_caughtup(): + res = publisher.safe_psql( + """ +select sent_lsn, flush_lsn, pg_current_wal_flush_lsn() from pg_stat_replication sr, pg_replication_slots s + where s.active_pid = sr.pid and s.slot_type = 'logical'; + """ + )[0] + sent_lsn, flush_lsn, curr_publisher_flush_lsn = Lsn(res[0]), Lsn(res[1]), Lsn(res[2]) + log.info( + f"sent_lsn={sent_lsn}, flush_lsn={flush_lsn}, publisher_flush_lsn={curr_publisher_flush_lsn}, waiting flush_lsn to reach {publisher_flush_lsn}" + ) + assert flush_lsn >= publisher_flush_lsn + + wait_until(30, 0.5, check_caughtup) + return publisher_flush_lsn + + +# Test that subscriber takes into account quorum committed flush_lsn in +# flush_lsn reporting to publisher. Without this, it may ack too far, losing +# data on restart because publisher advances START_REPLICATION position to the +# confirmed_flush_lsn of the slot. 
+def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg): + env = neon_simple_env + # use vanilla as publisher to allow writes on it when safekeeper is down + vanilla_pg.configure( + [ + "wal_level = 'logical'", + # neon fork uses custom WAL records which won't work without extension installed with obscure + # ERROR: resource manager with ID 134 not registered + # error. + "shared_preload_libraries = 'neon'", + ] + ) + vanilla_pg.start() + vanilla_pg.safe_psql("create extension neon;") + + env.neon_cli.create_branch("subscriber") + sub = env.endpoints.create("subscriber") + sub.start() + + with vanilla_pg.cursor() as pcur: + with sub.cursor() as scur: + pcur.execute("CREATE TABLE t (pk integer primary key, sk integer)") + pcur.execute("CREATE PUBLICATION pub FOR TABLE t") + scur.execute("CREATE TABLE t (pk integer primary key, sk integer)") + + pub_connstr = vanilla_pg.connstr().replace("'", "''") + log.info(f"pub connstr is {pub_connstr}, subscriber connstr {sub.connstr()}") + query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_connstr}' PUBLICATION pub with (synchronous_commit=off)" + scur.execute(query) + time.sleep(2) # let initial table sync complete + + # stop safekeeper so it won't get any data + for sk in env.safekeepers: + sk.stop() + # and insert to publisher + with vanilla_pg.cursor() as pcur: + for i in range(0, 1000): + pcur.execute("INSERT into t values (%s, random()*100000)", (i,)) + # wait until sub receives all data + logical_replication_sync(sub, vanilla_pg) + # Update confirmed_flush_lsn of the slot. If subscriber ack'ed recevied data + # as flushed we'll now lose it if subscriber restars. That's why + # logical_replication_wait_flush_lsn_sync is expected to hang while + # safekeeper is down. 
+ vanilla_pg.safe_psql("checkpoint;") + assert sub.safe_psql_scalar("SELECT count(*) FROM t") == 1000 + + # restart subscriber and ensure it can catch up lost tail again + sub.stop(mode="immediate") + for sk in env.safekeepers: + sk.start() + sub.start() + log.info("waiting for sync after restart") + logical_replication_wait_flush_lsn_sync(vanilla_pg) + assert sub.safe_psql_scalar("SELECT count(*) FROM t") == 1000 diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 263730a823..67e82f8d30 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -12,10 +12,24 @@ from fixtures.utils import query_scalar, wait_until from requests.exceptions import ReadTimeout -# -# Test pageserver get_lsn_by_timestamp API -# -def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): +def assert_lsn_lease_granted(result, with_lease: bool): + """ + Asserts an LSN lease is granted when `with_lease` flag is turned on. + Always asserts no LSN lease is granted when `with_lease` flag is off. + """ + if with_lease: + assert result.get("valid_until") + else: + assert result.get("valid_until") is None + + +@pytest.mark.parametrize("with_lease", [True, False]) +def test_lsn_mapping(neon_env_builder: NeonEnvBuilder, with_lease: bool): + """ + Test pageserver get_lsn_by_timestamp API. + + :param with_lease: Whether to get a lease associated with returned LSN. 
+ """ env = neon_env_builder.init_start() tenant_id, _ = env.neon_cli.create_tenant( @@ -67,23 +81,33 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Check edge cases # Timestamp is in the future probe_timestamp = tbl[-1][1] + timedelta(hours=1) - result = client.timeline_get_lsn_by_timestamp(tenant_id, timeline_id, probe_timestamp) + result = client.timeline_get_lsn_by_timestamp( + tenant_id, timeline_id, probe_timestamp, with_lease=with_lease + ) assert result["kind"] == "future" + assert_lsn_lease_granted(result, with_lease) # make sure that we return a well advanced lsn here assert Lsn(result["lsn"]) > start_lsn # Timestamp is in the unreachable past probe_timestamp = tbl[0][1] - timedelta(hours=10) - result = client.timeline_get_lsn_by_timestamp(tenant_id, timeline_id, probe_timestamp) + result = client.timeline_get_lsn_by_timestamp( + tenant_id, timeline_id, probe_timestamp, with_lease=with_lease + ) assert result["kind"] == "past" + assert_lsn_lease_granted(result, with_lease) + # make sure that we return the minimum lsn here at the start of the range assert Lsn(result["lsn"]) < start_lsn # Probe a bunch of timestamps in the valid range for i in range(1, len(tbl), 100): probe_timestamp = tbl[i][1] - result = client.timeline_get_lsn_by_timestamp(tenant_id, timeline_id, probe_timestamp) + result = client.timeline_get_lsn_by_timestamp( + tenant_id, timeline_id, probe_timestamp, with_lease=with_lease + ) assert result["kind"] not in ["past", "nodata"] + assert_lsn_lease_granted(result, with_lease) lsn = result["lsn"] # Call get_lsn_by_timestamp to get the LSN # Launch a new read-only node at that LSN, and check that only the rows @@ -105,8 +129,11 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Timestamp is in the unreachable past probe_timestamp = tbl[0][1] - timedelta(hours=10) - result = client.timeline_get_lsn_by_timestamp(tenant_id, timeline_id_child, probe_timestamp) + result = client.timeline_get_lsn_by_timestamp( + tenant_id, 
timeline_id_child, probe_timestamp, with_lease=with_lease + ) assert result["kind"] == "past" + assert_lsn_lease_granted(result, with_lease) # make sure that we return the minimum lsn here at the start of the range assert Lsn(result["lsn"]) >= last_flush_lsn diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py index 5637f160cf..bdc5ca907e 100644 --- a/test_runner/regress/test_migrations.py +++ b/test_runner/regress/test_migrations.py @@ -1,6 +1,10 @@ -import time +from __future__ import annotations -from fixtures.neon_fixtures import NeonEnv +import time +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv def test_migrations(neon_simple_env: NeonEnv): @@ -11,17 +15,14 @@ def test_migrations(neon_simple_env: NeonEnv): endpoint.respec(skip_pg_catalog_updates=False) endpoint.start() - endpoint.wait_for_migrations() - - num_migrations = 9 + num_migrations = 10 + endpoint.wait_for_migrations(num_migrations=num_migrations) with endpoint.cursor() as cur: cur.execute("SELECT id FROM neon_migration.migration_id") migration_id = cur.fetchall() assert migration_id[0][0] == num_migrations - endpoint.assert_log_contains(f"INFO handle_migrations: Ran {num_migrations} migrations") - endpoint.stop() endpoint.start() # We don't have a good way of knowing that the migrations code path finished executing @@ -31,5 +32,3 @@ def test_migrations(neon_simple_env: NeonEnv): cur.execute("SELECT id FROM neon_migration.migration_id") migration_id = cur.fetchall() assert migration_id[0][0] == num_migrations - - endpoint.assert_log_contains("INFO handle_migrations: Ran 0 migrations") diff --git a/test_runner/regress/test_neon_extension.py b/test_runner/regress/test_neon_extension.py index 39b4865026..bb844244e3 100644 --- a/test_runner/regress/test_neon_extension.py +++ b/test_runner/regress/test_neon_extension.py @@ -24,7 +24,7 @@ def test_neon_extension(neon_env_builder: NeonEnvBuilder): # IMPORTANT: # If 
the version has changed, the test should be updated. # Ensure that the default version is also updated in the neon.control file - assert cur.fetchone() == ("1.3",) + assert cur.fetchone() == ("1.4",) cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE") res = cur.fetchall() log.info(res) @@ -48,10 +48,10 @@ def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder): # IMPORTANT: # If the version has changed, the test should be updated. # Ensure that the default version is also updated in the neon.control file - assert cur.fetchone() == ("1.3",) + assert cur.fetchone() == ("1.4",) cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE") - all_versions = ["1.3", "1.2", "1.1", "1.0"] - current_version = "1.3" + all_versions = ["1.4", "1.3", "1.2", "1.1", "1.0"] + current_version = "1.4" for idx, begin_version in enumerate(all_versions): for target_version in all_versions[idx + 1 :]: if current_version != begin_version: diff --git a/test_runner/regress/test_next_xid.py b/test_runner/regress/test_next_xid.py index 98fb06a0d6..51e847135e 100644 --- a/test_runner/regress/test_next_xid.py +++ b/test_runner/regress/test_next_xid.py @@ -1,13 +1,15 @@ -import json import os import time from pathlib import Path -from fixtures.common_types import Lsn, TenantId, TimelineId +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_wal_insert_lsn -from fixtures.pageserver.utils import ( - wait_for_last_record_lsn, +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + PgBin, + VanillaPostgres, + import_timeline_from_vanilla_postgres, + wait_for_wal_insert_lsn, ) from fixtures.remote_storage import RemoteStorageKind from fixtures.utils import query_scalar @@ -76,7 +78,6 @@ def test_import_at_2bil( ): neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() - ps_http = env.pageserver.http_client() # Reset the 
vanilla Postgres instance to somewhat before 2 billion transactions. pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal") @@ -92,68 +93,28 @@ def test_import_at_2bil( assert vanilla_pg.safe_psql("select count(*) from tt") == [(300000,)] vanilla_pg.safe_psql("CREATE TABLE t (t text);") vanilla_pg.safe_psql("INSERT INTO t VALUES ('inserted in vanilla')") - - endpoint_id = "ep-import_from_vanilla" - tenant = TenantId.generate() - timeline = TimelineId.generate() - - env.pageserver.tenant_create(tenant) - - # Take basebackup - basebackup_dir = os.path.join(test_output_dir, "basebackup") - base_tar = os.path.join(basebackup_dir, "base.tar") - wal_tar = os.path.join(basebackup_dir, "pg_wal.tar") - os.mkdir(basebackup_dir) vanilla_pg.safe_psql("CHECKPOINT") - pg_bin.run( - [ - "pg_basebackup", - "-F", - "tar", - "-d", - vanilla_pg.connstr(), - "-D", - basebackup_dir, - ] + + tenant_id = TenantId.generate() + env.pageserver.tenant_create(tenant_id) + timeline_id = TimelineId.generate() + + # Import the cluster to Neon + import_timeline_from_vanilla_postgres( + test_output_dir, + env, + pg_bin, + tenant_id, + timeline_id, + "imported_2bil_xids", + vanilla_pg.connstr(), ) + vanilla_pg.stop() # don't need the original server anymore - # Get start_lsn and end_lsn - with open(os.path.join(basebackup_dir, "backup_manifest")) as f: - manifest = json.load(f) - start_lsn = manifest["WAL-Ranges"][0]["Start-LSN"] - end_lsn = manifest["WAL-Ranges"][0]["End-LSN"] - - def import_tar(base, wal): - env.neon_cli.raw_cli( - [ - "timeline", - "import", - "--tenant-id", - str(tenant), - "--timeline-id", - str(timeline), - "--node-name", - endpoint_id, - "--base-lsn", - start_lsn, - "--base-tarfile", - base, - "--end-lsn", - end_lsn, - "--wal-tarfile", - wal, - "--pg-version", - env.pg_version, - ] - ) - - # Importing correct backup works - import_tar(base_tar, wal_tar) - wait_for_last_record_lsn(ps_http, tenant, timeline, Lsn(end_lsn)) - + # Check that it works endpoint = 
env.endpoints.create_start( - endpoint_id, - tenant_id=tenant, + "imported_2bil_xids", + tenant_id=tenant_id, config_lines=[ "log_autovacuum_min_duration = 0", "autovacuum_naptime='5 s'", @@ -161,7 +122,6 @@ def test_import_at_2bil( ) assert endpoint.safe_psql("select count(*) from t") == [(1,)] - # Ok, consume conn = endpoint.connect() cur = conn.cursor() @@ -213,7 +173,7 @@ def test_import_at_2bil( cur.execute("checkpoint") # wait until pageserver receives that data - wait_for_wal_insert_lsn(env, endpoint, tenant, timeline) + wait_for_wal_insert_lsn(env, endpoint, tenant_id, timeline_id) # Restart endpoint endpoint.stop() @@ -223,3 +183,275 @@ def test_import_at_2bil( cur = conn.cursor() cur.execute("SELECT count(*) from t") assert cur.fetchone() == (10000 + 1 + 1,) + + +# Constants and macros copied from PostgreSQL multixact.c and headers. These are needed to +# calculate the SLRU segments that a particular multixid or multixid-offsets falls into. +BLCKSZ = 8192 +MULTIXACT_OFFSETS_PER_PAGE = int(BLCKSZ / 4) +SLRU_PAGES_PER_SEGMENT = int(32) +MXACT_MEMBER_BITS_PER_XACT = 8 +MXACT_MEMBER_FLAGS_PER_BYTE = 1 +MULTIXACT_FLAGBYTES_PER_GROUP = 4 +MULTIXACT_MEMBERS_PER_MEMBERGROUP = MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE +MULTIXACT_MEMBERGROUP_SIZE = 4 * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP +MULTIXACT_MEMBERGROUPS_PER_PAGE = int(BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE) +MULTIXACT_MEMBERS_PER_PAGE = MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP + + +def MultiXactIdToOffsetSegment(xid: int): + return int(xid / (SLRU_PAGES_PER_SEGMENT * MULTIXACT_OFFSETS_PER_PAGE)) + + +def MXOffsetToMemberSegment(off: int): + return int(off / (SLRU_PAGES_PER_SEGMENT * MULTIXACT_MEMBERS_PER_PAGE)) + + +def advance_multixid_to( + pg_bin: PgBin, vanilla_pg: VanillaPostgres, next_multi_xid: int, next_multi_offset: int +): + """ + Use pg_resetwal to advance the nextMulti and nextMultiOffset values in a stand-alone + Postgres 
cluster. This is useful to get close to wraparound or some other interesting + value, without having to burn a lot of time consuming the (multi-)XIDs one by one. + + The new values should be higher than the old ones, in a wraparound-aware sense. + + On entry, the server should be running. It will be shut down and restarted. + """ + + # Read old values from the last checkpoint. We will pass the old oldestMultiXid value + # back to pg_resetwal, there's no option to leave it alone. + with vanilla_pg.connect() as conn: + with conn.cursor() as cur: + # Make sure the oldest-multi-xid value in the control file is up-to-date + cur.execute("checkpoint") + cur.execute("select oldest_multi_xid, next_multixact_id from pg_control_checkpoint()") + rec = cur.fetchone() + assert rec is not None + (ckpt_oldest_multi_xid, ckpt_next_multi_xid) = rec + log.info(f"oldestMultiXid was {ckpt_oldest_multi_xid}, nextMultiXid was {ckpt_next_multi_xid}") + log.info(f"Resetting to {next_multi_xid}") + + # Use pg_resetwal to reset the next multiXid and multiOffset to given values. + vanilla_pg.stop() + pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal") + cmd = [ + pg_resetwal_path, + f"--multixact-ids={next_multi_xid},{ckpt_oldest_multi_xid}", + f"--multixact-offset={next_multi_offset}", + "-D", + str(vanilla_pg.pgdatadir), + ] + pg_bin.run_capture(cmd) + + # Because we skip over a lot of values, Postgres hasn't created the SLRU segments for + # the new values yet. Create them manually, to allow Postgres to start up. + # + # This leaves "gaps" in the SLRU where segments between old value and new value are + # missing. That's OK for our purposes. Autovacuum will print some warnings about the + # missing segments, but will clean it up by truncating the SLRUs up to the new value, + # closing the gap. 
+ segname = "%04X" % MultiXactIdToOffsetSegment(next_multi_xid) + log.info(f"Creating dummy segment pg_multixact/offsets/{segname}") + with open(vanilla_pg.pgdatadir / "pg_multixact" / "offsets" / segname, "w") as of: + of.write("\0" * SLRU_PAGES_PER_SEGMENT * BLCKSZ) + of.flush() + + segname = "%04X" % MXOffsetToMemberSegment(next_multi_offset) + log.info(f"Creating dummy segment pg_multixact/members/{segname}") + with open(vanilla_pg.pgdatadir / "pg_multixact" / "members" / segname, "w") as of: + of.write("\0" * SLRU_PAGES_PER_SEGMENT * BLCKSZ) + of.flush() + + # Start Postgres again and wait until autovacuum has processed all the databases + # + # This allows truncating the SLRUs, fixing the gaps with missing segments. + vanilla_pg.start() + with vanilla_pg.connect().cursor() as cur: + for _ in range(1000): + datminmxid = int( + query_scalar(cur, "select min(datminmxid::text::int8) from pg_database") + ) + log.info(f"datminmxid {datminmxid}") + if next_multi_xid - datminmxid < 1_000_000: # not wraparound-aware! + break + time.sleep(0.5) + + +def test_multixid_wraparound_import( + neon_env_builder: NeonEnvBuilder, + test_output_dir: Path, + pg_bin: PgBin, + vanilla_pg, +): + """ + Test that the wraparound of the "next-multi-xid" counter is handled correctly in + pageserver, And multi-offsets as well + """ + env = neon_env_builder.init_start() + + # In order to to test multixid wraparound, we need to first advance the counter to + # within spitting distance of the wraparound, that is 2^32 multi-XIDs. We could simply + # run a workload that consumes a lot of multi-XIDs until we approach that, but that + # takes a very long time. So we cheat. + # + # Our strategy is to create a vanilla Postgres cluster, and use pg_resetwal to + # directly set the multi-xid counter a higher value. However, we cannot directly set + # it to just before 2^32 (~ 4 billion), because that would make the exisitng + # 'relminmxid' values to look like they're in the future. 
It's not clear how the + # system would behave in that situation. So instead, we bump it up ~ 1 billion + # multi-XIDs at a time, and let autovacuum to process all the relations and update + # 'relminmxid' between each run. + # + # XXX: For the multi-offsets, most of the bump is done in the last call. This is + # because advancing it ~ 1 billion at a time hit a pathological case in the + # MultiXactMemberFreezeThreshold() function, causing autovacuum not trigger multixid + # freezing. See + # https://www.postgresql.org/message-id/85fb354c-f89f-4d47-b3a2-3cbd461c90a3%40iki.fi + # Multi-offsets don't have the same wraparound problems at 2 billion mark as + # multi-xids do, so one big jump is fine. + vanilla_pg.configure( + [ + "log_autovacuum_min_duration = 0", + # Perform anti-wraparound vacuuming aggressively + "autovacuum_naptime='1 s'", + "autovacuum_freeze_max_age = 1000000", + "autovacuum_multixact_freeze_max_age = 1000000", + ], + ) + vanilla_pg.start() + advance_multixid_to(pg_bin, vanilla_pg, 0x40000000, 0x10000000) + advance_multixid_to(pg_bin, vanilla_pg, 0x80000000, 0x20000000) + advance_multixid_to(pg_bin, vanilla_pg, 0xC0000000, 0x30000000) + advance_multixid_to(pg_bin, vanilla_pg, 0xFFFFFF00, 0xFFFFFF00) + + vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser") + vanilla_pg.safe_psql("create table tt as select g as id from generate_series(1, 10) g") + vanilla_pg.safe_psql("CHECKPOINT") + + # Import the cluster to the pageserver + tenant_id = TenantId.generate() + env.pageserver.tenant_create(tenant_id) + timeline_id = TimelineId.generate() + import_timeline_from_vanilla_postgres( + test_output_dir, + env, + pg_bin, + tenant_id, + timeline_id, + "imported_multixid_wraparound_test", + vanilla_pg.connstr(), + ) + vanilla_pg.stop() + + endpoint = env.endpoints.create_start( + "imported_multixid_wraparound_test", + tenant_id=tenant_id, + config_lines=[ + "log_autovacuum_min_duration = 0", + "autovacuum_naptime='5 s'", + 
"autovacuum=off", + ], + ) + conn = endpoint.connect() + cur = conn.cursor() + assert query_scalar(cur, "select count(*) from tt") == 10 # sanity check + + # Install extension containing function needed for test + cur.execute("CREATE EXTENSION neon_test_utils") + + # Consume a lot of XIDs, just to advance the XIDs to different range than the + # multi-xids. That avoids confusion while debugging + cur.execute("select test_consume_xids(100000)") + cur.execute("select pg_switch_wal()") + cur.execute("checkpoint") + + # Use subtransactions so that each row in 'tt' is stamped with different XID. Leave + # the transaction open. + cur.execute("BEGIN") + cur.execute( + """ +do $$ +declare + idvar int; +begin + for idvar in select id from tt loop + begin + update tt set id = idvar where id = idvar; + exception when others then + raise 'didn''t expect an error: %', sqlerrm; + end; + end loop; +end; +$$; +""" + ) + + # In a different transaction, acquire a FOR KEY SHARE lock on each row. This generates + # a new multixid for each row, with the previous xmax and this transaction's XID as the + # members. + # + # Repeat this until the multi-xid counter wraps around. + conn3 = endpoint.connect() + cur3 = conn3.cursor() + next_multixact_id_before_restart = 0 + observed_before_wraparound = False + while True: + cur3.execute("BEGIN") + cur3.execute("SELECT * FROM tt FOR KEY SHARE") + + # Get the xmax of one of the rows we locked. It should be a multi-xid. It might + # not be the latest one, but close enough. + row_xmax = int(query_scalar(cur3, "SELECT xmax FROM tt LIMIT 1")) + cur3.execute("COMMIT") + log.info(f"observed a row with xmax {row_xmax}") + + # High value means not wrapped around yet + if row_xmax >= 0xFFFFFF00: + observed_before_wraparound = True + continue + + # xmax should not be a regular XID. (We bumped up the regular XID range earlier + # to around 100000 and above.) 
+ assert row_xmax < 100 + + # xmax values < FirstNormalTransactionId (== 3) could be special XID values, or + # multixid values after wraparound. We don't know for sure which, so keep going to + # be sure we see value that's unambiguously a wrapped-around multixid + if row_xmax < 3: + continue + + next_multixact_id_before_restart = row_xmax + log.info( + f"next_multixact_id is now at {next_multixact_id_before_restart} or a little higher" + ) + break + + # We should have observed the state before wraparound + assert observed_before_wraparound + + cur.execute("COMMIT") + + # Wait until pageserver has received all the data, and restart the endpoint + wait_for_wal_insert_lsn(env, endpoint, tenant_id, timeline_id) + endpoint.stop(mode="immediate") # 'immediate' to avoid writing shutdown checkpoint + endpoint.start() + + # Check that the next-multixid value wrapped around correctly + conn = endpoint.connect() + cur = conn.cursor() + cur.execute("select next_multixact_id from pg_control_checkpoint()") + next_multixact_id_after_restart = int( + query_scalar(cur, "select next_multixact_id from pg_control_checkpoint()") + ) + log.info(f"next_multixact_id after restart: {next_multixact_id_after_restart}") + assert next_multixact_id_after_restart >= next_multixact_id_before_restart + + # The multi-offset should wrap around as well + cur.execute("select next_multi_offset from pg_control_checkpoint()") + next_multi_offset_after_restart = int( + query_scalar(cur, "select next_multi_offset from pg_control_checkpoint()") + ) + log.info(f"next_multi_offset after restart: {next_multi_offset_after_restart}") + assert next_multi_offset_after_restart < 100000 diff --git a/test_runner/regress/test_oid_overflow.py b/test_runner/regress/test_oid_overflow.py new file mode 100644 index 0000000000..e8eefc2414 --- /dev/null +++ b/test_runner/regress/test_oid_overflow.py @@ -0,0 +1,45 @@ +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder + + +def 
test_oid_overflow(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + + endpoint = env.endpoints.create_start("main") + + conn = endpoint.connect() + cur = conn.cursor() + + cur.execute("CREATE EXTENSION neon_test_utils") + + cur.execute("CREATE TABLE t1(x integer)") + cur.execute("INSERT INTO t1 values (1)") + cur.execute("CREATE TABLE t2(x integer)") + cur.execute("INSERT INTO t2 values (2)") + + cur.execute("SELECT x from t1") + assert cur.fetchone() == (1,) + cur.execute("SELECT x from t2") + assert cur.fetchone() == (2,) + + cur.execute("VACUUM FULL t1") + cur.execute("VACUUM FULL t1") + cur.execute("vacuum pg_class") + cur.execute("SELECT relfilenode FROM pg_class where relname='t1'") + oid = cur.fetchall()[0][0] + log.info(f"t1.relfilenode={oid}") + + cur.execute("set statement_timeout=0") + cur.execute(f"select test_consume_oids({oid-1})") + cur.execute("VACUUM FULL t2") + + cur.execute("SELECT relfilenode FROM pg_class where relname='t2'") + oid = cur.fetchall()[0][0] + log.info(f"t2.relfilenode={oid}") + + endpoint.clear_shared_buffers(cursor=cur) + + cur.execute("SELECT x from t1") + assert cur.fetchone() == (1,) + cur.execute("SELECT x from t2") + assert cur.fetchone() == (2,) diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 4a25dfd874..c8249bb2ce 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -764,7 +764,9 @@ def test_layer_download_timeouted(neon_env_builder: NeonEnvBuilder): """ Pause using a pausable_failpoint longer than the client timeout to simulate the timeout happening. 
""" - neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + # running this test is not reliable against REAL_S3, because operations can + # take longer than 1s we want to use as a timeout + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) assert isinstance(neon_env_builder.pageserver_remote_storage, S3Storage) neon_env_builder.pageserver_remote_storage.custom_timeout = "1s" diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index abbea59113..28dbf40bed 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -1,8 +1,5 @@ -import subprocess -from pathlib import Path from typing import Optional -import toml from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, @@ -13,67 +10,6 @@ from fixtures.pageserver.http import PageserverHttpClient from fixtures.utils import wait_until -def test_pageserver_init_node_id(neon_simple_env: NeonEnv, neon_binpath: Path): - """ - NB: The neon_local doesn't use `--init` mode anymore, but our production - deployment still does => https://github.com/neondatabase/aws/pull/1322 - """ - workdir = neon_simple_env.pageserver.workdir - pageserver_config = workdir / "pageserver.toml" - pageserver_bin = neon_binpath / "pageserver" - - def run_pageserver(args): - return subprocess.run( - [str(pageserver_bin), "-D", str(workdir), *args], - check=False, - universal_newlines=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - - neon_simple_env.pageserver.stop() - - with open(neon_simple_env.pageserver.config_toml_path, "r") as f: - ps_config = toml.load(f) - - required_config_keys = [ - "pg_distrib_dir", - "listen_pg_addr", - "listen_http_addr", - "pg_auth_type", - "http_auth_type", - # TODO: only needed for NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM in https://github.com/neondatabase/neon/pull/7748 - # "tenant_config", - ] 
- required_config_overrides = [ - f"--config-override={toml.dumps({k: ps_config[k]})}" for k in required_config_keys - ] - - pageserver_config.unlink() - - bad_init = run_pageserver(["--init", *required_config_overrides]) - assert ( - bad_init.returncode == 1 - ), "pageserver should not be able to init new config without the node id" - assert 'missing config value "id"' in bad_init.stderr - assert not pageserver_config.exists(), "config file should not be created after init error" - - good_init_cmd = [ - "--init", - f"--config-override=id={ps_config['id']}", - *required_config_overrides, - ] - completed_init = run_pageserver(good_init_cmd) - assert ( - completed_init.returncode == 0 - ), "pageserver should be able to create a new config with the node id given" - assert pageserver_config.exists(), "config file should be created successfully" - - bad_reinit = run_pageserver(good_init_cmd) - assert bad_reinit.returncode == 1, "pageserver refuses to init if already exists" - assert "config file already exists" in bad_reinit.stderr - - def check_client(env: NeonEnv, client: PageserverHttpClient): pg_version = env.pg_version initial_tenant = env.initial_tenant @@ -85,8 +21,10 @@ def check_client(env: NeonEnv, client: PageserverHttpClient): # create new tenant and check it is also there tenant_id = TenantId.generate() - client.tenant_create( - tenant_id, generation=env.storage_controller.attach_hook_issue(tenant_id, env.pageserver.id) + env.pageserver.tenant_create( + tenant_id, + generation=env.storage_controller.attach_hook_issue(tenant_id, env.pageserver.id), + auth_token=client.auth_token, ) assert tenant_id in {TenantId(t["id"]) for t in client.tenant_list()} diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 696af24e5c..73af7950f1 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -22,7 +22,6 @@ from fixtures.neon_fixtures 
import ( NeonEnv, NeonEnvBuilder, PgBin, - StorageScrubber, generate_uploads_and_deletions, ) from fixtures.pageserver.common_types import parse_layer_file_name @@ -215,12 +214,11 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): # Having written a mixture of generation-aware and legacy index_part.json, # ensure the scrubber handles the situation as expected. - metadata_summary = StorageScrubber(neon_env_builder).scan_metadata() + healthy, metadata_summary = env.storage_scrubber.scan_metadata() assert metadata_summary["tenant_count"] == 1 # Scrubber should have seen our timeline assert metadata_summary["timeline_count"] == 1 assert metadata_summary["timeline_shard_count"] == 1 - assert not metadata_summary["with_errors"] - assert not metadata_summary["with_warnings"] + assert healthy def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): @@ -249,10 +247,6 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): assert timeline["remote_consistent_lsn"] == timeline["remote_consistent_lsn_visible"] assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0 - main_pageserver.allowed_errors.extend( - [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] - ) - # Now advance the generation in the control plane: subsequent validations # from the running pageserver will fail. No more deletions should happen. env.storage_controller.attach_hook_issue(env.initial_tenant, other_pageserver.id) @@ -397,8 +391,6 @@ def test_deletion_queue_recovery( # validated before restart. assert get_deletion_queue_executed(ps_http) == before_restart_depth else: - main_pageserver.allowed_errors.extend([".*Dropping stale deletions.*"]) - # If we lost the attachment, we should have dropped our pre-restart deletions. 
assert get_deletion_queue_dropped(ps_http) == before_restart_depth @@ -553,13 +545,6 @@ def test_multi_attach( tenant_id = env.initial_tenant timeline_id = env.initial_timeline - # We will intentionally create situations where stale deletions happen from non-latest-generation - # nodes when the tenant is multiply-attached - for ps in env.pageservers: - ps.allowed_errors.extend( - [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] - ) - # Initially, the tenant will be attached to the first pageserver (first is default in our test harness) wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[0], tenant_id, "Active")) _detail = http_clients[0].timeline_detail(tenant_id, timeline_id) @@ -610,19 +595,26 @@ def test_multi_attach( for ps in pageservers: ps.stop() - # Returning to a normal healthy state: all pageservers will start, but only the one most - # recently attached via the control plane will re-attach on startup + # Returning to a normal healthy state: all pageservers will start for ps in pageservers: ps.start() - with pytest.raises(PageserverApiException): - _detail = http_clients[0].timeline_detail(tenant_id, timeline_id) - with pytest.raises(PageserverApiException): - _detail = http_clients[1].timeline_detail(tenant_id, timeline_id) - _detail = http_clients[2].timeline_detail(tenant_id, timeline_id) + # Pageservers are marked offline by the storage controller during the rolling restart + # above. This may trigger a reschedulling, so there's no guarantee that the tenant + # shard ends up attached to the most recent ps. 
+ raised = 0 + serving_ps_idx = None + for idx, http_client in enumerate(http_clients): + try: + _detail = http_client.timeline_detail(tenant_id, timeline_id) + serving_ps_idx = idx + except PageserverApiException: + raised += 1 + + assert raised == 2 and serving_ps_idx is not None # All data we wrote while multi-attached remains readable - workload.validate(pageservers[2].id) + workload.validate(pageservers[serving_ps_idx].id) def test_upgrade_generationless_local_file_paths( diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py index cea35a6acb..24a37b04ec 100644 --- a/test_runner/regress/test_pageserver_metric_collection.py +++ b/test_runner/regress/test_pageserver_metric_collection.py @@ -58,7 +58,6 @@ def test_metric_collection( metric_collection_interval="1s" metric_collection_endpoint="{metric_collection_endpoint}" metric_collection_bucket={remote_storage_to_toml_inline_table(neon_env_builder.pageserver_remote_storage)} - cached_metric_collection_interval="0s" synthetic_size_calculation_interval="3s" """ @@ -216,7 +215,6 @@ def test_metric_collection_cleans_up_tempfile( neon_env_builder.pageserver_config_override = f""" metric_collection_interval="1s" metric_collection_endpoint="{metric_collection_endpoint}" - cached_metric_collection_interval="0s" synthetic_size_calculation_interval="3s" """ diff --git a/test_runner/regress/test_pageserver_reconnect.py b/test_runner/regress/test_pageserver_reconnect.py index aecfcdd262..37ff923632 100644 --- a/test_runner/regress/test_pageserver_reconnect.py +++ b/test_runner/regress/test_pageserver_reconnect.py @@ -2,6 +2,7 @@ import threading import time from contextlib import closing +import psycopg2.errors from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, PgBin @@ -40,3 +41,26 @@ def test_pageserver_reconnect(neon_simple_env: NeonEnv, pg_bin: PgBin): c.execute("select pg_reload_conf()") thread.join() + + +# Test 
handling errors during page server reconnect +def test_pageserver_reconnect_failure(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_pageserver_reconnect") + endpoint = env.endpoints.create_start("test_pageserver_reconnect") + + con = endpoint.connect() + cur = con.cursor() + + cur.execute("set statement_timeout='2s'") + cur.execute("SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'") + connstring = cur.fetchall()[0][0] + cur.execute( + f"alter system set neon.pageserver_connstring='{connstring}?some_invalid_param=xyz'" + ) + cur.execute("select pg_reload_conf()") + try: + cur.execute("select count(*) from pg_class") + except psycopg2.errors.QueryCanceled: + log.info("Connection to PS failed") + assert not endpoint.log_contains("ERROR: cannot wait on socket event without a socket.*") diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index 4ce53df214..bbf82fea4c 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -13,7 +13,10 @@ from fixtures.utils import wait_until # running. def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.enable_scrub_on_exit() + + # We inject a delay of 15 seconds for tenant activation below. + # Hence, bump the max delay here to not skip over the activation. + neon_env_builder.pageserver_config_override = 'background_task_maximum_delay="20s"' env = neon_env_builder.init_start() @@ -70,7 +73,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): # pageserver does if a compute node connects and sends a request for the tenant # while it's still in Loading state. (It waits for the loading to finish, and then # processes the request.) 
- tenant_load_delay_ms = 5000 + tenant_load_delay_ms = 15000 env.pageserver.stop() env.pageserver.start( extra_env_vars={"FAILPOINTS": f"before-attaching-tenant=return({tenant_load_delay_ms})"} @@ -156,8 +159,9 @@ def test_pageserver_chaos( if build_type == "debug": pytest.skip("times out in debug builds") + # same rationale as with the immediate stop; we might leave orphan layers behind. + neon_env_builder.disable_scrub_on_exit() neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.enable_scrub_on_exit() if shard_count is not None: neon_env_builder.num_pageservers = shard_count @@ -218,3 +222,11 @@ def test_pageserver_chaos( # Check that all the updates are visible num_updates = endpoint.safe_psql("SELECT sum(updates) FROM foo")[0][0] assert num_updates == i * 100000 + + # currently pageserver cannot tolerate the fact that "s3" goes away, and if + # we succeeded in a compaction before shutdown, there might be a lot of + # uploads pending, certainly more than what we can ingest with MOCK_S3 + # + # so instead, do a fast shutdown for this one test. 
+ # See https://github.com/neondatabase/neon/issues/8709 + env.stop(immediate=True) diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 2782d33e15..8746b88a75 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -2,22 +2,23 @@ import json import os import random import time -from typing import Any, Dict, Optional +from pathlib import Path +from typing import Any, Dict, Optional, Union import pytest -from fixtures.common_types import TenantId, TimelineId +from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, StorageScrubber +from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.utils import ( assert_prefix_empty, - poll_for_remote_storage_iterations, - tenant_delete_wait_completed, wait_for_upload_queue_empty, ) from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage, s3_storage from fixtures.utils import wait_until from fixtures.workload import Workload +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response # A tenant configuration that is convenient for generating uploads and deletions # without a large amount of postgres traffic. @@ -61,7 +62,7 @@ def evict_random_layers( @pytest.mark.parametrize("seed", [1, 2, 3]) -def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): +def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver, seed: int): """ Issue many location configuration changes, ensure that tenants remain readable & we don't get any unexpected errors. 
We should @@ -75,6 +76,20 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): neon_env_builder.enable_pageserver_remote_storage( remote_storage_kind=s3_storage(), ) + neon_env_builder.control_plane_compute_hook_api = ( + f"http://{make_httpserver.host}:{make_httpserver.port}/notify-attach" + ) + + def ignore_notify(request: Request): + # This test does all its own compute configuration (by passing explicit pageserver ID to Workload functions), + # so we send controller notifications to /dev/null to prevent it fighting the test for control of the compute. + log.info(f"Ignoring storage controller compute notification: {request.json}") + return Response(status=200) + + make_httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler( + ignore_notify + ) + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) pageservers = env.pageservers @@ -85,9 +100,6 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): for ps in env.pageservers: ps.allowed_errors.extend( [ - # We will make no effort to avoid stale attachments - ".*Dropped remote consistent LSN updates.*", - ".*Dropping stale deletions.*", # page_service_conn_main{peer_addr=[::1]:41176}: query handler for 'pagestream 3b19aec5038c796f64b430b30a555121 d07776761d44050b8aab511df1657d83' failed: Tenant 3b19aec5038c796f64b430b30a555121 not found ".*query handler.*Tenant.*not found.*", # page_service_conn_main{peer_addr=[::1]:45552}: query handler for 'pagestream 414ede7ad50f775a8e7d9ba0e43b9efc a43884be16f44b3626482b6981b2c745' failed: Tenant 414ede7ad50f775a8e7d9ba0e43b9efc is not active @@ -104,6 +116,20 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): workload.init(env.pageservers[0].id) workload.write_rows(256, env.pageservers[0].id) + # Discourage the storage controller from interfering with the changes we will make directly on the pageserver + env.storage_controller.tenant_policy_update( + tenant_id, + { + 
"scheduling": "Stop", + }, + ) + env.storage_controller.allowed_errors.extend( + [ + ".*Scheduling is disabled by policy Stop.*", + ".*Skipping reconcile for policy Stop.*", + ] + ) + # We use a fixed seed to make the test reproducible: we want a randomly # chosen order, but not to change the order every time we run the test. rng = random.Random(seed) @@ -214,7 +240,7 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): # Having done a bunch of attach/detach cycles, we will have generated some index garbage: check # that the scrubber sees it and cleans it up. We do this before the final attach+validate pass, # to also validate that the scrubber isn't breaking anything. - gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=1) + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1) assert gc_summary["remote_storage_errors"] == 0 assert gc_summary["indices_deleted"] > 0 @@ -363,8 +389,10 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): # Check that deletion works properly on a tenant that was live-migrated # (reproduce https://github.com/neondatabase/neon/issues/6802) - iterations = poll_for_remote_storage_iterations(remote_storage_kind) - tenant_delete_wait_completed(pageserver_b.http_client(), tenant_id, iterations) + pageserver_b.http_client().tenant_delete(tenant_id) + + # We deleted our only tenant, and the scrubber fails if it detects nothing + neon_env_builder.disable_scrub_on_exit() def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): @@ -410,6 +438,35 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): validate_heatmap(heatmap_second) +def list_elegible_layers( + pageserver, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId +) -> list[Path]: + """ + The subset of layer filenames that are elegible for secondary download: at time of writing this + is all resident layers which are also visible. 
+ """ + candidates = pageserver.list_layers(tenant_id, timeline_id) + + layer_map = pageserver.http_client().layer_map_info(tenant_id, timeline_id) + + # Map of layer filenames to their visibility the "layer name" is not the same as the filename: add suffix to resolve one to the other + visible_map = dict( + (f"{layer.layer_file_name}-v1-00000001", layer.visible) + for layer in layer_map.historic_layers + ) + + def is_visible(layer_file_name): + try: + return visible_map[str(layer_file_name)] + except KeyError: + # Unexpected: tests should call this when pageservers are in a quiet state such that the layer map + # matches what's on disk. + log.warn(f"Lookup {layer_file_name} from {list(visible_map.keys())}") + raise + + return list(c for c in candidates if is_visible(c)) + + def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): """ Test the overall data flow in secondary mode: @@ -464,7 +521,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ps_secondary.http_client().tenant_secondary_download(tenant_id) - assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers( + assert list_elegible_layers(ps_attached, tenant_id, timeline_id) == ps_secondary.list_layers( tenant_id, timeline_id ) @@ -482,9 +539,9 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ps_secondary.http_client().tenant_secondary_download(tenant_id) try: - assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers( - tenant_id, timeline_id - ) + assert list_elegible_layers( + ps_attached, tenant_id, timeline_id + ) == ps_secondary.list_layers(tenant_id, timeline_id) except: # Do a full listing of the secondary location on errors, to help debug of # https://github.com/neondatabase/neon/issues/6966 @@ -505,8 +562,8 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): # ================================================================== try: log.info("Evicting a layer...") - layer_to_evict = 
ps_attached.list_layers(tenant_id, timeline_id)[0] - some_other_layer = ps_attached.list_layers(tenant_id, timeline_id)[1] + layer_to_evict = list_elegible_layers(ps_attached, tenant_id, timeline_id)[0] + some_other_layer = list_elegible_layers(ps_attached, tenant_id, timeline_id)[1] log.info(f"Victim layer: {layer_to_evict.name}") ps_attached.http_client().evict_layer( tenant_id, timeline_id, layer_name=layer_to_evict.name @@ -524,9 +581,9 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ps_secondary.http_client().tenant_secondary_download(tenant_id) assert layer_to_evict not in ps_attached.list_layers(tenant_id, timeline_id) - assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers( - tenant_id, timeline_id - ) + assert list_elegible_layers( + ps_attached, tenant_id, timeline_id + ) == ps_secondary.list_layers(tenant_id, timeline_id) except: # On assertion failures, log some details to help with debugging heatmap = env.pageserver_remote_storage.heatmap_content(tenant_id) @@ -536,7 +593,8 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): # Scrub the remote storage # ======================== # This confirms that the scrubber isn't upset by the presence of the heatmap - StorageScrubber(neon_env_builder).scan_metadata() + healthy, _ = env.storage_scrubber.scan_metadata() + assert healthy # Detach secondary and delete tenant # =================================== @@ -552,7 +610,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ) log.info("Deleting tenant...") - tenant_delete_wait_completed(ps_attached.http_client(), tenant_id, 10) + ps_attached.http_client().tenant_delete(tenant_id) assert_prefix_empty( neon_env_builder.pageserver_remote_storage, @@ -565,6 +623,9 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ) workload.stop() + # We deleted our only tenant, and the scrubber fails if it detects nothing + neon_env_builder.disable_scrub_on_exit() + def 
test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): """ diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 885a94a557..45ce5b1c5b 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -8,8 +8,11 @@ from typing import TYPE_CHECKING, cast import pytest from fixtures.neon_fixtures import ( + Endpoint, + NeonEnv, NeonEnvBuilder, check_restored_datadir_content, + tenant_get_shards, ) from fixtures.pg_version import PgVersion from fixtures.remote_storage import s3_storage @@ -21,13 +24,104 @@ if TYPE_CHECKING: from pytest import CaptureFixture +TENANT_CONF = { + # Scaled down thresholds so that we are exercising the pageserver beyond just writing + # ephemeral/L0 layers, and because debug-mode code is slow to read from full sized ephemeral layer files. + "pitr_interval": "60s", + "checkpoint_distance": f"{8 * 1024 * 1024}", + "compaction_target_size": f"{8 * 1024 * 1024}", +} + +# # Ensure that compaction works, on a timeline containing all the diversity that postgres regression tests create. +# # There should have been compactions mid-test as well, this final check is in addition those. +# for (shard, pageserver) in tenant_get_shards(env, env.initial_tenant): +# pageserver.http_client().timeline_checkpoint(env.initial_tenant, env.initial_timeline, force_repartition=True, force_image_layer_creation=True) + + +def post_checks(env: NeonEnv, test_output_dir: Path, db_name: str, endpoint: Endpoint): + """ + After running some opaque tests that create interesting content in a timeline, run + some generic integrity checks that the storage stack is able to reproduce the written + data properly. + """ + + ignored_files: Optional[list[str]] = None + + # Neon handles unlogged relations in a special manner. During a + # basebackup, we ship the init fork as the main fork. 
This presents a + # problem in that the endpoint's data directory and the basebackup will + # have differences and will fail the eventual file comparison. + # + # Unlogged tables were introduced in version 9.1. ALTER TABLE grew + # support for setting the persistence of a table in 9.5. The reason that + # this doesn't affect versions < 15 (but probably would between 9.1 and + # 9.5) is that all the regression tests that deal with unlogged tables + # up until that point dropped the unlogged tables or set them to logged + # at some point during the test. + # + # In version 15, Postgres grew support for unlogged sequences, and with + # that came a few more regression tests. These tests did not all drop + # the unlogged tables/sequences prior to finishing. + # + # But unlogged sequences came with a bug in that, sequences didn't + # inherit the persistence of their "parent" tables if they had one. This + # was fixed and backported to 15, thus exacerbating our problem a bit. + # + # So what we can do is just ignore file differences between the data + # directory and basebackup for unlogged relations. + results = cast( + "list[tuple[str, str]]", + endpoint.safe_psql( + """ + SELECT + relkind, + pg_relation_filepath( + pg_filenode_relation(reltablespace, relfilenode) + ) AS unlogged_relation_paths + FROM pg_class + WHERE relpersistence = 'u' + """, + dbname=db_name, + ), + ) + + unlogged_relation_files: list[str] = [] + for r in results: + unlogged_relation_files.append(r[1]) + # This is related to the following Postgres commit: + # + # commit ccadf73163ca88bdaa74b8223d4dde05d17f550b + # Author: Heikki Linnakangas + # Date: 2023-08-23 09:21:31 -0500 + # + # Use the buffer cache when initializing an unlogged index. + # + # This patch was backpatched to 16. Without it, the LSN in the + # page header would be 0/0 in the data directory, which wouldn't + # match the LSN generated during the basebackup, thus creating + # a difference. 
+ if env.pg_version <= PgVersion.V15 and r[0] == "i": + unlogged_relation_files.append(f"{r[1]}_init") + + ignored_files = unlogged_relation_files + + check_restored_datadir_content(test_output_dir, env, endpoint, ignored_files=ignored_files) + + # Ensure that compaction works, on a timeline containing all the diversity that postgres regression tests create. + # There should have been compactions mid-test as well, this final check is in addition those. + for shard, pageserver in tenant_get_shards(env, env.initial_tenant): + pageserver.http_client().timeline_checkpoint( + shard, env.initial_timeline, force_repartition=True, force_image_layer_creation=True + ) + + # Run the main PostgreSQL regression tests, in src/test/regress. # +@pytest.mark.timeout(900) # Contains many sub-tests, is slow in debug builds @pytest.mark.parametrize("shard_count", [None, 4]) def test_pg_regress( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, - build_type: str, pg_bin: PgBin, capsys: CaptureFixture[str], base_dir: Path, @@ -43,16 +137,20 @@ def test_pg_regress( if shard_count is not None: neon_env_builder.num_pageservers = shard_count - if build_type == "debug": - # Disable vectored read path cross validation since it makes the test time out. - neon_env_builder.pageserver_config_override = "validate_vectored_get=false" - neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.enable_scrub_on_exit() - env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + env = neon_env_builder.init_start( + initial_tenant_conf=TENANT_CONF, + initial_tenant_shard_count=shard_count, + ) # Connect to postgres and create a database called "regression". - endpoint = env.endpoints.create_start("main") + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + # Enable the test mode, so that we don't need to patch the test cases. 
+ "neon.regress_test_mode = true", + ], + ) endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") # Create some local directories for pg_regress to run in. @@ -88,71 +186,12 @@ def test_pg_regress( with capsys.disabled(): pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) - ignored_files: Optional[list[str]] = None - - # Neon handles unlogged relations in a special manner. During a - # basebackup, we ship the init fork as the main fork. This presents a - # problem in that the endpoint's data directory and the basebackup will - # have differences and will fail the eventual file comparison. - # - # Unlogged tables were introduced in version 9.1. ALTER TABLE grew - # support for setting the persistence of a table in 9.5. The reason that - # this doesn't affect versions < 15 (but probably would between 9.1 and - # 9.5) is that all the regression tests that deal with unlogged tables - # up until that point dropped the unlogged tables or set them to logged - # at some point during the test. - # - # In version 15, Postgres grew support for unlogged sequences, and with - # that came a few more regression tests. These tests did not all drop - # the unlogged tables/sequences prior to finishing. - # - # But unlogged sequences came with a bug in that, sequences didn't - # inherit the persistence of their "parent" tables if they had one. This - # was fixed and backported to 15, thus exacerbating our problem a bit. - # - # So what we can do is just ignore file differences between the data - # directory and basebackup for unlogged relations. 
- results = cast( - "list[tuple[str, str]]", - endpoint.safe_psql( - """ - SELECT - relkind, - pg_relation_filepath( - pg_filenode_relation(reltablespace, relfilenode) - ) AS unlogged_relation_paths - FROM pg_class - WHERE relpersistence = 'u' - """, - dbname=DBNAME, - ), - ) - - unlogged_relation_files: list[str] = [] - for r in results: - unlogged_relation_files.append(r[1]) - # This is related to the following Postgres commit: - # - # commit ccadf73163ca88bdaa74b8223d4dde05d17f550b - # Author: Heikki Linnakangas - # Date: 2023-08-23 09:21:31 -0500 - # - # Use the buffer cache when initializing an unlogged index. - # - # This patch was backpatched to 16. Without it, the LSN in the - # page header would be 0/0 in the data directory, which wouldn't - # match the LSN generated during the basebackup, thus creating - # a difference. - if env.pg_version <= PgVersion.V15 and r[0] == "i": - unlogged_relation_files.append(f"{r[1]}_init") - - ignored_files = unlogged_relation_files - - check_restored_datadir_content(test_output_dir, env, endpoint, ignored_files=ignored_files) + post_checks(env, test_output_dir, DBNAME, endpoint) # Run the PostgreSQL "isolation" tests, in src/test/isolation. # +@pytest.mark.timeout(600) # Contains many sub-tests, is slow in debug builds @pytest.mark.parametrize("shard_count", [None, 4]) def test_isolation( neon_env_builder: NeonEnvBuilder, @@ -163,16 +202,26 @@ def test_isolation( pg_distrib_dir: Path, shard_count: Optional[int], ): + DBNAME = "isolation_regression" + if shard_count is not None: neon_env_builder.num_pageservers = shard_count neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.enable_scrub_on_exit() - env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + env = neon_env_builder.init_start( + initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count + ) # Connect to postgres and create a database called "regression". 
# isolation tests use prepared transactions, so enable them - endpoint = env.endpoints.create_start("main", config_lines=["max_prepared_transactions=100"]) - endpoint.safe_psql("CREATE DATABASE isolation_regression") + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + "max_prepared_transactions=100", + # Enable the test mode, so that we don't need to patch the test cases. + "neon.regress_test_mode = true", + ], + ) + endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") # Create some local directories for pg_isolation_regress to run in. runpath = test_output_dir / "regress" @@ -206,6 +255,9 @@ def test_isolation( with capsys.disabled(): pg_bin.run(pg_isolation_regress_command, env=env_vars, cwd=runpath) + # This fails with a mismatch on `pg_multixact/offsets/0000` + # post_checks(env, test_output_dir, DBNAME, endpoint) + # Run extra Neon-specific pg_regress-based tests. The tests and their # schedule file are in the sql_regress/ directory. @@ -219,15 +271,24 @@ def test_sql_regress( pg_distrib_dir: Path, shard_count: Optional[int], ): + DBNAME = "regression" + if shard_count is not None: neon_env_builder.num_pageservers = shard_count neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.enable_scrub_on_exit() - env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + env = neon_env_builder.init_start( + initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count + ) # Connect to postgres and create a database called "regression". - endpoint = env.endpoints.create_start("main") - endpoint.safe_psql("CREATE DATABASE regression") + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + # Enable the test mode, so that we don't need to patch the test cases. + "neon.regress_test_mode = true", + ], + ) + endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") # Create some local directories for pg_regress to run in. 
runpath = test_output_dir / "regress" @@ -262,4 +323,4 @@ def test_sql_regress( with capsys.disabled(): pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) - check_restored_datadir_content(test_output_dir, env, endpoint) + post_checks(env, test_output_dir, DBNAME, endpoint) diff --git a/test_runner/regress/test_physical_replication.py b/test_runner/regress/test_physical_replication.py index 034f2b669d..043aff686b 100644 --- a/test_runner/regress/test_physical_replication.py +++ b/test_runner/regress/test_physical_replication.py @@ -1,12 +1,15 @@ +from __future__ import annotations + import random import time +from typing import TYPE_CHECKING -from fixtures.neon_fixtures import NeonEnv +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv def test_physical_replication(neon_simple_env: NeonEnv): env = neon_simple_env - n_records = 100000 with env.endpoints.create_start( branch_name="main", endpoint_id="primary", @@ -22,8 +25,20 @@ def test_physical_replication(neon_simple_env: NeonEnv): with p_con.cursor() as p_cur: with secondary.connect() as s_con: with s_con.cursor() as s_cur: - for pk in range(n_records): + runtime_secs = 30 + started_at = time.time() + pk = 0 + while True: + pk += 1 + now = time.time() + if now - started_at > runtime_secs: + break p_cur.execute("insert into t (pk) values (%s)", (pk,)) + # an earlier version of this test was based on a fixed number of loop iterations + # and selected for pk=(random.randrange(1, fixed number of loop iterations)). + # => the probability of selection for a value that was never inserted changed from 99.9999% to 0% over the course of the test. + # + # We changed the test to where=(random.randrange(1, 2*pk)), which means the probability is now fixed to 50%. 
s_cur.execute( - "select * from t where pk=%s", (random.randrange(1, n_records),) + "select * from t where pk=%s", (random.randrange(1, 2 * pk),) ) diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index f446f4f200..d2b8c2ed8b 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -2,6 +2,7 @@ import asyncio import json import subprocess import time +import urllib.parse from typing import Any, List, Optional, Tuple import psycopg2 @@ -275,6 +276,31 @@ def test_sql_over_http(static_proxy: NeonProxy): assert res["rowCount"] is None +def test_sql_over_http_db_name_with_space(static_proxy: NeonProxy): + db = "db with spaces" + static_proxy.safe_psql_many( + ( + f'create database "{db}"', + "create role http with login password 'http' superuser", + ) + ) + + def q(sql: str, params: Optional[List[Any]] = None) -> Any: + params = params or [] + connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/{urllib.parse.quote(db)}" + response = requests.post( + f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql", + data=json.dumps({"query": sql, "params": params}), + headers={"Content-Type": "application/sql", "Neon-Connection-String": connstr}, + verify=str(static_proxy.test_output_dir / "proxy.crt"), + ) + assert response.status_code == 200, response.text + return response.json() + + rows = q("select 42 as answer")["rows"] + assert rows == [{"answer": 42}] + + def test_sql_over_http_output_options(static_proxy: NeonProxy): static_proxy.safe_psql("create role http2 with login password 'http2' superuser") diff --git a/test_runner/regress/test_read_trace.py b/test_runner/regress/test_read_trace.py deleted file mode 100644 index cc5853b727..0000000000 --- a/test_runner/regress/test_read_trace.py +++ /dev/null @@ -1,39 +0,0 @@ -from contextlib import closing - -from fixtures.common_types import Lsn -from fixtures.neon_fixtures import NeonEnvBuilder -from 
fixtures.pageserver.utils import wait_for_last_record_lsn -from fixtures.utils import query_scalar - - -# This test demonstrates how to collect a read trace. It's useful until -# it gets replaced by a test that actually does stuff with the trace. -# -# Additionally, tests that pageserver is able to create tenants with custom configs. -def test_read_request_tracing(neon_env_builder: NeonEnvBuilder): - neon_env_builder.num_safekeepers = 1 - env = neon_env_builder.init_start( - initial_tenant_conf={ - "trace_read_requests": "true", - } - ) - - tenant_id = env.initial_tenant - timeline_id = env.initial_timeline - endpoint = env.endpoints.create_start("main") - - with closing(endpoint.connect()) as conn: - with conn.cursor() as cur: - cur.execute("create table t (i integer);") - cur.execute(f"insert into t values (generate_series(1,{10000}));") - cur.execute("select count(*) from t;") - current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) - # wait until pageserver receives that data - pageserver_http = env.pageserver.http_client() - wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) - - # Stop postgres so we drop the connection and flush the traces - endpoint.stop() - - trace_path = env.pageserver.workdir / "traces" / str(tenant_id) / str(timeline_id) - assert trace_path.exists() diff --git a/test_runner/regress/test_read_validation.py b/test_runner/regress/test_read_validation.py index 2437c8f806..d128c60a99 100644 --- a/test_runner/regress/test_read_validation.py +++ b/test_runner/regress/test_read_validation.py @@ -61,7 +61,7 @@ def test_read_validation(neon_simple_env: NeonEnv): log.info("Clear buffer cache to ensure no stale pages are brought into the cache") - c.execute("select clear_buffer_cache()") + endpoint.clear_shared_buffers(cursor=c) cache_entries = query_scalar( c, f"select count(*) from pg_buffercache where relfilenode = {relfilenode}" diff --git a/test_runner/regress/test_remote_storage.py 
b/test_runner/regress/test_remote_storage.py index 7f79bf5d5c..2e5260ca78 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -12,7 +12,6 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, wait_for_last_flush_lsn, ) -from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.pageserver.utils import ( timeline_delete_wait_completed, @@ -164,13 +163,14 @@ def test_remote_storage_backup_and_restore( "data": {"reason": "storage-sync-list-remote-timelines"}, } + # Even though the tenant is broken, subsequent calls to location_conf API will succeed, but + # the tenant will always end up in a broken state as a result of the failpoint. # Ensure that even though the tenant is broken, retrying the attachment fails - with pytest.raises(Exception, match="Tenant state is Broken"): - # Use same generation as in previous attempt - gen_state = env.storage_controller.inspect(tenant_id) - assert gen_state is not None - generation = gen_state[0] - env.pageserver.tenant_attach(tenant_id, generation=generation) + tenant_info = wait_until_tenant_state(pageserver_http, tenant_id, "Broken", 15) + gen_state = env.storage_controller.inspect(tenant_id) + assert gen_state is not None + generation = gen_state[0] + env.pageserver.tenant_attach(tenant_id, generation=generation) # Restart again, this implicitly clears the failpoint. # test_remote_failures=1 remains active, though, as it's in the pageserver config. 
@@ -312,6 +312,7 @@ def test_remote_storage_upload_queue_retries( def churn_while_failpoints_active(result): overwrite_data_and_wait_for_it_to_arrive_at_pageserver("c") + # this call will wait for the failpoints to be turned off client.timeline_checkpoint(tenant_id, timeline_id) client.timeline_compact(tenant_id, timeline_id) overwrite_data_and_wait_for_it_to_arrive_at_pageserver("d") @@ -331,8 +332,8 @@ def test_remote_storage_upload_queue_retries( # Exponential back-off in upload queue, so, gracious timeouts. wait_until(30, 1, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0)) - wait_until(30, 1, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 2)) - wait_until(30, 1, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="delete"), 0)) + wait_until(30, 1, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 1)) + wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0)) # unblock churn operations configure_storage_sync_failpoints("off") @@ -354,13 +355,6 @@ def test_remote_storage_upload_queue_retries( env.pageserver.stop(immediate=True) env.endpoints.stop_all() - # We are about to forcibly drop local dirs. Storage controller will increment generation in re-attach before - # we later increment when actually attaching it again, leading to skipping a generation and potentially getting - # these warnings if there was a durable but un-executed deletion list at time of restart. - env.pageserver.allowed_errors.extend( - [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] - ) - dir_to_clear = env.pageserver.tenant_dir() shutil.rmtree(dir_to_clear) os.mkdir(dir_to_clear) @@ -583,7 +577,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( > 0 ) - wait_until(20, 0.1, assert_compacted_and_uploads_queued) + wait_until(200, 0.1, assert_compacted_and_uploads_queued) # Regardless, give checkpoint some time to block for good. 
# Not strictly necessary, but might help uncover failure modes in the future. @@ -625,7 +619,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( ) # timeline deletion should be unblocking checkpoint ops - checkpoint_thread.join(2.0) + checkpoint_thread.join(20.0) assert not checkpoint_thread.is_alive() # Just to be sure, unblock ongoing uploads. If the previous assert was incorrect, or the prometheus metric broken, @@ -775,11 +769,11 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv create_thread.join() -def test_compaction_waits_for_upload( +def test_paused_upload_stalls_checkpoint( neon_env_builder: NeonEnvBuilder, ): """ - This test forces a race between upload and compaction. + This test checks that checkpoints block on uploads to remote storage. """ neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) @@ -794,6 +788,10 @@ def test_compaction_waits_for_upload( } ) + env.pageserver.allowed_errors.append( + f".*PUT.* path=/v1/tenant/{env.initial_tenant}/timeline.* request was dropped before completing" + ) + tenant_id = env.initial_tenant timeline_id = env.initial_timeline @@ -814,76 +812,9 @@ def test_compaction_waits_for_upload( endpoint.safe_psql("CREATE TABLE foo AS SELECT x FROM generate_series(1, 10000) g(x)") wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) - client.timeline_checkpoint(tenant_id, timeline_id) - deltas_at_first = len(client.layer_map_info(tenant_id, timeline_id).delta_layers()) - assert ( - deltas_at_first == 2 - ), "are you fixing #5863? just add one more checkpoint after 'CREATE TABLE bar ...' statement." 
- - endpoint.safe_psql("CREATE TABLE bar AS SELECT x FROM generate_series(1, 10000) g(x)") - endpoint.safe_psql("UPDATE foo SET x = 0 WHERE x = 1") - wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) - - layers_before_last_checkpoint = client.layer_map_info(tenant_id, timeline_id).historic_by_name() - upload_stuck_layers = layers_before_last_checkpoint - layers_at_creation.historic_by_name() - - assert len(upload_stuck_layers) > 0 - - for name in upload_stuck_layers: - assert env.pageserver.layer_exists( - tenant_id, timeline_id, parse_layer_file_name(name) - ), "while uploads are stuck the layers should be present on disk" - - # now this will do the L0 => L1 compaction and want to remove - # upload_stuck_layers and the original initdb L0 - client.timeline_checkpoint(tenant_id, timeline_id) - - # as uploads are paused, the upload_stuck_layers should still be with us - for name in upload_stuck_layers: - assert env.pageserver.layer_exists( - tenant_id, timeline_id, parse_layer_file_name(name) - ), "uploads are stuck still over compaction" - - compacted_layers = client.layer_map_info(tenant_id, timeline_id).historic_by_name() - overlap = compacted_layers.intersection(upload_stuck_layers) - assert len(overlap) == 0, "none of the L0's should remain after L0 => L1 compaction" - assert ( - len(compacted_layers) == 1 - ), "there should be one L1 after L0 => L1 compaction (without #5863 being fixed)" - - def layer_deletes_completed(): - m = client.get_metric_value("pageserver_layer_completed_deletes_total") - if m is None: - return 0 - return int(m) - - # if initdb created an initial delta layer, it might already be gc'd - # because it was uploaded before the failpoint was enabled. however, the - # deletion is not guaranteed to be complete. 
- assert layer_deletes_completed() <= 1 - - client.configure_failpoints(("before-upload-layer-pausable", "off")) - - # Ensure that this actually terminates - wait_upload_queue_empty(client, tenant_id, timeline_id) - - def until_layer_deletes_completed(): - deletes = layer_deletes_completed() - log.info(f"layer_deletes: {deletes}") - # ensure that initdb delta layer AND the previously stuck are now deleted - assert deletes >= len(upload_stuck_layers) + 1 - - wait_until(10, 1, until_layer_deletes_completed) - - for name in upload_stuck_layers: - assert not env.pageserver.layer_exists( - tenant_id, timeline_id, parse_layer_file_name(name) - ), "l0 should now be removed because of L0 => L1 compaction and completed uploads" - - # We should not have hit the error handling path in uploads where a uploaded file is gone - assert not env.pageserver.log_contains( - "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more." - ) + with pytest.raises(ReadTimeout): + client.timeline_checkpoint(tenant_id, timeline_id, timeout=5) + client.configure_failpoints(("before-upload-layer-pausable", "off")) def wait_upload_queue_empty( diff --git a/test_runner/regress/test_replica_start.py b/test_runner/regress/test_replica_start.py new file mode 100644 index 0000000000..0d95109d6b --- /dev/null +++ b/test_runner/regress/test_replica_start.py @@ -0,0 +1,690 @@ +""" +In PostgreSQL, a standby always has to wait for a running-xacts WAL record to +arrive before it can start accepting queries. Furthermore, if there are +transactions with too many subxids (> 64) open to fit in the in-memory subxids +cache, the running-xacts record will be marked as "suboverflowed", and the +standby will need to also wait for the currently in-progress transactions to +finish. 
+ +In Neon, we have an additional mechanism that scans the CLOG at server startup +to determine the list of running transactions, so that the standby can start up +immediately without waiting for the running-xacts record, but that mechanism +only works if the # of active (sub-)transactions is reasonably small. Otherwise +it falls back to waiting. Furthermore, it's somewhat optimistic in using up the +known-assigned XIDs array: if too many transactions with subxids are started in +the primary later, the replay in the replica will crash with "too many +KnownAssignedXids" error. + +This module contains tests for those various cases at standby startup: starting +from shutdown checkpoint, using the CLOG scanning mechanism, waiting for +running-xacts record and for in-progress transactions to finish etc. +""" + +import threading +from contextlib import closing + +import psycopg2 +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, wait_for_last_flush_lsn, wait_replica_caughtup +from fixtures.pg_version import PgVersion +from fixtures.utils import query_scalar, wait_until + +CREATE_SUBXACTS_FUNC = """ +create or replace function create_subxacts(n integer) returns void as $$ +declare + i integer; +begin + for i in 1..n loop + begin + insert into t (payload) values (0); + exception + when others then + raise exception 'caught something: %', sqlerrm; + end; + end loop; +end; $$ language plpgsql +""" + + +def test_replica_start_scan_clog(neon_simple_env: NeonEnv): + """ + Test the CLOG-scanning mechanism at hot standby startup. There is one + transaction active in the primary when the standby is started. The primary + is killed before it has a chance to write a running-xacts record. The + CLOG-scanning at neon startup allows the standby to start up anyway. + + See the module docstring for background. + """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. 
+ env = neon_simple_env + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("CREATE EXTENSION neon_test_utils") + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + primary_cur.execute("select pg_switch_wal()") + + # Start a transaction in the primary. Leave the transaction open. + # + # The transaction has some subtransactions, but not too many to cause the + # CLOG-scanning mechanism to give up. + primary_cur.execute("begin") + primary_cur.execute("select create_subxacts(50)") + + # Wait for the WAL to be flushed, but then immediately kill the primary, + # before it has a chance to generate a running-xacts record. + primary_cur.execute("select neon_xlogflush()") + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + primary.stop(mode="immediate") + + # Create a replica. It should start up normally, thanks to the CLOG-scanning + # mechanism. + secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + + # The transaction did not commit, so it should not be visible in the secondary + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (0,) + + +def test_replica_start_scan_clog_crashed_xids(neon_simple_env: NeonEnv): + """ + Test the CLOG-scanning mechanism at hot standby startup, after + leaving behind crashed transactions. + + See the module docstring for background. + """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. 
+ env = neon_simple_env + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + primary_cur.execute("select pg_switch_wal()") + + # Consume a lot of XIDs, then kill Postgres without giving it a + # chance to write abort records for them. + primary_cur.execute("begin") + primary_cur.execute("select create_subxacts(100000)") + primary.stop(mode="immediate") + + # Restart the primary. Do some light work, and shut it down cleanly + primary.start() + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("insert into t (payload) values (0)") + primary.stop(mode="fast") + + # Create a replica. It should start up normally, thanks to the CLOG-scanning + # mechanism. (Restarting the primary writes a checkpoint and/or running-xacts + # record, which allows the standby to know that the crashed XIDs are aborted) + secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (1,) + + +def test_replica_start_at_running_xacts(neon_simple_env: NeonEnv, pg_version): + """ + Test that starting a replica works right after the primary has + created a running-xacts record. This may seem like a trivial case, + but during development, we had a bug that was triggered by having + oldestActiveXid == nextXid. Starting right after a running-xacts + record is one way to test that case. + + See the module docstring for background. 
+ """ + env = neon_simple_env + + if env.pg_version == PgVersion.V14 or env.pg_version == PgVersion.V15: + pytest.skip("pg_log_standby_snapshot() function is available only in PG16") + + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + + primary_cur.execute("CREATE EXTENSION neon_test_utils") + primary_cur.execute("select pg_log_standby_snapshot()") + primary_cur.execute("select neon_xlogflush()") + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + + secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select 123") + assert secondary_cur.fetchone() == (123,) + + +def test_replica_start_wait_subxids_finish(neon_simple_env: NeonEnv): + """ + Test replica startup when there are a lot of (sub)transactions active in the + primary. That's too many for the CLOG-scanning mechanism to handle, so the + replica has to wait for the large transaction to finish before it starts to + accept queries. + + After replica startup, test MVCC with transactions that were in-progress + when the replica was started. + + See the module docstring for background. + """ + + # Initialize the primary, a test table, and a helper function to create + # lots of subtransactions. + env = neon_simple_env + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + + # Start a transaction with 100000 subtransactions, and leave it open. That's + # too many to fit in the "known-assigned XIDs array" in the replica, and + # also too many to fit in the subxid caches so the running-xacts record will + # also overflow. 
+ primary_cur.execute("begin") + primary_cur.execute("select create_subxacts(100000)") + + # Start another, smaller transaction in the primary. We'll come back to this + # later. + primary_conn2 = primary.connect() + primary_cur2 = primary_conn2.cursor() + primary_cur2.execute("begin") + primary_cur2.execute("insert into t (payload) values (0)") + + # Create a replica, but before that, wait for the WAL to be flushed to + # safekeepers, so that the replica is started at a point where the large + # transaction is already active. (The whole transaction might not be flushed + # yet, but that's OK.) + # + # Start it in a separate thread, so that we can do other stuff while it's + # blocked waiting for the startup to finish. + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + secondary = env.endpoints.new_replica( + origin=primary, + endpoint_id="secondary", + config_lines=["neon.running_xacts_overflow_policy='wait'"], + ) + start_secondary_thread = threading.Thread(target=secondary.start) + start_secondary_thread.start() + + # Verify that the replica has otherwise started up, but cannot start + # accepting queries yet. + log.info("Waiting 5 s to verify that the secondary does not start") + start_secondary_thread.join(5) + assert secondary.log_contains("consistent recovery state reached") + assert secondary.log_contains("started streaming WAL from primary") + # The "redo starts" message is printed when the first WAL record is + # received. It might or might not be present in the log depending on how + # far exactly the WAL was flushed when the replica was started, and whether + # background activity caused any more WAL records to be flushed on the + # primary afterwards.
+ # + # assert secondary.log_contains("redo starts") + + # should not be open for connections yet + assert start_secondary_thread.is_alive() + assert not secondary.is_running() + assert not secondary.log_contains("database system is ready to accept read-only connections") + + # Commit the large transaction in the primary. + # + # Within the next 15 s, the primary should write a new running-xacts record + # to the WAL which shows the transaction as completed. Once the replica + # replays that record, it will start accepting queries. + primary_cur.execute("commit") + start_secondary_thread.join() + + # Verify that the large transaction is correctly visible in the secondary + # (but not the second, small transaction, which is still in-progress!) + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (100000,) + + # Perform some more MVCC testing using the second transaction that was + # started in the primary before the replica was created + primary_cur2.execute("select create_subxacts(10000)") + + # The second transaction still hasn't committed + wait_replica_caughtup(primary, secondary) + secondary_cur.execute("BEGIN ISOLATION LEVEL REPEATABLE READ") + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (100000,) + + # Commit the second transaction in the primary + primary_cur2.execute("commit") + + # Should still be invisible to the old snapshot + wait_replica_caughtup(primary, secondary) + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (100000,) + + # Commit the REPEATABLE READ transaction in the replica. Both + # primary transactions should now be visible to a new snapshot.
+ secondary_cur.execute("commit") + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (110001,) + + +def test_replica_too_many_known_assigned_xids(neon_simple_env: NeonEnv): + """ + The CLOG-scanning mechanism fills the known-assigned XIDs array + optimistically at standby startup, betting that it can still fit + upcoming transactions replayed later from the WAL in the + array. This test tests what happens when that bet fails and the + known-assigned XID array fills up after the standby has already + been started. The WAL redo will fail with an error: + + FATAL: too many KnownAssignedXids + CONTEXT: WAL redo at 0/1895CB0 for neon/INSERT: off: 25, flags: 0x08; blkref #0: rel 1663/5/16385, blk 64 + + which causes the standby to shut down. + + See the module docstring for background. + """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. + env = neon_simple_env + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("CREATE EXTENSION neon_test_utils") + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + + # Determine how many connections we can use + primary_cur.execute("show max_connections") + max_connections = int(primary_cur.fetchall()[0][0]) + primary_cur.execute("show superuser_reserved_connections") + superuser_reserved_connections = int(primary_cur.fetchall()[0][0]) + n_connections = max_connections - superuser_reserved_connections + n_subxids = 200 + + # Start one top transaction in primary, with lots of subtransactions. This + # uses up much of the known-assigned XIDs space in the standby, but doesn't + # cause it to overflow. 
+ large_p_conn = primary.connect() + large_p_cur = large_p_conn.cursor() + large_p_cur.execute("begin") + large_p_cur.execute(f"select create_subxacts({max_connections} * 30)") + + with closing(primary.connect()) as small_p_conn: + with small_p_conn.cursor() as small_p_cur: + small_p_cur.execute("select create_subxacts(1)") + + # Create a replica at this LSN + primary_cur.execute("select neon_xlogflush()") + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + + # The transaction in primary has not committed yet. + wait_replica_caughtup(primary, secondary) + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (1,) + + # Start max number of top transactions in primary, with a lot of + # subtransactions each. We add the subtransactions to each top transaction + # in a round-robin fashion, instead of adding a lot of subtransactions to + # one top transaction at a time. This way, we will have the max number of + # subtransactions in the in-memory subxid cache of each top transaction, + # until they all overflow. + # + # Currently, PGPROC_MAX_CACHED_SUBXIDS == 64, so this will overflow all + # the subxid caches after creating 64 subxids in each top transaction. The + # point just before the caches have overflowed is the most interesting point + # in time, but we'll keep going beyond that, to ensure that this test is + # robust even if PGPROC_MAX_CACHED_SUBXIDS changes.
+ p_curs = [] + for _ in range(0, n_connections): + p_cur = primary.connect().cursor() + p_cur.execute("begin") + p_curs.append(p_cur) + + for _subxid in range(0, n_subxids): + for i in range(0, n_connections): + p_curs[i].execute("select create_subxacts(1)") + + # Commit all the transactions in the primary + for i in range(0, n_connections): + p_curs[i].execute("commit") + large_p_cur.execute("commit") + + # Wait until the replica crashes with "too many KnownAssignedXids" error. + def check_replica_crashed(): + try: + secondary.connect() + except psycopg2.Error: + # Once the connection fails, return success + return None + raise RuntimeError("connection succeeded") + + wait_until(20, 0.5, check_replica_crashed) + assert secondary.log_contains("too many KnownAssignedXids") + + # Replica is crashed, so ignore stop result + secondary.check_stop_result = False + + +def test_replica_start_repro_visibility_bug(neon_simple_env: NeonEnv): + """ + Before PR #7288, a hot standby in neon incorrectly started up + immediately, before it had received a running-xacts record. That + led to visibility bugs if there were active transactions in the + primary. This test reproduces the incorrect query results and + incorrectly set hint bits, before that was fixed. + """ + env = neon_simple_env + + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + p_cur = primary.connect().cursor() + + p_cur.execute("begin") + p_cur.execute("create table t(pk integer primary key, payload integer)") + p_cur.execute("insert into t values (generate_series(1,100000), 0)") + + secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + wait_replica_caughtup(primary, secondary) + s_cur = secondary.connect().cursor() + + # Set hint bits for pg_class tuples. 
If primary's transaction is + # not marked as in-progress in MVCC snapshot, then XMIN_INVALID + # hint bit will be set for table's 't' tuple, making it invisible + # even after the commit record is replayed later. + s_cur.execute("select * from pg_class") + + p_cur.execute("commit") + wait_replica_caughtup(primary, secondary) + s_cur.execute("select * from t where pk = 1") + assert s_cur.fetchone() == (1, 0) + + +@pytest.mark.parametrize("shutdown", [True, False]) +def test_replica_start_with_prepared_xacts(neon_simple_env: NeonEnv, shutdown: bool): + """ + Test the CLOG-scanning mechanism at hot standby startup in the presence of + prepared transactions. + + This test is run in two variants: one where the primary server is shut down + before starting the secondary, or not. + """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. + env = neon_simple_env + primary = env.endpoints.create_start( + branch_name="main", endpoint_id="primary", config_lines=["max_prepared_transactions=5"] + ) + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("CREATE EXTENSION neon_test_utils") + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute("create table t1(pk integer primary key)") + primary_cur.execute("create table t2(pk integer primary key)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + + # Prepare a transaction for two-phase commit + primary_cur.execute("begin") + primary_cur.execute("insert into t1 values (1)") + primary_cur.execute("prepare transaction 't1'") + + # Prepare another transaction for two-phase commit, with a subtransaction + primary_cur.execute("begin") + primary_cur.execute("insert into t2 values (2)") + primary_cur.execute("savepoint sp") + primary_cur.execute("insert into t2 values (3)") + primary_cur.execute("prepare transaction 't2'") + + # Start a transaction in the primary. Leave the transaction open. 
+ # + # The transaction has some subtransactions, but not too many to cause the + # CLOG-scanning mechanism to give up. + primary_cur.execute("begin") + primary_cur.execute("select create_subxacts(50)") + + # Wait for the WAL to be flushed + primary_cur.execute("select neon_xlogflush()") + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + + if shutdown: + primary.stop(mode="fast") + + # Create a replica. It should start up normally, thanks to the CLOG-scanning + # mechanism. + secondary = env.endpoints.new_replica_start( + origin=primary, endpoint_id="secondary", config_lines=["max_prepared_transactions=5"] + ) + + # The transaction did not commit, so it should not be visible in the secondary + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (0,) + secondary_cur.execute("select count(*) from t1") + assert secondary_cur.fetchone() == (0,) + secondary_cur.execute("select count(*) from t2") + assert secondary_cur.fetchone() == (0,) + + if shutdown: + primary.start() + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + else: + primary_cur.execute("commit") + primary_cur.execute("commit prepared 't1'") + primary_cur.execute("commit prepared 't2'") + + wait_replica_caughtup(primary, secondary) + + secondary_cur.execute("select count(*) from t") + if shutdown: + assert secondary_cur.fetchone() == (0,) + else: + assert secondary_cur.fetchone() == (50,) + secondary_cur.execute("select * from t1") + assert secondary_cur.fetchall() == [(1,)] + secondary_cur.execute("select * from t2") + assert secondary_cur.fetchall() == [(2,), (3,)] + + +def test_replica_start_with_prepared_xacts_with_subxacts(neon_simple_env: NeonEnv): + """ + Test the CLOG-scanning mechanism at hot standby startup in the presence of + prepared transactions, with subtransactions. 
+ """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. + env = neon_simple_env + primary = env.endpoints.create_start( + branch_name="main", endpoint_id="primary", config_lines=["max_prepared_transactions=5"] + ) + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + + # Install extension containing function needed for test + primary_cur.execute("CREATE EXTENSION neon_test_utils") + + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + + # Advance nextXid close to the beginning of the next pg_subtrans segment (2^16 XIDs) + # + # This is interesting, because it tests that pg_subtrans is initialized correctly + # at standby startup. (We had a bug where it didn't at one point during development.) + while True: + xid = int(query_scalar(primary_cur, "SELECT txid_current()")) + log.info(f"xid now {xid}") + # Consume 500 transactions at a time until we get close + if xid < 65535 - 600: + primary_cur.execute("select test_consume_xids(500);") + else: + break + primary_cur.execute("checkpoint") + + # Prepare a transaction for two-phase commit + primary_cur.execute("begin") + primary_cur.execute("select create_subxacts(1000)") + primary_cur.execute("prepare transaction 't1'") + + # Wait for the WAL to be flushed, and stop the primary + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + primary.stop(mode="fast") + + # Create a replica. It should start up normally, thanks to the CLOG-scanning + # mechanism. 
+ secondary = env.endpoints.new_replica_start( + origin=primary, endpoint_id="secondary", config_lines=["max_prepared_transactions=5"] + ) + + # The transaction did not commit, so it should not be visible in the secondary + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (0,) + + primary.start() + + # Open a lot of subtransactions in the primary, causing the subxids cache to overflow + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("select create_subxacts(100000)") + + wait_replica_caughtup(primary, secondary) + + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (100000,) + + primary_cur.execute("commit prepared 't1'") + + wait_replica_caughtup(primary, secondary) + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (101000,) + + +def test_replica_start_with_prepared_xacts_with_many_subxacts(neon_simple_env: NeonEnv): + """ + Test the CLOG-scanning mechanism at hot standby startup in the presence of + prepared transactions, with lots of subtransactions. + + Like test_replica_start_with_prepared_xacts_with_subxacts, but with more + subxacts, to test that the prepared transaction's subxids don't consume + space in the known-assigned XIDs array. (They are set in pg_subtrans + instead) + """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. 
+ env = neon_simple_env + primary = env.endpoints.create_start( + branch_name="main", endpoint_id="primary", config_lines=["max_prepared_transactions=5"] + ) + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + + # Install extension containing function needed for test + primary_cur.execute("CREATE EXTENSION neon_test_utils") + + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + + # Prepare a transaction for two-phase commit, with lots of subxids + primary_cur.execute("begin") + primary_cur.execute("select create_subxacts(50000)") + + # to make things a bit more varied, intersperse a few other XIDs in between + # the prepared transaction's sub-XIDs + with primary.connect().cursor() as primary_cur2: + primary_cur2.execute("insert into t (payload) values (123)") + primary_cur2.execute("begin; insert into t (payload) values (-1); rollback") + + primary_cur.execute("select create_subxacts(50000)") + primary_cur.execute("prepare transaction 't1'") + + # Wait for the WAL to be flushed + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + + primary.stop(mode="fast") + + # Create a replica. It should start up normally, thanks to the CLOG-scanning + # mechanism. 
+ secondary = env.endpoints.new_replica_start( + origin=primary, endpoint_id="secondary", config_lines=["max_prepared_transactions=5"] + ) + + # The transaction did not commit, so it should not be visible in the secondary + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (1,) + + primary.start() + + # Open a lot of subtransactions in the primary, causing the subxids cache to overflow + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("select create_subxacts(100000)") + + wait_replica_caughtup(primary, secondary) + + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (100001,) + + primary_cur.execute("commit prepared 't1'") + + wait_replica_caughtup(primary, secondary) + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (200001,) + + +def test_replica_start_with_too_many_unused_xids(neon_simple_env: NeonEnv): + """ + Test the CLOG-scanning mechanism at hot standby startup in the presence of + a large number of unused XIDs, caused by XID alignment and frequent primary restarts + """ + n_restarts = 50 + + # Initialize the primary and a test table + env = neon_simple_env + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + with primary.cursor() as primary_cur: + primary_cur.execute("create table t(pk serial primary key, payload integer)") + + for _ in range(n_restarts): + with primary.cursor() as primary_cur: + primary_cur.execute("insert into t (payload) values (0)") + # restart primary + primary.stop("immediate") + primary.start() + + # Wait for the WAL to be flushed + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + + # stop primary to check that we can start replica without it + primary.stop(mode="immediate") + + # Create a replica.
It should start up normally, because of ignore policy + # mechanism. + secondary = env.endpoints.new_replica_start( + origin=primary, + endpoint_id="secondary", + config_lines=["neon.running_xacts_overflow_policy='ignore'"], + ) + + # Check that replica see all changes + with secondary.cursor() as secondary_cur: + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (n_restarts,) diff --git a/test_runner/regress/test_replication_start.py b/test_runner/regress/test_replication_start.py deleted file mode 100644 index 2360745990..0000000000 --- a/test_runner/regress/test_replication_start.py +++ /dev/null @@ -1,32 +0,0 @@ -import pytest -from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, wait_replica_caughtup - - -@pytest.mark.xfail -def test_replication_start(neon_simple_env: NeonEnv): - env = neon_simple_env - - with env.endpoints.create_start(branch_name="main", endpoint_id="primary") as primary: - with primary.connect() as p_con: - with p_con.cursor() as p_cur: - p_cur.execute("begin") - p_cur.execute("create table t(pk integer primary key, payload integer)") - p_cur.execute("insert into t values (generate_series(1,100000), 0)") - p_cur.execute("select txid_current()") - xid = p_cur.fetchall()[0][0] - log.info(f"Master transaction {xid}") - with env.endpoints.new_replica_start( - origin=primary, endpoint_id="secondary" - ) as secondary: - wait_replica_caughtup(primary, secondary) - with secondary.connect() as s_con: - with s_con.cursor() as s_cur: - # Enforce setting hint bits for pg_class tuples. - # If master's transaction is not marked as in-progress in MVCC snapshot, - # then XMIN_INVALID hint bit will be set for table's 't' tuple makeing it invisible. 
- s_cur.execute("select * from pg_class") - p_cur.execute("commit") - wait_replica_caughtup(primary, secondary) - s_cur.execute("select * from t where pk = 1") - assert s_cur.fetchone() == (1, 0) diff --git a/test_runner/regress/test_s3_restore.py b/test_runner/regress/test_s3_restore.py index 6383d24c57..9992647e56 100644 --- a/test_runner/regress/test_s3_restore.py +++ b/test_runner/regress/test_s3_restore.py @@ -11,8 +11,6 @@ from fixtures.pageserver.utils import ( MANY_SMALL_LAYERS_TENANT_CONFIG, assert_prefix_empty, enable_remote_storage_versioning, - poll_for_remote_storage_iterations, - tenant_delete_wait_completed, wait_for_upload, ) from fixtures.remote_storage import RemoteStorageKind, s3_storage @@ -83,8 +81,7 @@ def test_tenant_s3_restore( assert ( ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 ), "tenant removed before we deletion was issued" - iterations = poll_for_remote_storage_iterations(remote_storage_kind) - tenant_delete_wait_completed(ps_http, tenant_id, iterations) + ps_http.tenant_delete(tenant_id) ps_http.deletion_queue_flush(execute=True) assert ( ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0 diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 56075c5975..1011a6fd22 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -12,7 +12,6 @@ from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, StorageControllerApiException, - StorageScrubber, last_flush_lsn_upload, tenant_get_shards, wait_for_last_flush_lsn, @@ -48,9 +47,6 @@ def test_sharding_smoke( # Use S3-compatible remote storage so that we can scrub: this test validates # that the scrubber doesn't barf when it sees a sharded tenant. 
neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.enable_scrub_on_exit() - - neon_env_builder.preserve_database_files = True env = neon_env_builder.init_start( initial_tenant_shard_count=shard_count, initial_tenant_shard_stripe_size=stripe_size @@ -128,8 +124,8 @@ def test_sharding_smoke( # Check the scrubber isn't confused by sharded content, then disable # it during teardown because we'll have deleted by then - StorageScrubber(neon_env_builder).scan_metadata() - neon_env_builder.scrub_on_exit = False + healthy, _ = env.storage_scrubber.scan_metadata() + assert healthy env.storage_controller.pageserver_api().tenant_delete(tenant_id) assert_prefix_empty( @@ -190,24 +186,26 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint: """ Test that after a split, we clean up parent layer data in the child shards via compaction. """ + TENANT_CONF = { # small checkpointing and compaction targets to ensure we generate many upload operations - "checkpoint_distance": f"{128 * 1024}", - "compaction_threshold": "1", - "compaction_target_size": f"{128 * 1024}", + "checkpoint_distance": 128 * 1024, + "compaction_threshold": 1, + "compaction_target_size": 128 * 1024, # no PITR horizon, we specify the horizon when we request on-demand GC "pitr_interval": "3600s", # disable background compaction and GC. We invoke it manually when we want it to happen. "gc_period": "0s", "compaction_period": "0s", - # create image layers eagerly, so that GC can remove some layers - "image_creation_threshold": "1", - "image_layer_creation_check_threshold": "0", + # Disable automatic creation of image layers, as we will create them explicitly when we want them + "image_creation_threshold": 9999, + "image_layer_creation_check_threshold": 0, } neon_env_builder.storage_controller_config = { # Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts. 
- "max_unavailable": "300s" + "max_offline": "30s", + "max_warming_up": "300s", } env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) @@ -224,6 +222,12 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint: workload.validate() workload.stop() + # Do a full image layer generation before splitting, so that when we compact after splitting + # we should only see sizes decrease (from post-split drops/rewrites), not increase (from image layer generation) + env.get_tenant_pageserver(tenant_id).http_client().timeline_checkpoint( + tenant_id, timeline_id, force_image_layer_creation=True, wait_until_uploaded=True + ) + # Split one shard into two shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=2) @@ -261,7 +265,9 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint: env.pageserver.start() # Cleanup part 2: once layers are outside the PITR window, they will be rewritten if they are partially redundant - env.storage_controller.pageserver_api().set_tenant_config(tenant_id, {"pitr_interval": "0s"}) + updated_conf = TENANT_CONF.copy() + updated_conf["pitr_interval"] = "0s" + env.storage_controller.pageserver_api().set_tenant_config(tenant_id, updated_conf) env.storage_controller.reconcile_until_idle() for shard in shards: @@ -364,9 +370,6 @@ def test_sharding_split_smoke( # Use S3-compatible remote storage so that we can scrub: this test validates # that the scrubber doesn't barf when it sees a sharded tenant. 
neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.enable_scrub_on_exit() - - neon_env_builder.preserve_database_files = True non_default_tenant_config = {"gc_horizon": 77 * 1024 * 1024} @@ -539,6 +542,13 @@ def test_sharding_split_smoke( for k, v in non_default_tenant_config.items(): assert config.effective_config[k] == v + # Check that heatmap uploads remain enabled after shard split + # (https://github.com/neondatabase/neon/issues/8189) + assert ( + config.effective_config["heatmap_period"] + and config.effective_config["heatmap_period"] != "0s" + ) + # Validate pageserver state: expect every child shard to have an attached and secondary location (total, attached) = get_node_shard_counts(env, tenant_ids=[tenant_id]) assert sum(attached.values()) == split_shard_count @@ -1134,10 +1144,6 @@ def test_sharding_split_failures( ) for ps in env.pageservers: - # When we do node failures and abandon a shard, it will de-facto have old generation and - # thereby be unable to publish remote consistent LSN updates - ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - # If we're using a failure that will panic the storage controller, all background # upcalls from the pageserver can fail ps.allowed_errors.append(".*calling control plane generation validation API failed.*") diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 30f96ceee8..94d71a7677 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -1,9 +1,10 @@ +import concurrent.futures import json import threading import time from collections import defaultdict from datetime import datetime, timezone -from typing import Any, Dict, List, Union +from typing import Any, Dict, List, Optional, Set, Tuple, Union import pytest from fixtures.common_types import TenantId, TenantShardId, TimelineId @@ -12,9 +13,13 @@ from fixtures.log_helper import log from 
fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, + PageserverAvailability, + PageserverSchedulingPolicy, PgBin, StorageControllerApiException, + StorageControllerLeadershipStatus, TokenScope, + last_flush_lsn_upload, ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import ( @@ -24,11 +29,12 @@ from fixtures.pageserver.utils import ( enable_remote_storage_versioning, list_prefix, remote_storage_delete_key, - tenant_delete_wait_completed, timeline_delete_wait_completed, ) from fixtures.pg_version import PgVersion +from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import RemoteStorageKind, s3_storage +from fixtures.storage_controller_proxy import StorageControllerProxy from fixtures.utils import run_pg_bench_small, subprocess_capture, wait_until from fixtures.workload import Workload from mypy_boto3_s3.type_defs import ( @@ -61,11 +67,6 @@ def test_storage_controller_smoke( neon_env_builder.num_pageservers = 3 env = neon_env_builder.init_configs() - for pageserver in env.pageservers: - # This test detaches tenants during migration, which can race with deletion queue operations, - # during detach we only do an advisory flush, we don't wait for it. 
- pageserver.allowed_errors.extend([".*Dropped remote consistent LSN updates.*"]) - # Start services by hand so that we can skip a pageserver (this will start + register later) env.broker.try_start() env.storage_controller.start() @@ -158,7 +159,7 @@ def test_storage_controller_smoke( # Delete all the tenants for tid in tenant_ids: - tenant_delete_wait_completed(env.storage_controller.pageserver_api(), tid, 10) + env.storage_controller.pageserver_api().tenant_delete(tid) env.storage_controller.consistency_check() @@ -316,7 +317,7 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up # Create a tenant directly via pageserver HTTP API, skipping the storage controller tenant_id = TenantId.generate() generation = 123 - origin_ps.http_client().tenant_create(tenant_id, generation=generation) + origin_ps.tenant_create(tenant_id, generation=generation) # As if doing a live migration, first configure origin into stale mode r = origin_ps.http_client().tenant_location_conf( @@ -485,9 +486,6 @@ def test_storage_controller_compute_hook( # Start running env = neon_env_builder.init_start() - # We will to an unclean migration, which will result in deletion queue warnings - env.pageservers[0].allowed_errors.append(".*Dropped remote consistent LSN updates for tenant.*") - # Initial notification from tenant creation assert len(notifications) == 1 expect: Dict[str, Union[List[Dict[str, int]], str, None, int]] = { @@ -927,6 +925,8 @@ def test_storage_controller_tenant_deletion( class Failure: pageserver_id: int + offline_timeout: int + must_detect_after: int def apply(self, env: NeonEnv): raise NotImplementedError() @@ -939,9 +939,11 @@ class Failure: class NodeStop(Failure): - def __init__(self, pageserver_ids, immediate): + def __init__(self, pageserver_ids, immediate, offline_timeout, must_detect_after): self.pageserver_ids = pageserver_ids self.immediate = immediate + self.offline_timeout = offline_timeout + self.must_detect_after = must_detect_after 
def apply(self, env: NeonEnv): for ps_id in self.pageserver_ids: @@ -957,10 +959,42 @@ class NodeStop(Failure): return self.pageserver_ids +class NodeRestartWithSlowReattach(Failure): + def __init__(self, pageserver_id, offline_timeout, must_detect_after): + self.pageserver_id = pageserver_id + self.offline_timeout = offline_timeout + self.must_detect_after = must_detect_after + self.thread = None + + def apply(self, env: NeonEnv): + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.stop(immediate=False) + + def start_ps(): + pageserver.start( + extra_env_vars={"FAILPOINTS": "control-plane-client-re-attach=return(30000)"} + ) + + self.thread = threading.Thread(target=start_ps) + self.thread.start() + + def clear(self, env: NeonEnv): + if self.thread is not None: + self.thread.join() + + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.http_client().configure_failpoints(("control-plane-client-re-attach", "off")) + + def nodes(self): + return [self.pageserver_id] + + class PageserverFailpoint(Failure): - def __init__(self, failpoint, pageserver_id): + def __init__(self, failpoint, pageserver_id, offline_timeout, must_detect_after): self.failpoint = failpoint self.pageserver_id = pageserver_id + self.offline_timeout = offline_timeout + self.must_detect_after = must_detect_after def apply(self, env: NeonEnv): pageserver = env.get_pageserver(self.pageserver_id) @@ -996,15 +1030,28 @@ def build_node_to_tenants_map(env: NeonEnv) -> dict[int, list[TenantId]]: @pytest.mark.parametrize( "failure", [ - NodeStop(pageserver_ids=[1], immediate=False), - NodeStop(pageserver_ids=[1], immediate=True), - NodeStop(pageserver_ids=[1, 2], immediate=True), - PageserverFailpoint(pageserver_id=1, failpoint="get-utilization-http-handler"), + NodeStop(pageserver_ids=[1], immediate=False, offline_timeout=20, must_detect_after=5), + NodeStop(pageserver_ids=[1], immediate=True, offline_timeout=20, must_detect_after=5), + NodeStop(pageserver_ids=[1, 2], 
immediate=True, offline_timeout=20, must_detect_after=5), + PageserverFailpoint( + pageserver_id=1, + failpoint="get-utilization-http-handler", + offline_timeout=20, + must_detect_after=5, + ), + # Instrument a scenario where the node is slow to re-attach. The re-attach request itself + # should serve as a signal to the storage controller to use a more lenient heartbeat timeout. + NodeRestartWithSlowReattach(pageserver_id=1, offline_timeout=60, must_detect_after=15), ], ) def test_storage_controller_heartbeats( neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, failure: Failure ): + neon_env_builder.storage_controller_config = { + "max_offline": "10s", + "max_warming_up": "20s", + } + neon_env_builder.num_pageservers = 2 env = neon_env_builder.init_configs() env.start() @@ -1055,13 +1102,6 @@ def test_storage_controller_heartbeats( online_node_ids = set(range(1, len(env.pageservers) + 1)) - offline_node_ids for node_id in offline_node_ids: - env.get_pageserver(node_id).allowed_errors.append( - # In the case of the failpoint failure, the impacted pageserver - # still believes it has the tenant attached since location - # config calls into it will fail due to being marked offline. - ".*Dropped remote consistent LSN updates.*", - ) - if len(offline_node_ids) > 1: env.get_pageserver(node_id).allowed_errors.append( ".*Scheduling error when marking pageserver.*offline.*", @@ -1077,9 +1117,12 @@ def test_storage_controller_heartbeats( if node["id"] in offline_node_ids: assert node["availability"] == "Offline" - # A node is considered offline if the last successful heartbeat - # was more than 10 seconds ago (hardcoded in the storage controller). - wait_until(20, 1, nodes_offline) + start = time.time() + wait_until(failure.offline_timeout, 1, nodes_offline) + detected_after = time.time() - start + log.info(f"Detected node failures after {detected_after}s") + + assert detected_after >= failure.must_detect_after # .. 
expecting the tenant on the offline node to be migrated def tenant_migrated(): @@ -1384,7 +1427,8 @@ def test_lock_time_tracing(neon_env_builder: NeonEnvBuilder): tenant_id = env.initial_tenant env.storage_controller.allowed_errors.extend( [ - ".*Lock on.*", + ".*Exclusive lock by.*", + ".*Shared lock by.*", ".*Scheduling is disabled by policy.*", f".*Operation TimelineCreate on key {tenant_id} has waited.*", ] @@ -1416,11 +1460,25 @@ def test_lock_time_tracing(neon_env_builder: NeonEnvBuilder): ) thread_update_tenant_policy.join() - env.storage_controller.assert_log_contains("Lock on UpdatePolicy was held for") - env.storage_controller.assert_log_contains( + env.storage_controller.assert_log_contains("Exclusive lock by UpdatePolicy was held for") + _, last_log_cursor = env.storage_controller.assert_log_contains( f"Operation TimelineCreate on key {tenant_id} has waited" ) + # Test out shared lock + env.storage_controller.configure_failpoints( + ("tenant-create-timeline-shared-lock", "return(31000)") + ) + + timeline_id = TimelineId.generate() + # This will hold the shared lock for enough time to cause an warning + env.storage_controller.pageserver_api().timeline_create( + pg_version=PgVersion.NOT_SET, tenant_id=tenant_id, new_timeline_id=timeline_id + ) + env.storage_controller.assert_log_contains( + "Shared lock by TimelineCreate was held for", offset=last_log_cursor + ) + @pytest.mark.parametrize("remote_storage", [RemoteStorageKind.LOCAL_FS, s3_storage()]) @pytest.mark.parametrize("shard_count", [None, 4]) @@ -1527,58 +1585,11 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): ) # Give things a chance to settle. - # A call to `reconcile_until_idle` could be used here instead, - # however since all attachments are placed on the same node, - # we'd have to wait for a long time (2 minutes-ish) for optimizations - # to quiesce. - # TODO: once the initial attachment selection is fixed, update this - # to use `reconcile_until_idle`. 
- time.sleep(2) + env.storage_controller.reconcile_until_idle(timeout_secs=30) nodes = env.storage_controller.node_list() assert len(nodes) == 2 - def retryable_node_operation(op, ps_id, max_attempts, backoff): - while max_attempts > 0: - try: - op(ps_id) - return - except StorageControllerApiException as e: - max_attempts -= 1 - log.info(f"Operation failed ({max_attempts} attempts left): {e}") - - if max_attempts == 0: - raise e - - time.sleep(backoff) - - def poll_node_status(node_id, desired_scheduling_policy, max_attempts, backoff): - log.info(f"Polling {node_id} for {desired_scheduling_policy} scheduling policy") - while max_attempts > 0: - try: - status = env.storage_controller.node_status(node_id) - policy = status["scheduling"] - if policy == desired_scheduling_policy: - return - else: - max_attempts -= 1 - log.info(f"Status call returned {policy=} ({max_attempts} attempts left)") - - if max_attempts == 0: - raise AssertionError( - f"Status for {node_id=} did not reach {desired_scheduling_policy=}" - ) - - time.sleep(backoff) - except StorageControllerApiException as e: - max_attempts -= 1 - log.info(f"Status call failed ({max_attempts} retries left): {e}") - - if max_attempts == 0: - raise e - - time.sleep(backoff) - def assert_shard_counts_balanced(env: NeonEnv, shard_counts, total_shards): # Assert that all nodes have some attached shards assert len(shard_counts) == len(env.pageservers) @@ -1591,10 +1602,18 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): # Perform a graceful rolling restart for ps in env.pageservers: - retryable_node_operation( + env.storage_controller.warm_up_all_secondaries() + + env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2 ) - poll_node_status(ps.id, "PauseForRestart", max_attempts=6, backoff=5) + env.storage_controller.poll_node_status( + ps.id, + PageserverAvailability.ACTIVE, + 
PageserverSchedulingPolicy.PAUSE_FOR_RESTART, + max_attempts=6, + backoff=5, + ) shard_counts = get_node_shard_counts(env, tenant_ids) log.info(f"Shard counts after draining node {ps.id}: {shard_counts}") @@ -1604,12 +1623,24 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): assert sum(shard_counts.values()) == total_shards ps.restart() - poll_node_status(ps.id, "Active", max_attempts=10, backoff=1) + env.storage_controller.poll_node_status( + ps.id, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.ACTIVE, + max_attempts=10, + backoff=1, + ) - retryable_node_operation( + env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2 ) - poll_node_status(ps.id, "Active", max_attempts=6, backoff=5) + env.storage_controller.poll_node_status( + ps.id, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.ACTIVE, + max_attempts=6, + backoff=5, + ) shard_counts = get_node_shard_counts(env, tenant_ids) log.info(f"Shard counts after filling node {ps.id}: {shard_counts}") @@ -1619,3 +1650,619 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): shard_counts = get_node_shard_counts(env, tenant_ids) log.info(f"Shard counts after rolling restart: {shard_counts}") assert_shard_counts_balanced(env, shard_counts, total_shards) + + +def test_skip_drain_on_secondary_lag(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + """ + Artificially make a tenant shard's secondary location lag behind the primary + and check that storage controller driven node drains skip the lagging tenant shard. + Finally, validate that the tenant shard is migrated when a new drain request comes + in and it's no longer lagging. 
+ """ + neon_env_builder.num_pageservers = 2 + neon_env_builder.storage_controller_config = { + "max_secondary_lag_bytes": 1 * 1024 * 1024, + } + + env = neon_env_builder.init_configs() + env.start() + + tid, timeline_id = env.neon_cli.create_tenant(placement_policy='{"Attached":1}') + + # Give things a chance to settle. + env.storage_controller.reconcile_until_idle(timeout_secs=30) + + locations = env.storage_controller.locate(tid) + assert len(locations) == 1 + primary: int = locations[0]["node_id"] + not_primary = [ps.id for ps in env.pageservers if ps.id != primary] + assert len(not_primary) == 1 + secondary = not_primary[0] + + log.info(f"Paused secondary downloads on {secondary}") + env.get_pageserver(secondary).http_client().configure_failpoints( + ("secondary-layer-download-pausable", "pause") + ) + + log.info(f"Ingesting some data for {tid}") + + with env.endpoints.create_start("main", tenant_id=tid) as endpoint: + run_pg_bench_small(pg_bin, endpoint.connstr()) + endpoint.safe_psql("CREATE TABLE created_foo(id integer);") + last_flush_lsn_upload(env, endpoint, tid, timeline_id) + + log.info(f"Uploading heatmap from {primary} and requesting download from {secondary}") + + env.get_pageserver(primary).http_client().tenant_heatmap_upload(tid) + env.get_pageserver(secondary).http_client().tenant_secondary_download(tid, wait_ms=100) + + def secondary_is_lagging(): + resp = env.get_pageserver(secondary).http_client().tenant_secondary_status(tid) + lag = resp["bytes_total"] - resp["bytes_downloaded"] + + if lag <= 1 * 1024 * 1024: + raise Exception(f"Secondary lag not big enough: {lag}") + + log.info(f"Looking for lag to develop on the secondary {secondary}") + wait_until(10, 1, secondary_is_lagging) + + log.info(f"Starting drain of primary {primary} with laggy secondary {secondary}") + env.storage_controller.retryable_node_operation( + lambda ps_id: env.storage_controller.node_drain(ps_id), primary, max_attempts=3, backoff=2 + ) + + 
env.storage_controller.poll_node_status( + primary, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.PAUSE_FOR_RESTART, + max_attempts=6, + backoff=5, + ) + + locations = env.storage_controller.locate(tid) + assert len(locations) == 1 + assert locations[0]["node_id"] == primary + + log.info(f"Unpausing secondary downloads on {secondary}") + env.get_pageserver(secondary).http_client().configure_failpoints( + ("secondary-layer-download-pausable", "off") + ) + env.get_pageserver(secondary).http_client().tenant_secondary_download(tid, wait_ms=100) + + log.info(f"Waiting for lag to reduce on {secondary}") + + def lag_is_acceptable(): + resp = env.get_pageserver(secondary).http_client().tenant_secondary_status(tid) + lag = resp["bytes_total"] - resp["bytes_downloaded"] + + if lag > 1 * 1024 * 1024: + raise Exception(f"Secondary lag not big enough: {lag}") + + wait_until(10, 1, lag_is_acceptable) + + env.storage_controller.node_configure(primary, {"scheduling": "Active"}) + + log.info(f"Starting drain of primary {primary} with non-laggy secondary {secondary}") + + env.storage_controller.retryable_node_operation( + lambda ps_id: env.storage_controller.node_drain(ps_id), primary, max_attempts=3, backoff=2 + ) + + env.storage_controller.poll_node_status( + primary, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.PAUSE_FOR_RESTART, + max_attempts=6, + backoff=5, + ) + + locations = env.storage_controller.locate(tid) + assert len(locations) == 1 + assert locations[0]["node_id"] == secondary + + +def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + tenant_count = 10 + shard_count_per_tenant = 8 + tenant_ids = [] + + for _ in range(0, tenant_count): + tid = TenantId.generate() + tenant_ids.append(tid) + env.neon_cli.create_tenant( + tid, placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant + ) + + # See sleep 
comment in the test above. + time.sleep(2) + + nodes = env.storage_controller.node_list() + assert len(nodes) == 2 + + env.storage_controller.configure_failpoints(("sleepy-drain-loop", "return(2000)")) + + ps_id_to_drain = env.pageservers[0].id + + env.storage_controller.warm_up_all_secondaries() + env.storage_controller.retryable_node_operation( + lambda ps_id: env.storage_controller.node_drain(ps_id), + ps_id_to_drain, + max_attempts=3, + backoff=2, + ) + + env.storage_controller.poll_node_status( + ps_id_to_drain, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.DRAINING, + max_attempts=6, + backoff=2, + ) + + env.storage_controller.cancel_node_drain(ps_id_to_drain) + + env.storage_controller.poll_node_status( + ps_id_to_drain, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.ACTIVE, + max_attempts=6, + backoff=2, + ) + + +@pytest.mark.parametrize("while_offline", [True, False]) +def test_storage_controller_node_deletion( + neon_env_builder: NeonEnvBuilder, + compute_reconfigure_listener: ComputeReconfigure, + while_offline: bool, +): + """ + Test that deleting a node works & properly reschedules everything that was on the node. + """ + neon_env_builder.num_pageservers = 3 + env = neon_env_builder.init_configs() + env.start() + + tenant_count = 10 + shard_count_per_tenant = 8 + tenant_ids = [] + for _ in range(0, tenant_count): + tid = TenantId.generate() + tenant_ids.append(tid) + env.neon_cli.create_tenant( + tid, placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant + ) + + victim = env.pageservers[-1] + + # The procedure a human would follow is: + # 1. Mark pageserver scheduling=pause + # 2. Mark pageserver availability=offline to trigger migrations away from it + # 3. Wait for attachments to all move elsewhere + # 4. Call deletion API + # 5. Stop the node. 
+ + env.storage_controller.node_configure(victim.id, {"scheduling": "Pause"}) + + if while_offline: + victim.stop(immediate=True) + env.storage_controller.node_configure(victim.id, {"availability": "Offline"}) + + def assert_shards_migrated(): + counts = get_node_shard_counts(env, tenant_ids) + elsewhere = sum(v for (k, v) in counts.items() if k != victim.id) + log.info(f"Shards on nodes other than on victim: {elsewhere}") + assert elsewhere == tenant_count * shard_count_per_tenant + + wait_until(30, 1, assert_shards_migrated) + + log.info(f"Deleting pageserver {victim.id}") + env.storage_controller.node_delete(victim.id) + + if not while_offline: + + def assert_victim_evacuated(): + counts = get_node_shard_counts(env, tenant_ids) + count = counts[victim.id] + log.info(f"Shards on node {victim.id}: {count}") + assert count == 0 + + wait_until(30, 1, assert_victim_evacuated) + + # The node should be gone from the list API + assert victim.id not in [n["id"] for n in env.storage_controller.node_list()] + + # No tenants should refer to the node in their intent + for tenant_id in tenant_ids: + describe = env.storage_controller.tenant_describe(tenant_id) + for shard in describe["shards"]: + assert shard["node_attached"] != victim.id + assert victim.id not in shard["node_secondary"] + + # Reconciles running during deletion should all complete + # FIXME: this currently doesn't work because the deletion schedules shards without a proper ScheduleContext, resulting + # in states that background_reconcile wants to optimize, but can't proceed with migrations yet because this is a short3 + # test that hasn't uploaded any heatmaps for secondaries. + # In the interim, just do a reconcile_all to enable the consistency check. 
+ # env.storage_controller.reconcile_until_idle() + env.storage_controller.reconcile_all() + + # Controller should pass its own consistency checks + env.storage_controller.consistency_check() + + # The node should stay gone across a restart + env.storage_controller.stop() + env.storage_controller.start() + assert victim.id not in [n["id"] for n in env.storage_controller.node_list()] + env.storage_controller.reconcile_all() # FIXME: workaround for optimizations happening on startup, see FIXME above. + env.storage_controller.consistency_check() + + +@pytest.mark.parametrize("shard_count", [None, 2]) +def test_storage_controller_metadata_health( + neon_env_builder: NeonEnvBuilder, + shard_count: Optional[int], +): + """ + Create three tenants A, B, C. + + Phase 1: + - A: Post healthy status. + - B: Post unhealthy status. + - C: No updates. + + Phase 2: + - B: Post healthy status. + - C: Post healthy status. + + Phase 3: + - A: Post unhealthy status. + + Phase 4: + - Delete tenant A, metadata health status should be deleted as well. + """ + + def update_and_query_metadata_health( + env: NeonEnv, + healthy: List[TenantShardId], + unhealthy: List[TenantShardId], + outdated_duration: str = "1h", + ) -> Tuple[Set[str], Set[str]]: + """ + Update metadata health. Then list tenant shards with unhealthy and + outdated metadata health status. 
+ """ + if healthy or unhealthy: + env.storage_controller.metadata_health_update(healthy, unhealthy) + result = env.storage_controller.metadata_health_list_unhealthy() + unhealthy_res = set(result["unhealthy_tenant_shards"]) + result = env.storage_controller.metadata_health_list_outdated(outdated_duration) + outdated_res = set(record["tenant_shard_id"] for record in result["health_records"]) + + return unhealthy_res, outdated_res + + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_start() + + # Mock tenant (`initial_tenant``) with healthy scrubber scan result + tenant_a_shard_ids = ( + env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=shard_count) + if shard_count is not None + else [TenantShardId(env.initial_tenant, 0, 0)] + ) + + # Mock tenant with unhealthy scrubber scan result + tenant_b, _ = env.neon_cli.create_tenant(shard_count=shard_count) + tenant_b_shard_ids = ( + env.storage_controller.tenant_shard_split(tenant_b, shard_count=shard_count) + if shard_count is not None + else [TenantShardId(tenant_b, 0, 0)] + ) + + # Mock tenant that never gets a health update from scrubber + tenant_c, _ = env.neon_cli.create_tenant(shard_count=shard_count) + + tenant_c_shard_ids = ( + env.storage_controller.tenant_shard_split(tenant_c, shard_count=shard_count) + if shard_count is not None + else [TenantShardId(tenant_c, 0, 0)] + ) + + # Metadata health table also updated as tenant shards are created. 
+ assert env.storage_controller.metadata_health_is_healthy() + + # post "fake" updates to storage controller db + + unhealthy, outdated = update_and_query_metadata_health( + env, healthy=tenant_a_shard_ids, unhealthy=tenant_b_shard_ids + ) + + log.info(f"After Phase 1: {unhealthy=}, {outdated=}") + assert len(unhealthy) == len(tenant_b_shard_ids) + for t in tenant_b_shard_ids: + assert str(t) in unhealthy + assert len(outdated) == 0 + + unhealthy, outdated = update_and_query_metadata_health( + env, healthy=tenant_b_shard_ids + tenant_c_shard_ids, unhealthy=[] + ) + + log.info(f"After Phase 2: {unhealthy=}, {outdated=}") + assert len(unhealthy) == 0 + assert len(outdated) == 0 + + unhealthy, outdated = update_and_query_metadata_health( + env, healthy=[], unhealthy=tenant_a_shard_ids + ) + + log.info(f"After Phase 3: {unhealthy=}, {outdated=}") + assert len(unhealthy) == len(tenant_a_shard_ids) + for t in tenant_a_shard_ids: + assert str(t) in unhealthy + assert len(outdated) == 0 + + # Phase 4: Delete A + env.storage_controller.pageserver_api().tenant_delete(env.initial_tenant) + + # A's unhealthy metadata health status should be deleted as well. + assert env.storage_controller.metadata_health_is_healthy() + + # All shards from B and C are not fresh if set outdated duration to 0 seconds. + unhealthy, outdated = update_and_query_metadata_health( + env, healthy=[], unhealthy=tenant_a_shard_ids, outdated_duration="0s" + ) + assert len(unhealthy) == 0 + for t in tenant_b_shard_ids + tenant_c_shard_ids: + assert str(t) in outdated + + +def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder): + """ + Test the `/control/v1/step_down` storage controller API. Upon receiving such + a request, the storage controller cancels any on-going reconciles and replies + with 503 to all requests apart from `/control/v1/step_down`, `/status` and `/metrics`. 
+ """ + env = neon_env_builder.init_configs() + env.start() + + tid = TenantId.generate() + tsid = str(TenantShardId(tid, shard_number=0, shard_count=0)) + env.storage_controller.tenant_create(tid) + + env.storage_controller.reconcile_until_idle() + env.storage_controller.configure_failpoints(("sleep-on-reconcile-epilogue", "return(10000)")) + + # Make a change to the tenant config to trigger a slow reconcile + virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) + virtual_ps_http.patch_tenant_config_client_side(tid, {"compaction_threshold": 5}, None) + env.storage_controller.allowed_errors.append( + ".*Accepted configuration update but reconciliation failed.*" + ) + + observed_state = env.storage_controller.step_down() + log.info(f"Storage controller stepped down with {observed_state=}") + + # Validate that we waited for the slow reconcile to complete + # and updated the observed state in the storcon before stepping down. + node_id = str(env.pageserver.id) + assert tsid in observed_state + assert node_id in observed_state[tsid]["locations"] + assert "conf" in observed_state[tsid]["locations"][node_id] + assert "tenant_conf" in observed_state[tsid]["locations"][node_id]["conf"] + + tenant_conf = observed_state[tsid]["locations"][node_id]["conf"]["tenant_conf"] + assert "compaction_threshold" in tenant_conf + assert tenant_conf["compaction_threshold"] == 5 + + # Validate that we propagated the change to the pageserver + ps_tenant_conf = env.pageserver.http_client().tenant_config(tid) + assert "compaction_threshold" in ps_tenant_conf.effective_config + assert ps_tenant_conf.effective_config["compaction_threshold"] == 5 + + # Validate that the storcon is not replying to the usual requests + # once it has stepped down. + with pytest.raises(StorageControllerApiException, match="stepped_down"): + env.storage_controller.tenant_list() + + # Validate that we can step down multiple times and the observed state + # doesn't change. 
+ observed_state_again = env.storage_controller.step_down() + assert observed_state == observed_state_again + + assert ( + env.storage_controller.get_metric_value( + "storage_controller_leadership_status", filter={"status": "leader"} + ) + == 0 + ) + + assert ( + env.storage_controller.get_metric_value( + "storage_controller_leadership_status", filter={"status": "stepped_down"} + ) + == 1 + ) + + assert ( + env.storage_controller.get_metric_value( + "storage_controller_leadership_status", filter={"status": "candidate"} + ) + == 0 + ) + + +# This is a copy of NeonEnv.start which injects the instance id and port +# into the call to NeonStorageController.start +def start_env(env: NeonEnv, storage_controller_port: int): + timeout_in_seconds = 30 + + # Storage controller starts first, so that pageserver /re-attach calls don't + # bounce through retries on startup + env.storage_controller.start(timeout_in_seconds, 1, storage_controller_port) + + # Wait for storage controller readiness to prevent unnecessary post start-up + # reconcile. 
+ env.storage_controller.wait_until_ready() + + # Start up broker, pageserver and all safekeepers + futs = [] + with concurrent.futures.ThreadPoolExecutor( + max_workers=2 + len(env.pageservers) + len(env.safekeepers) + ) as executor: + futs.append( + executor.submit(lambda: env.broker.try_start() or None) + ) # The `or None` is for the linter + + for pageserver in env.pageservers: + futs.append( + executor.submit( + lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds) + ) + ) + + for safekeeper in env.safekeepers: + futs.append( + executor.submit( + lambda sk=safekeeper: sk.start(timeout_in_seconds=timeout_in_seconds) + ) + ) + + for f in futs: + f.result() + + +@pytest.mark.parametrize("step_down_times_out", [False, True]) +def test_storage_controller_leadership_transfer( + neon_env_builder: NeonEnvBuilder, + storage_controller_proxy: StorageControllerProxy, + port_distributor: PortDistributor, + step_down_times_out: bool, +): + neon_env_builder.auth_enabled = True + + neon_env_builder.num_pageservers = 3 + + neon_env_builder.storage_controller_config = { + "database_url": f"127.0.0.1:{port_distributor.get_port()}", + "start_as_candidate": True, + } + + neon_env_builder.storage_controller_port_override = storage_controller_proxy.port() + + storage_controller_1_port = port_distributor.get_port() + storage_controller_2_port = port_distributor.get_port() + + storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}") + + env = neon_env_builder.init_configs() + start_env(env, storage_controller_1_port) + + assert ( + env.storage_controller.get_leadership_status() == StorageControllerLeadershipStatus.LEADER + ) + leader = env.storage_controller.get_leader() + assert leader["address"] == f"http://127.0.0.1:{storage_controller_1_port}/" + + if step_down_times_out: + env.storage_controller.configure_failpoints( + ("sleep-on-step-down-handling", "return(10000)") + ) + env.storage_controller.allowed_errors.append(".*request was 
dropped before completing.*") + + tenant_count = 2 + shard_count = 4 + tenants = set(TenantId.generate() for _ in range(0, tenant_count)) + + for tid in tenants: + env.storage_controller.tenant_create( + tid, shard_count=shard_count, placement_policy={"Attached": 1} + ) + env.storage_controller.reconcile_until_idle() + + env.storage_controller.start( + timeout_in_seconds=30, instance_id=2, base_port=storage_controller_2_port + ) + + if not step_down_times_out: + + def previous_stepped_down(): + assert ( + env.storage_controller.get_leadership_status() + == StorageControllerLeadershipStatus.STEPPED_DOWN + ) + + wait_until(5, 1, previous_stepped_down) + + storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_2_port}") + + def new_becomes_leader(): + assert ( + env.storage_controller.get_leadership_status() + == StorageControllerLeadershipStatus.LEADER + ) + + wait_until(15, 1, new_becomes_leader) + leader = env.storage_controller.get_leader() + assert leader["address"] == f"http://127.0.0.1:{storage_controller_2_port}/" + + env.storage_controller.wait_until_ready() + env.storage_controller.consistency_check() + + if step_down_times_out: + env.storage_controller.allowed_errors.extend( + [ + ".*Leader.*did not respond to step-down request.*", + ".*Send step down request failed.*", + ".*Send step down request still failed.*", + ] + ) + + +def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvBuilder): + # single unsharded tenant, two locations + neon_env_builder.num_pageservers = 2 + + env = neon_env_builder.init_start() + + env.storage_controller.tenant_policy_update(env.initial_tenant, {"placement": {"Attached": 1}}) + env.storage_controller.reconcile_until_idle() + + attached_id = int(env.storage_controller.locate(env.initial_tenant)[0]["node_id"]) + attached = next((ps for ps in env.pageservers if ps.id == attached_id)) + + def attached_is_draining(): + details = env.storage_controller.node_status(attached.id) + assert 
details["scheduling"] == "Draining" + + env.storage_controller.configure_failpoints(("sleepy-drain-loop", "return(10000)")) + env.storage_controller.node_drain(attached.id) + + wait_until(10, 0.5, attached_is_draining) + + attached.restart() + + # we are unable to reconfigure node while the operation is still ongoing + with pytest.raises( + StorageControllerApiException, + match="Precondition failed: Ongoing background operation forbids configuring: drain.*", + ): + env.storage_controller.node_configure(attached.id, {"scheduling": "Pause"}) + with pytest.raises( + StorageControllerApiException, + match="Precondition failed: Ongoing background operation forbids configuring: drain.*", + ): + env.storage_controller.node_configure(attached.id, {"availability": "Offline"}) + + env.storage_controller.cancel_node_drain(attached.id) + + def reconfigure_node_again(): + env.storage_controller.node_configure(attached.id, {"scheduling": "Pause"}) + + # allow for small delay between actually having cancelled and being able reconfigure again + wait_until(4, 0.5, reconfigure_node_again) diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 35ae61c380..292a9a1010 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -1,14 +1,21 @@ import os +import pprint import shutil +import threading +import time +from concurrent.futures import ThreadPoolExecutor from typing import Optional import pytest from fixtures.common_types import TenantId, TenantShardId, TimelineId +from fixtures.log_helper import log from fixtures.neon_fixtures import ( + NeonEnv, NeonEnvBuilder, - StorageScrubber, ) +from fixtures.pg_version import PgVersion from fixtures.remote_storage import S3Storage, s3_storage +from fixtures.utils import wait_until from fixtures.workload import Workload @@ -60,8 +67,7 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: output_path = 
neon_env_builder.test_output_dir / "snapshot" os.makedirs(output_path) - scrubber = StorageScrubber(neon_env_builder) - scrubber.tenant_snapshot(tenant_id, output_path) + env.storage_scrubber.tenant_snapshot(tenant_id, output_path) assert len(os.listdir(output_path)) > 0 @@ -111,6 +117,14 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: workload.validate() +def drop_local_state(env: NeonEnv, tenant_id: TenantId): + env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"}) + env.storage_controller.reconcile_until_idle() + + env.storage_controller.tenant_policy_update(tenant_id, {"placement": {"Attached": 0}}) + env.storage_controller.reconcile_until_idle() + + @pytest.mark.parametrize("shard_count", [None, 4]) def test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): neon_env_builder.enable_pageserver_remote_storage(s3_storage()) @@ -133,28 +147,415 @@ def test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: Opt # For each cycle, detach and attach the tenant to bump the generation, and do some writes to generate uploads for _i in range(0, n_cycles): - env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"}) - env.storage_controller.reconcile_until_idle() - - env.storage_controller.tenant_policy_update(tenant_id, {"placement": {"Attached": 0}}) - env.storage_controller.reconcile_until_idle() + drop_local_state(env, tenant_id) # This write includes remote upload, will generate an index in this generation workload.write_rows(1) + # We will use a min_age_secs=1 threshold for deletion, let it pass + time.sleep(2) + # With a high min_age, the scrubber should decline to delete anything - gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=3600) + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=3600) assert gc_summary["remote_storage_errors"] == 0 assert gc_summary["indices_deleted"] 
== 0 # If targeting a different tenant, the scrubber shouldn't do anything - gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc( + gc_summary = env.storage_scrubber.pageserver_physical_gc( min_age_secs=1, tenant_ids=[TenantId.generate()] ) assert gc_summary["remote_storage_errors"] == 0 assert gc_summary["indices_deleted"] == 0 # With a low min_age, the scrubber should go ahead and clean up all but the latest 2 generations - gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=1) + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1) assert gc_summary["remote_storage_errors"] == 0 assert gc_summary["indices_deleted"] == (expect_indices_per_shard - 2) * shard_count + + +@pytest.mark.parametrize("shard_count", [None, 2]) +def test_scrubber_physical_gc_ancestors( + neon_env_builder: NeonEnvBuilder, shard_count: Optional[int] +): + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.num_pageservers = 2 + + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant( + tenant_id, + timeline_id, + shard_count=shard_count, + conf={ + # Small layers and low compaction thresholds, so that when we split we can expect some to + # be dropped by child shards + "checkpoint_distance": f"{1024 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{1024 * 1024}", + # Disable automatic creation of image layers, as future image layers can result in layers in S3 that + # aren't referenced by children, earlier than the test expects such layers to exist + "image_creation_threshold": "9999", + "image_layer_creation_check_threshold": "0", + # Disable background compaction, we will do it explicitly + "compaction_period": "0s", + # No PITR, so that as soon as child shards generate an image layer, it covers ancestor deltas + # and makes them GC'able + "pitr_interval": "0s", + }, + ) 
+ + # Create an extra timeline, to ensure the scrubber isn't confused by multiple timelines + env.storage_controller.pageserver_api().timeline_create( + env.pg_version, tenant_id=tenant_id, new_timeline_id=TimelineId.generate() + ) + + # Make sure the original shard has some layers + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(100) + + new_shard_count = 4 + assert shard_count is None or new_shard_count > shard_count + shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count) + env.storage_controller.reconcile_until_idle() # Move shards to their final locations immediately + + # Create a timeline after split, to ensure scrubber can handle timelines that exist in child shards but not ancestors + env.storage_controller.pageserver_api().timeline_create( + env.pg_version, tenant_id=tenant_id, new_timeline_id=TimelineId.generate() + ) + + # Make sure child shards have some layers. Do not force upload, because the test helper calls checkpoint, which + # compacts, and we only want to do that explicitly later in the test. + workload.write_rows(100, upload=False) + for shard in shards: + ps = env.get_tenant_pageserver(shard) + log.info(f"Waiting for shard {shard} on pageserver {ps.id}") + ps.http_client().timeline_checkpoint( + shard, timeline_id, compact=False, wait_until_uploaded=True + ) + + # Flush deletion queue so that we don't leave any orphan layers in the parent that will confuse subsequent checks: once + # a shard is split, any layers in its prefix that aren't referenced by a child will be considered GC'able, even + # if they were logically deleted before the shard split, just not physically deleted yet because of the queue. 
+ for ps in env.pageservers: + ps.http_client().deletion_queue_flush(execute=True) + + # Before compacting, all the layers in the ancestor should still be referenced by the children: the scrubber + # should not erase any ancestor layers + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + assert gc_summary["ancestor_layers_deleted"] == 0 + + # Write some data and compact: compacting, some ancestor layers should no longer be needed by children + # (the compaction is part of the checkpoint that Workload does for us) + workload.churn_rows(100) + workload.churn_rows(100) + workload.churn_rows(100) + for shard in shards: + ps = env.get_tenant_pageserver(shard) + ps.http_client().timeline_compact(shard, timeline_id, force_image_layer_creation=True) + ps.http_client().timeline_gc(shard, timeline_id, 0) + + # We will use a min_age_secs=1 threshold for deletion, let it pass + time.sleep(2) + + # Our time threshold should be respected: check that with a high threshold we delete nothing + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=3600, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + assert gc_summary["ancestor_layers_deleted"] == 0 + + # Now run with a low time threshold: deletions of ancestor layers should be executed + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + assert gc_summary["ancestor_layers_deleted"] > 0 + + # We deleted some layers: now check we didn't corrupt the tenant by doing so. Detach and + # attach it, to drop any local state, then check it's still readable. 
+ workload.stop() + drop_local_state(env, tenant_id) + workload.validate() + + +def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder): + """ + When we delete a timeline after a shard split, the child shards do not directly delete the + layers in the ancestor shards. They rely on the scrubber to clean up. + """ + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.num_pageservers = 2 + + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant( + tenant_id, + timeline_id, + shard_count=None, + conf={ + # Small layers and low compaction thresholds, so that when we split we can expect some to + # be dropped by child shards + "checkpoint_distance": f"{1024 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{1024 * 1024}", + "image_creation_threshold": "2", + "image_layer_creation_check_threshold": "0", + # Disable background compaction, we will do it explicitly + "compaction_period": "0s", + # No PITR, so that as soon as child shards generate an image layer, it covers ancestor deltas + # and makes them GC'able + "pitr_interval": "0s", + }, + ) + + # Make sure the original shard has some layers + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(100, upload=False) + workload.stop() + + new_shard_count = 4 + shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count) + for shard in shards: + ps = env.get_tenant_pageserver(shard) + log.info(f"Waiting for shard {shard} on pageserver {ps.id}") + ps.http_client().timeline_checkpoint( + shard, timeline_id, compact=False, wait_until_uploaded=True + ) + + ps.http_client().deletion_queue_flush(execute=True) + + # Create a second timeline so that when we delete the first one, child shards still have some content in S3. 
+ # + # This is a limitation of the scrubber: if a shard isn't in S3 (because it has no timelines), then the scrubber + # doesn't know about it, and won't perceive its ancestors as ancestors. + other_timeline_id = TimelineId.generate() + env.storage_controller.pageserver_api().timeline_create( + PgVersion.NOT_SET, tenant_id, other_timeline_id + ) + + # The timeline still exists in child shards and they reference its layers, so scrubbing + # now shouldn't delete anything. + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + assert gc_summary["ancestor_layers_deleted"] == 0 + + # Delete the timeline + env.storage_controller.pageserver_api().timeline_delete(tenant_id, timeline_id) + + # Subsequently doing physical GC should clean up the ancestor layers + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + assert gc_summary["ancestor_layers_deleted"] > 0 + + +def test_scrubber_physical_gc_ancestors_split(neon_env_builder: NeonEnvBuilder): + """ + Exercise ancestor GC while a tenant is partly split: this test ensures that if we have some child shards + which don't reference an ancestor, but some child shards that don't exist yet, then we do not incorrectly + GC any ancestor layers. 
+ """ + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.num_pageservers = 2 + + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + initial_shard_count = 2 + env.neon_cli.create_tenant( + tenant_id, + timeline_id, + shard_count=initial_shard_count, + conf={ + # Small layers and low compaction thresholds, so that when we split we can expect some to + # be dropped by child shards + "checkpoint_distance": f"{1024 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{1024 * 1024}", + "image_creation_threshold": "2", + "image_layer_creation_check_threshold": "0", + # Disable background compaction, we will do it explicitly + "compaction_period": "0s", + # No PITR, so that as soon as child shards generate an image layer, it covers ancestor deltas + # and makes them GC'able + "pitr_interval": "0s", + }, + ) + + unstuck = threading.Event() + + def stuck_split(): + # Pause our shard split after the first shard but before the second, such that when we run + # the scrub, the S3 bucket contains shards 0002, 0101, 0004, 0204 (but not 0104, 0304). 
+ env.storage_controller.configure_failpoints( + ("shard-split-post-remote-sleep", "return(3600000)") + ) + try: + split_response = env.storage_controller.tenant_shard_split(tenant_id, shard_count=4) + except Exception as e: + log.info(f"Split failed with {e}") + else: + if not unstuck.is_set(): + raise RuntimeError(f"Split succeeded unexpectedly ({split_response})") + + with ThreadPoolExecutor(max_workers=1) as threads: + log.info("Starting hung shard split") + stuck_split_fut = threads.submit(stuck_split) + + # Let the controller reach the failpoint + wait_until( + 10, + 1, + lambda: env.storage_controller.assert_log_contains( + 'failpoint "shard-split-post-remote-sleep": sleeping' + ), + ) + + # Run compaction on the new child shards, so that they drop some refs to their parent + child_shards = [ + TenantShardId(tenant_id, 0, 4), + TenantShardId(tenant_id, 2, 4), + ] + log.info("Compacting first two children") + for child in child_shards: + env.get_tenant_pageserver( + TenantShardId(tenant_id, 0, initial_shard_count) + ).http_client().timeline_compact(child, timeline_id) + + # Check that the other child shards weren't created + assert env.get_tenant_pageserver(TenantShardId(tenant_id, 1, 4)) is None + assert env.get_tenant_pageserver(TenantShardId(tenant_id, 3, 4)) is None + + # Run scrubber: it should not incorrectly interpret the **04 shards' lack of refs to all + # ancestor layers as a reason to GC them, because it should realize that a split is in progress. + # (GC requires that controller does not indicate split in progress, and that if we see the highest + # shard count N, then there are N shards present with that shard count). 
+ gc_output = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full") + log.info(f"Ran physical GC partway through split: {gc_output}") + assert gc_output["ancestor_layers_deleted"] == 0 + assert gc_output["remote_storage_errors"] == 0 + assert gc_output["controller_api_errors"] == 0 + + # Storage controller shutdown lets our split request client complete + log.info("Stopping storage controller") + unstuck.set() + env.storage_controller.allowed_errors.append(".*Timed out joining HTTP server task.*") + env.storage_controller.stop() + stuck_split_fut.result() + + # Restart the controller and retry the split with the failpoint disabled, this should + # complete successfully and result in an S3 state that allows the scrubber to proceed with removing ancestor layers + log.info("Starting & retrying split") + env.storage_controller.start() + env.storage_controller.tenant_shard_split(tenant_id, shard_count=4) + + # The other child shards exist now, we can compact them to drop refs to ancestor + log.info("Compacting second two children") + for child in [ + TenantShardId(tenant_id, 1, 4), + TenantShardId(tenant_id, 3, 4), + ]: + env.get_tenant_pageserver(child).http_client().timeline_compact(child, timeline_id) + + gc_output = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full") + log.info(f"Ran physical GC after split completed: {gc_output}") + assert gc_output["ancestor_layers_deleted"] > 0 + assert gc_output["remote_storage_errors"] == 0 + assert gc_output["controller_api_errors"] == 0 + + +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_scrubber_scan_pageserver_metadata( + neon_env_builder: NeonEnvBuilder, shard_count: Optional[int] +): + """ + Create some layers. Delete an object listed in index. Run scrubber and see if it detects the defect. + """ + + # Use s3_storage so we could test out scrubber. 
+ neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.num_pageservers = shard_count if shard_count is not None else 1 + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + + # Create some layers. + + workload = Workload(env, env.initial_tenant, env.initial_timeline) + workload.init() + + for _ in range(3): + workload.write_rows(128) + + for pageserver in env.pageservers: + pageserver.stop() + pageserver.start() + + for _ in range(3): + workload.write_rows(128) + + # Get the latest index for a particular timeline. + + tenant_shard_id = TenantShardId(env.initial_tenant, 0, shard_count if shard_count else 0) + + assert isinstance(env.pageserver_remote_storage, S3Storage) + timeline_path = env.pageserver_remote_storage.timeline_path( + tenant_shard_id, env.initial_timeline + ) + + client = env.pageserver_remote_storage.client + bucket = env.pageserver_remote_storage.bucket_name + objects = client.list_objects_v2(Bucket=bucket, Prefix=f"{timeline_path}/", Delimiter="").get( + "Contents", [] + ) + keys = [obj["Key"] for obj in objects] + index_keys = list(filter(lambda s: s.startswith(f"{timeline_path}/index_part"), keys)) + assert len(index_keys) > 0 + + latest_index_key = env.pageserver_remote_storage.get_latest_index_key(index_keys) + log.info(f"{latest_index_key=}") + + index = env.pageserver_remote_storage.download_index_part(latest_index_key) + + assert len(index.layer_metadata) > 0 + it = iter(index.layer_metadata.items()) + + healthy, scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True) + assert healthy + + assert env.storage_controller.metadata_health_is_healthy() + + # Delete a layer file that is listed in the index. 
+ layer, metadata = next(it) + log.info(f"Deleting {timeline_path}/{layer.to_str()}") + delete_response = client.delete_object( + Bucket=bucket, + Key=f"{timeline_path}/{layer.to_str()}-{metadata.generation:08x}", + ) + log.info(f"delete response: {delete_response}") + + # Check scan summary without posting to storage controller. Expect it to be a L0 layer so only emit warnings. + _, scan_summary = env.storage_scrubber.scan_metadata() + log.info(f"{pprint.pformat(scan_summary)}") + assert len(scan_summary["with_warnings"]) > 0 + + assert env.storage_controller.metadata_health_is_healthy() + + # Now post to storage controller, expect seeing one unhealthy health record + _, scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True) + log.info(f"{pprint.pformat(scan_summary)}") + assert len(scan_summary["with_warnings"]) > 0 + + unhealthy = env.storage_controller.metadata_health_list_unhealthy()["unhealthy_tenant_shards"] + assert len(unhealthy) == 1 and unhealthy[0] == str(tenant_shard_id) + + neon_env_builder.disable_scrub_on_exit() diff --git a/test_runner/regress/test_subscriber_restart.py b/test_runner/regress/test_subscriber_restart.py index d7f3962620..91caad7220 100644 --- a/test_runner/regress/test_subscriber_restart.py +++ b/test_runner/regress/test_subscriber_restart.py @@ -54,4 +54,4 @@ def test_subscriber_restart(neon_simple_env: NeonEnv): pcur.execute(f"INSERT into t values ({n_records}, 0)") n_records += 1 with sub.cursor() as scur: - wait_until(10, 0.5, check_that_changes_propagated) + wait_until(60, 0.5, check_that_changes_propagated) diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 2cbb036c0d..9fb7324fa1 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -1,16 +1,14 @@ import json -from contextlib import closing from typing import Any, Dict -import psycopg2.extras from fixtures.common_types import Lsn -from 
fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, ) from fixtures.pageserver.utils import assert_tenant_state, wait_for_upload from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind from fixtures.utils import wait_until +from fixtures.workload import Workload def test_tenant_config(neon_env_builder: NeonEnvBuilder): @@ -63,25 +61,6 @@ def test_tenant_config(neon_env_builder: NeonEnvBuilder): # check the configuration of the default tenant # it should match global configuration - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - log.info(f"show {env.initial_tenant}") - pscur.execute(f"show {env.initial_tenant}") - res = pscur.fetchone() - assert res is not None - assert all( - i in res.items() - for i in { - "checkpoint_distance": 10000, - "compaction_target_size": 1048576, - "compaction_period": 20, - "compaction_threshold": 10, - "gc_horizon": 67108864, - "gc_period": 60 * 60, - "image_creation_threshold": 3, - "pitr_interval": 604800, # 7 days - }.items() - ), f"Unexpected res: {res}" default_tenant_config = http_client.tenant_config(tenant_id=env.initial_tenant) assert ( not default_tenant_config.tenant_specific_overrides @@ -103,25 +82,6 @@ def test_tenant_config(neon_env_builder: NeonEnvBuilder): } # check the configuration of the new tenant - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - pscur.execute(f"show {tenant}") - res = pscur.fetchone() - log.info(f"res: {res}") - assert res is not None - assert all( - i in res.items() - for i in { - "checkpoint_distance": 20000, - "compaction_target_size": 1048576, - "compaction_period": 20, - "compaction_threshold": 10, - "gc_horizon": 67108864, - "gc_period": 30, - "image_creation_threshold": 3, - "pitr_interval": 604800, - }.items() - ), f"Unexpected res: {res}" new_tenant_config = 
http_client.tenant_config(tenant_id=tenant) new_specific_config = new_tenant_config.tenant_specific_overrides assert new_specific_config["checkpoint_distance"] == 20000 @@ -166,25 +126,6 @@ def test_tenant_config(neon_env_builder: NeonEnvBuilder): conf=conf_update, ) - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - pscur.execute(f"show {tenant}") - res = pscur.fetchone() - log.info(f"after config res: {res}") - assert res is not None - assert all( - i in res.items() - for i in { - "checkpoint_distance": 15000, - "compaction_target_size": 1048576, - "compaction_period": 80, - "compaction_threshold": 10, - "gc_horizon": 67108864, - "gc_period": 80, - "image_creation_threshold": 2, - "pitr_interval": 604800, - }.items() - ), f"Unexpected res: {res}" updated_tenant_config = http_client.tenant_config(tenant_id=tenant) updated_specific_config = updated_tenant_config.tenant_specific_overrides assert updated_specific_config["checkpoint_distance"] == 15000 @@ -222,25 +163,6 @@ def test_tenant_config(neon_env_builder: NeonEnvBuilder): env.pageserver.stop() env.pageserver.start() - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - pscur.execute(f"show {tenant}") - res = pscur.fetchone() - log.info(f"after restart res: {res}") - assert res is not None - assert all( - i in res.items() - for i in { - "checkpoint_distance": 15000, - "compaction_target_size": 1048576, - "compaction_period": 80, - "compaction_threshold": 10, - "gc_horizon": 67108864, - "gc_period": 80, - "image_creation_threshold": 2, - "pitr_interval": 604800, - }.items() - ), f"Unexpected res: {res}" restarted_tenant_config = http_client.tenant_config(tenant_id=tenant) assert ( restarted_tenant_config == updated_tenant_config @@ -283,19 +205,10 @@ def test_tenant_config(neon_env_builder: NeonEnvBuilder): env.pageserver.stop() env.pageserver.start() - 
with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - pscur.execute(f"show {tenant}") - res = pscur.fetchone() - log.info(f"after restart res: {res}") - assert res is not None - assert all( - i in res.items() - for i in { - "compaction_period": 20, - "pitr_interval": 60, - }.items() - ), f"Unexpected res: {res}" + restarted_final_tenant_config = http_client.tenant_config(tenant_id=tenant) + assert ( + restarted_final_tenant_config == final_tenant_config + ), "Updated config should not change after the restart" def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder): @@ -320,10 +233,6 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder): assert not config_path.exists(), "detach did not remove config file" - # The re-attach's increment of the generation number may invalidate deletion queue - # updates in flight from the previous attachment. - env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - env.pageserver.tenant_attach(tenant_id) wait_until( number_of_iterations=5, @@ -357,6 +266,13 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold( (tenant_id, timeline_id) = env.initial_tenant, env.initial_timeline ps_http = env.pageserver.http_client() + # When we evict/download layers, we will use this Workload to generate getpage requests + # that touch some layers, as otherwise the pageserver doesn't report totally unused layers + # as problems when they have short residence duration. 
+ workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(100) + def get_metric(): metrics = ps_http.get_metrics() metric = metrics.query_one( @@ -377,6 +293,7 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold( assert default_value == "1day" ps_http.download_all_layers(tenant_id, timeline_id) + workload.validate() ps_http.evict_all_layers(tenant_id, timeline_id) metric = get_metric() assert int(metric.value) > 0, "metric is updated" @@ -397,6 +314,7 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold( assert int(metric.value) == 0 ps_http.download_all_layers(tenant_id, timeline_id) + workload.validate() ps_http.evict_all_layers(tenant_id, timeline_id) metric = get_metric() assert int(metric.labels["low_threshold_secs"]) == 2 * 24 * 60 * 60 @@ -410,6 +328,7 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold( assert int(metric.value) == 0, "value resets if label changes" ps_http.download_all_layers(tenant_id, timeline_id) + workload.validate() ps_http.evict_all_layers(tenant_id, timeline_id) metric = get_metric() assert int(metric.labels["low_threshold_secs"]) == 2 * 60 * 60 diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index fd3cc45c3f..dadf5ca672 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -1,17 +1,10 @@ -import concurrent.futures -import enum -import os -import shutil from threading import Thread import pytest from fixtures.common_types import Lsn, TenantId, TimelineId -from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, - StorageScrubber, - last_flush_lsn_upload, wait_for_last_flush_lsn, ) from fixtures.pageserver.http import PageserverApiException @@ -19,18 +12,37 @@ from fixtures.pageserver.utils import ( MANY_SMALL_LAYERS_TENANT_CONFIG, assert_prefix_empty, assert_prefix_not_empty, - 
poll_for_remote_storage_iterations, - tenant_delete_wait_completed, wait_for_upload, - wait_tenant_status_404, - wait_until_tenant_active, - wait_until_tenant_state, ) -from fixtures.remote_storage import RemoteStorageKind, available_s3_storages, s3_storage +from fixtures.remote_storage import RemoteStorageKind, s3_storage from fixtures.utils import run_pg_bench_small, wait_until from requests.exceptions import ReadTimeout +def error_tolerant_delete(ps_http, tenant_id): + """ + For tests that inject 500 errors, we must retry repeatedly when issuing deletions + """ + while True: + try: + ps_http.tenant_delete(tenant_id=tenant_id) + except PageserverApiException as e: + if e.status_code == 500: + # This test uses failure injection, which can produce 500s as the pageserver expects + # the object store to always be available, and the ListObjects during deletion is generally + # an infallible operation. This can show up as a clear simulated error, or as a general + # error during delete_objects() + assert ( + "simulated failure of remote operation" in e.message + or "failed to delete" in e.message + ) + else: + raise + else: + # Success, drop out + break + + def test_tenant_delete_smoke( neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, @@ -54,26 +66,13 @@ def test_tenant_delete_smoke( # first try to delete non existing tenant tenant_id = TenantId.generate() - env.pageserver.allowed_errors.append(".*NotFound.*") - env.pageserver.allowed_errors.append(".*simulated failure.*") + env.pageserver.allowed_errors.extend( + [".*NotFound.*", ".*simulated failure.*", ".*failed to delete .+ objects.*"] + ) # Check that deleting a non-existent tenant gives the expected result: this is a loop because we # may need to retry on some remote storage errors injected by the test harness - while True: - try: - ps_http.tenant_delete(tenant_id=tenant_id) - except PageserverApiException as e: - if e.status_code == 500: - # This test uses failure injection, which can produce 500s as the 
pageserver expects - # the object store to always be available, and the ListObjects during deletion is generally - # an infallible operation - assert "simulated failure of remote operation" in e.message - elif e.status_code == 404: - # This is our expected result: trying to erase a non-existent tenant gives us 404 - assert "NotFound" in e.message - break - else: - raise + error_tolerant_delete(ps_http, tenant_id) env.neon_cli.create_tenant( tenant_id=tenant_id, @@ -108,10 +107,8 @@ def test_tenant_delete_smoke( # Upload a heatmap so that we exercise deletion of that too ps_http.tenant_heatmap_upload(tenant_id) - iterations = poll_for_remote_storage_iterations(remote_storage_kind) - assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 2 - tenant_delete_wait_completed(ps_http, tenant_id, iterations) + error_tolerant_delete(ps_http, tenant_id) assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 tenant_path = env.pageserver.tenant_dir(tenant_id) @@ -129,287 +126,10 @@ def test_tenant_delete_smoke( # Deletion updates the tenant count: the one default tenant remains assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "inprogress"}) == 0 - -class Check(enum.Enum): - RETRY_WITHOUT_RESTART = enum.auto() - RETRY_WITH_RESTART = enum.auto() - - -FAILPOINTS = [ - "tenant-delete-before-shutdown", - "tenant-delete-before-create-remote-mark", - "tenant-delete-before-create-local-mark", - "tenant-delete-before-background", - "tenant-delete-before-polling-ongoing-deletions", - "tenant-delete-before-cleanup-remaining-fs-traces", - "tenant-delete-before-remove-timelines-dir", - "tenant-delete-before-remove-deleted-mark", - "tenant-delete-before-remove-tenant-dir", - # Some failpoints from timeline deletion - "timeline-delete-before-index-deleted-at", - "timeline-delete-before-rm", - 
"timeline-delete-before-index-delete", -] - -FAILPOINTS_BEFORE_BACKGROUND = [ - "timeline-delete-before-schedule", - "tenant-delete-before-shutdown", - "tenant-delete-before-create-remote-mark", - "tenant-delete-before-create-local-mark", - "tenant-delete-before-background", -] - - -def combinations(): - result = [] - - remotes = available_s3_storages() - - for remote_storage_kind in remotes: - for delete_failpoint in FAILPOINTS: - # Simulate failures for only one type of remote storage - # to avoid log pollution and make tests run faster - if remote_storage_kind is RemoteStorageKind.MOCK_S3: - simulate_failures = True - else: - simulate_failures = False - result.append((remote_storage_kind, delete_failpoint, simulate_failures)) - return result - - -@pytest.mark.parametrize("check", list(Check)) -@pytest.mark.parametrize("remote_storage_kind, failpoint, simulate_failures", combinations()) -def test_delete_tenant_exercise_crash_safety_failpoints( - neon_env_builder: NeonEnvBuilder, - remote_storage_kind: RemoteStorageKind, - failpoint: str, - simulate_failures: bool, - check: Check, - pg_bin: PgBin, -): - if simulate_failures: - neon_env_builder.pageserver_config_override = "test_remote_failures=1" - - neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) - - env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) - - tenant_id = env.initial_tenant - - env.pageserver.allowed_errors.extend( - [ - # From deletion polling - f".*NotFound: tenant {env.initial_tenant}.*", - # allow errors caused by failpoints - f".*failpoint: {failpoint}", - # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped - ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", - # We may leave some upload tasks in the queue. They're likely deletes. 
- # For uploads we explicitly wait with `last_flush_lsn_upload` below. - # So by ignoring these instead of waiting for empty upload queue - # we execute more distinct code paths. - '.*stopping left-over name="remote upload".*', - # an on-demand is cancelled by shutdown - ".*initial size calculation failed: downloading failed, possibly for shutdown", - ] - ) - - if simulate_failures: - env.pageserver.allowed_errors.append( - # The deletion queue will complain when it encounters simulated S3 errors - ".*deletion executor: DeleteObjects request failed.*", - ) - - ps_http = env.pageserver.http_client() - - timeline_id = env.neon_cli.create_timeline("delete", tenant_id=tenant_id) - with env.endpoints.create_start("delete", tenant_id=tenant_id) as endpoint: - # generate enough layers - run_pg_bench_small(pg_bin, endpoint.connstr()) - last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) - - assert_prefix_not_empty( - neon_env_builder.pageserver_remote_storage, - prefix="/".join( - ( - "tenants", - str(tenant_id), - ) - ), - ) - - ps_http.configure_failpoints((failpoint, "return")) - - iterations = poll_for_remote_storage_iterations(remote_storage_kind) - - # These failpoints are earlier than background task is spawned. - # so they result in api request failure. 
- if failpoint in FAILPOINTS_BEFORE_BACKGROUND: - with pytest.raises(PageserverApiException, match=failpoint): - ps_http.tenant_delete(tenant_id) - - else: - ps_http.tenant_delete(tenant_id) - tenant_info = wait_until_tenant_state( - pageserver_http=ps_http, - tenant_id=tenant_id, - expected_state="Broken", - iterations=iterations, - ) - - reason = tenant_info["state"]["data"]["reason"] - log.info(f"tenant broken: {reason}") - - # failpoint may not be the only error in the stack - assert reason.endswith(f"failpoint: {failpoint}"), reason - - if check is Check.RETRY_WITH_RESTART: - env.pageserver.restart() - - if failpoint in ( - "tenant-delete-before-shutdown", - "tenant-delete-before-create-remote-mark", - ): - wait_until_tenant_active( - ps_http, tenant_id=tenant_id, iterations=iterations, period=0.25 - ) - tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations) - else: - # Pageserver should've resumed deletion after restart. - wait_tenant_status_404(ps_http, tenant_id, iterations=iterations + 10) - elif check is Check.RETRY_WITHOUT_RESTART: - # this should succeed - # this also checks that delete can be retried even when tenant is in Broken state - ps_http.configure_failpoints((failpoint, "off")) - - tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations) - - tenant_dir = env.pageserver.tenant_dir(tenant_id) - # Check local is empty - assert not tenant_dir.exists() - - # Check remote is empty - assert_prefix_empty( - neon_env_builder.pageserver_remote_storage, - prefix="/".join( - ( - "tenants", - str(tenant_id), - ) - ), - allowed_postfix="initdb.tar.zst", - ) - - -def test_tenant_delete_is_resumed_on_attach( - neon_env_builder: NeonEnvBuilder, - pg_bin: PgBin, -): - remote_storage_kind = s3_storage() - neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) - - env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) - env.pageserver.allowed_errors.append( - # lucky race with stopping 
from flushing a layer we fail to schedule any uploads - ".*layer flush task.+: could not flush frozen layer: update_metadata_file" - ) - - tenant_id = env.initial_tenant - - ps_http = env.pageserver.http_client() - # create two timelines - for timeline in ["first", "second"]: - timeline_id = env.neon_cli.create_timeline(timeline, tenant_id=tenant_id) - with env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint: - run_pg_bench_small(pg_bin, endpoint.connstr()) - wait_for_last_flush_lsn(env, endpoint, tenant=tenant_id, timeline=timeline_id) - - # sanity check, data should be there - assert_prefix_not_empty( - neon_env_builder.pageserver_remote_storage, - prefix="/".join( - ( - "tenants", - str(tenant_id), - ) - ), - ) - - # failpoint before we remove index_part from s3 - failpoint = "timeline-delete-before-index-delete" - ps_http.configure_failpoints((failpoint, "return")) - - env.pageserver.allowed_errors.extend( - ( - # allow errors caused by failpoints - f".*failpoint: {failpoint}", - # From deletion polling - f".*NotFound: tenant {env.initial_tenant}.*", - # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped - ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", - # error from http response is also logged - ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*", - '.*shutdown_pageserver{exit_code=0}: stopping left-over name="remote upload".*', - ) - ) - - iterations = poll_for_remote_storage_iterations(remote_storage_kind) - - ps_http.tenant_delete(tenant_id) - - tenant_info = wait_until_tenant_state( - pageserver_http=ps_http, - tenant_id=tenant_id, - expected_state="Broken", - iterations=iterations, - ) - - assert_prefix_not_empty( - neon_env_builder.pageserver_remote_storage, - prefix="/".join( - ( - "tenants", - str(tenant_id), - ) - ), - ) - - reason = 
tenant_info["state"]["data"]["reason"] - # failpoint may not be the only error in the stack - assert reason.endswith(f"failpoint: {failpoint}"), reason - - # now we stop pageserver and remove local tenant state - env.endpoints.stop_all() env.pageserver.stop() - dir_to_clear = env.pageserver.tenant_dir() - shutil.rmtree(dir_to_clear) - os.mkdir(dir_to_clear) - - env.pageserver.start() - - # now we call attach - env.pageserver.tenant_attach(tenant_id=tenant_id) - - # delete should be resumed - wait_tenant_status_404(ps_http, tenant_id, iterations) - - # we shouldn've created tenant dir on disk - tenant_path = env.pageserver.tenant_dir(tenant_id) - assert not tenant_path.exists() - - ps_http.deletion_queue_flush(execute=True) - assert_prefix_empty( - neon_env_builder.pageserver_remote_storage, - prefix="/".join( - ( - "tenants", - str(tenant_id), - ) - ), - ) - def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonEnvBuilder): """Reproduction of 2023-11-23 stuck tenants investigation""" @@ -482,110 +202,10 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE if deletion is not None: deletion.join() - -def test_tenant_delete_concurrent( - neon_env_builder: NeonEnvBuilder, - pg_bin: PgBin, -): - """ - Validate that concurrent delete requests to the same tenant behave correctly: - exactly one should execute: the rest should give 202 responses but not start - another deletion. 
- - This is a reproducer for https://github.com/neondatabase/neon/issues/5936 - """ - neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) - env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) - ps_http = env.pageserver.http_client() - tenant_id = env.initial_tenant - timeline_id = env.initial_timeline - - # Populate some data - with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: - run_pg_bench_small(pg_bin, endpoint.connstr()) - last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) - - env.pageserver.allowed_errors.extend( - [ - # lucky race with stopping from flushing a layer we fail to schedule any uploads - ".*layer flush task.+: could not flush frozen layer: update_metadata_file", - ] - ) - - BEFORE_REMOVE_FAILPOINT = "tenant-delete-before-map-remove" - BEFORE_RUN_FAILPOINT = "tenant-delete-before-run" - - # We will let the initial delete run until right before it would remove - # the tenant's TenantSlot. This pauses it in a state where the tenant - # is visible in Stopping state, and concurrent requests should fail with 4xx. - ps_http.configure_failpoints((BEFORE_REMOVE_FAILPOINT, "pause")) - - def delete_tenant(): - return ps_http.tenant_delete(tenant_id) - - def hit_remove_failpoint(): - return env.pageserver.assert_log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}")[1] - - def hit_run_failpoint(): - env.pageserver.assert_log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}") - - with concurrent.futures.ThreadPoolExecutor() as executor: - background_200_req = executor.submit(delete_tenant) - assert background_200_req.result(timeout=10).status_code == 202 - - # Wait until the first request completes its work and is blocked on removing - # the TenantSlot from tenant manager. 
- log_cursor = wait_until(100, 0.1, hit_remove_failpoint) - assert log_cursor is not None - - # Start another request: this should succeed without actually entering the deletion code - ps_http.tenant_delete(tenant_id) - assert not env.pageserver.log_contains( - f"at failpoint {BEFORE_RUN_FAILPOINT}", offset=log_cursor - ) - - # Start another background request, which will pause after acquiring a TenantSlotGuard - # but before completing. - ps_http.configure_failpoints((BEFORE_RUN_FAILPOINT, "pause")) - background_4xx_req = executor.submit(delete_tenant) - wait_until(100, 0.1, hit_run_failpoint) - - # The TenantSlot is still present while the original request is hung before - # final removal - assert ( - ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 - ) - - # Permit the original request to run to success - ps_http.configure_failpoints((BEFORE_REMOVE_FAILPOINT, "off")) - - # Permit the duplicate background request to run to completion and fail. - ps_http.configure_failpoints((BEFORE_RUN_FAILPOINT, "off")) - background_4xx_req.result(timeout=10) - assert not env.pageserver.log_contains( - f"at failpoint {BEFORE_RUN_FAILPOINT}", offset=log_cursor - ) - - # Physical deletion should have happened - assert_prefix_empty( - neon_env_builder.pageserver_remote_storage, - prefix="/".join( - ( - "tenants", - str(tenant_id), - ) - ), - ) - - # Zero tenants remain (we deleted the default tenant) - assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0 - assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "inprogress"}) == 0 + env.pageserver.stop() -def test_tenant_delete_races_timeline_creation( - neon_env_builder: NeonEnvBuilder, - pg_bin: PgBin, -): +def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder): """ Validate that timeline creation executed in parallel with deletion works correctly. 
@@ -674,9 +294,7 @@ def test_tenant_delete_races_timeline_creation( # Disable the failpoint and wait for deletion to finish ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off")) - iterations = poll_for_remote_storage_iterations(remote_storage_kind) - - tenant_delete_wait_completed(ps_http, tenant_id, iterations, ignore_errors=True) + ps_http.tenant_delete(tenant_id) # Physical deletion should have happened assert_prefix_empty( @@ -698,6 +316,11 @@ def test_tenant_delete_races_timeline_creation( # Zero tenants remain (we deleted the default tenant) assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0 + # We deleted our only tenant, and the scrubber fails if it detects nothing + neon_env_builder.disable_scrub_on_exit() + + env.pageserver.stop() + def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder): """ @@ -707,7 +330,6 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder) remote_storage_kind = RemoteStorageKind.MOCK_S3 neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) - scrubber = StorageScrubber(neon_env_builder) env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) ps_http = env.pageserver.http_client() @@ -722,14 +344,13 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder) wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn) env.stop() - result = scrubber.scan_metadata() - assert result["with_warnings"] == [] + healthy, _ = env.storage_scrubber.scan_metadata() + assert healthy env.start() ps_http = env.pageserver.http_client() - iterations = poll_for_remote_storage_iterations(remote_storage_kind) - tenant_delete_wait_completed(ps_http, tenant_id, iterations) + ps_http.tenant_delete(tenant_id) env.stop() - scrubber.scan_metadata() - assert result["with_warnings"] == [] + healthy, _ = env.storage_scrubber.scan_metadata() + assert healthy diff --git 
a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 871351b2d5..b165588636 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -76,10 +76,6 @@ def test_tenant_reattach(neon_env_builder: NeonEnvBuilder, mode: str): env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) - # Our re-attach may race with the deletion queue processing LSN updates - # from the original attachment. - env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: with endpoint.cursor() as cur: cur.execute("CREATE TABLE t(key int primary key, value text)") @@ -275,16 +271,6 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) - # first check for non existing tenant - tenant_id = TenantId.generate() - with pytest.raises( - expected_exception=PageserverApiException, - match=f"NotFound: tenant {tenant_id}", - ) as excinfo: - pageserver_http.tenant_detach(tenant_id) - - assert excinfo.value.status_code == 404 - # create new nenant tenant_id, timeline_id = env.neon_cli.create_tenant() @@ -344,94 +330,6 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): pageserver_http.timeline_gc(tenant_id, timeline_id, 0) -# Creates and ignores a tenant, then detaches it: first, with no parameters (should fail), -# then with parameters to force ignored tenant detach (should not fail). 
-def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv): - env = neon_simple_env - client = env.pageserver.http_client() - - # create a new tenant - tenant_id, _ = env.neon_cli.create_tenant() - - env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) - - # assert tenant exists on disk - assert env.pageserver.tenant_dir(tenant_id).exists() - - endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) - # we rely upon autocommit after each statement - endpoint.safe_psql_many( - queries=[ - "CREATE TABLE t(key int primary key, value text)", - "INSERT INTO t SELECT generate_series(1,100000), 'payload'", - ] - ) - - # ignore tenant - client.tenant_ignore(tenant_id) - env.pageserver.allowed_errors.append(".*NotFound: tenant .*") - # ensure tenant couldn't be detached without the special flag for ignored tenant - log.info("detaching ignored tenant WITHOUT required flag") - with pytest.raises( - expected_exception=PageserverApiException, match=f"NotFound: tenant {tenant_id}" - ): - client.tenant_detach(tenant_id) - - log.info("tenant detached failed as expected") - - # ensure tenant is detached with ignore state - log.info("detaching ignored tenant with required flag") - client.tenant_detach(tenant_id, True) - log.info("ignored tenant detached without error") - - # check that nothing is left on disk for deleted tenant - assert not env.pageserver.tenant_dir(tenant_id).exists() - - # assert the tenant does not exists in the Pageserver - tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()] - assert ( - tenant_id not in tenants_after_detach - ), f"Ignored and then detached tenant {tenant_id} should not be present in pageserver's memory" - - -# Creates a tenant, and detaches it with extra paremeter that forces ignored tenant detach. -# Tenant should be detached without issues. 
-def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv): - env = neon_simple_env - client = env.pageserver.http_client() - - # create a new tenant - tenant_id, _ = env.neon_cli.create_tenant() - - env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) - - # assert tenant exists on disk - assert env.pageserver.tenant_dir(tenant_id).exists() - - endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) - # we rely upon autocommit after each statement - endpoint.safe_psql_many( - queries=[ - "CREATE TABLE t(key int primary key, value text)", - "INSERT INTO t SELECT generate_series(1,100000), 'payload'", - ] - ) - - log.info("detaching regular tenant with detach ignored flag") - client.tenant_detach(tenant_id, True) - - log.info("regular tenant detached without error") - - # check that nothing is left on disk for deleted tenant - assert not env.pageserver.tenant_dir(tenant_id).exists() - - # assert the tenant does not exists in the Pageserver - tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()] - assert ( - tenant_id not in tenants_after_detach - ), f"Ignored and then detached tenant {tenant_id} should not be present in pageserver's memory" - - def test_detach_while_attaching( neon_env_builder: NeonEnvBuilder, ): @@ -447,10 +345,6 @@ def test_detach_while_attaching( env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) - # Our re-attach may race with the deletion queue processing LSN updates - # from the original attachment. - env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - # Create table, and insert some rows. 
Make it big enough that it doesn't fit in # shared_buffers, otherwise the SELECT after restart will just return answer # from shared_buffers without hitting the page server, which defeats the point @@ -500,153 +394,6 @@ def test_detach_while_attaching( cur.execute("SELECT COUNT(*) FROM foo") -# Tests that `ignore` and `get` operations' combination is able to remove and restore the tenant in pageserver's memory. -# * writes some data into tenant's timeline -# * ensures it's synced with the remote storage -# * `ignore` the tenant -# * verify that ignored tenant files are generally unchanged, only an ignored mark had appeared -# * verify the ignored tenant is gone from pageserver's memory -# * restart the pageserver and verify that ignored tenant is still not loaded -# * `load` the same tenant -# * ensure that it's status is `Active` and it's present in pageserver's memory with all timelines -def test_ignored_tenant_reattach(neon_env_builder: NeonEnvBuilder): - neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) - env = neon_env_builder.init_start() - pageserver_http = env.pageserver.http_client() - - ignored_tenant_id, _ = env.neon_cli.create_tenant() - tenant_dir = env.pageserver.tenant_dir(ignored_tenant_id) - tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()] - tenants_before_ignore.sort() - timelines_before_ignore = [ - timeline["timeline_id"] - for timeline in pageserver_http.timeline_list(tenant_id=ignored_tenant_id) - ] - files_before_ignore = [tenant_path for tenant_path in tenant_dir.glob("**/*")] - - # ignore the tenant and veirfy it's not present in pageserver replies, with its files still on disk - pageserver_http.tenant_ignore(ignored_tenant_id) - - files_after_ignore_with_retain = [tenant_path for tenant_path in tenant_dir.glob("**/*")] - new_files = set(files_after_ignore_with_retain) - set(files_before_ignore) - disappeared_files = set(files_before_ignore) - set(files_after_ignore_with_retain) - 
assert ( - len(disappeared_files) == 0 - ), f"Tenant ignore should not remove files from disk, missing: {disappeared_files}" - assert ( - len(new_files) == 1 - ), f"Only tenant ignore file should appear on disk but got: {new_files}" - - tenants_after_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()] - assert ignored_tenant_id not in tenants_after_ignore, "Ignored tenant should be missing" - assert len(tenants_after_ignore) + 1 == len( - tenants_before_ignore - ), "Only ignored tenant should be missing" - - # restart the pageserver to ensure we don't load the ignore timeline - env.pageserver.stop() - env.pageserver.start() - tenants_after_restart = [tenant["id"] for tenant in pageserver_http.tenant_list()] - tenants_after_restart.sort() - assert ( - tenants_after_restart == tenants_after_ignore - ), "Ignored tenant should not be reloaded after pageserver restart" - - # now, load it from the local files and expect it works - env.pageserver.tenant_load(tenant_id=ignored_tenant_id) - wait_until_tenant_state(pageserver_http, ignored_tenant_id, "Active", 5) - - tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()] - tenants_after_attach.sort() - assert tenants_after_attach == tenants_before_ignore, "Should have all tenants back" - - timelines_after_ignore = [ - timeline["timeline_id"] - for timeline in pageserver_http.timeline_list(tenant_id=ignored_tenant_id) - ] - assert timelines_before_ignore == timelines_after_ignore, "Should have all timelines back" - - -# Tests that it's possible to `load` tenants with missing layers and get them restored: -# * writes some data into tenant's timeline -# * ensures it's synced with the remote storage -# * `ignore` the tenant -# * removes all timeline's local layers -# * `load` the same tenant -# * ensure that it's status is `Active` -# * check that timeline data is restored -def test_ignored_tenant_download_missing_layers(neon_env_builder: NeonEnvBuilder): - 
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - env = neon_env_builder.init_start() - pageserver_http = env.pageserver.http_client() - endpoint = env.endpoints.create_start("main") - - tenant_id = env.initial_tenant - timeline_id = env.initial_timeline - - env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) - - data_id = 1 - data_secret = "very secret secret" - insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, endpoint) - - tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()] - tenants_before_ignore.sort() - timelines_before_ignore = [ - timeline["timeline_id"] for timeline in pageserver_http.timeline_list(tenant_id=tenant_id) - ] - - # ignore the tenant and remove its layers - pageserver_http.tenant_ignore(tenant_id) - timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id) - layers_removed = False - for dir_entry in timeline_dir.iterdir(): - if dir_entry.name.startswith("00000"): - # Looks like a layer file. 
Remove it - dir_entry.unlink() - layers_removed = True - assert layers_removed, f"Found no layers for tenant {timeline_dir}" - - # now, load it from the local files and expect it to work due to remote storage restoration - env.pageserver.tenant_load(tenant_id=tenant_id) - wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5) - - tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()] - tenants_after_attach.sort() - assert tenants_after_attach == tenants_before_ignore, "Should have all tenants back" - - timelines_after_ignore = [ - timeline["timeline_id"] for timeline in pageserver_http.timeline_list(tenant_id=tenant_id) - ] - assert timelines_before_ignore == timelines_after_ignore, "Should have all timelines back" - - endpoint.stop() - endpoint.start() - ensure_test_data(data_id, data_secret, endpoint) - - -# Tests that attach is never working on a tenant, ignored or not, as long as it's not absent locally -# Similarly, tests that it's not possible to schedule a `load` for tenat that's not ignored. -def test_load_negatives(neon_env_builder: NeonEnvBuilder): - neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - env = neon_env_builder.init_start() - pageserver_http = env.pageserver.http_client() - env.endpoints.create_start("main") - - tenant_id = env.initial_tenant - - env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) - - env.pageserver.allowed_errors.append(".*tenant .*? 
already exists, state:.*") - with pytest.raises( - expected_exception=PageserverApiException, - match=f"tenant {tenant_id} already exists, state: Active", - ): - env.pageserver.tenant_load(tenant_id) - - pageserver_http.tenant_ignore(tenant_id) - - def test_detach_while_activating( neon_env_builder: NeonEnvBuilder, ): @@ -667,10 +414,6 @@ def test_detach_while_activating( env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) - # Our re-attach may race with the deletion queue processing LSN updates - # from the original attachment. - env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - data_id = 1 data_secret = "very secret secret" insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, endpoint) @@ -770,7 +513,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading( wait_until(10, 0.5, found_broken) - client.tenant_ignore(env.initial_tenant) + client.tenant_detach(env.initial_tenant) def found_cleaned_up(): m = client.get_metrics() @@ -782,7 +525,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading( wait_until(10, 0.5, found_cleaned_up) - env.pageserver.tenant_load(env.initial_tenant) + env.pageserver.tenant_attach(env.initial_tenant) def found_active(): m = client.get_metrics() diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index be289e03d6..43e9a0d36e 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -15,7 +15,6 @@ from fixtures.pageserver.utils import ( assert_tenant_state, wait_for_last_record_lsn, wait_for_upload, - wait_tenant_status_404, ) from fixtures.remote_storage import ( LocalFsStorage, @@ -204,8 +203,6 @@ def test_tenant_relocation( [ # Needed for detach polling on the original pageserver f".*NotFound: tenant {tenant_id}.*", - # We will dual-attach in this test, so stale generations are expected - ".*Dropped remote consistent LSN updates.*", ] 
) @@ -348,9 +345,6 @@ def test_tenant_relocation( # is no longer involved, and if it is, we will see the error origin_http.tenant_detach(tenant_id) - # Wait a little, so that the detach operation has time to finish. - wait_tenant_status_404(origin_http, tenant_id, iterations=100, interval=1) - post_migration_check(ep_main, 500500, old_local_path_main) post_migration_check(ep_second, 1001000, old_local_path_second) diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index a3dd422903..f872116a1c 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -10,12 +10,12 @@ from fixtures.neon_fixtures import ( Endpoint, NeonEnv, NeonEnvBuilder, + flush_ep_to_pageserver, wait_for_last_flush_lsn, wait_for_wal_insert_lsn, ) from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.pageserver.utils import ( - tenant_delete_wait_completed, timeline_delete_wait_completed, wait_until_tenant_active, ) @@ -669,7 +669,7 @@ def test_synthetic_size_while_deleting(neon_env_builder: NeonEnvBuilder): ), ) - tenant_delete_wait_completed(client, env.initial_tenant, 10) + client.tenant_delete(env.initial_tenant) client.configure_failpoints((failpoint, "off")) @@ -711,3 +711,118 @@ def mask_model_inputs(x): return newlist else: return x + + +@pytest.mark.parametrize("zero_gc", [True, False]) +def test_lsn_lease_size(neon_env_builder: NeonEnvBuilder, test_output_dir: Path, zero_gc: bool): + """ + Compare a LSN lease to a read-only branch for synthetic size calculation. + They should have the same effect. + """ + + def assert_size_approx_equal_for_lease_test(size_lease, size_branch): + """ + Tests that evaluate sizes are checking the pageserver space consumption + that sits many layers below the user input. The exact space needed + varies slightly depending on postgres behavior. 
+ + Rather than expecting postgres to be determinstic and occasionally + failing the test, we permit sizes for the same data to vary by a few pages. + """ + + # FIXME(yuchen): The delta is too large, used as temp solution to pass the test reliably. + # Investigate and reduce the threshold. + threshold = 22 * 8272 + + log.info( + f"delta: size_branch({size_branch}) - size_lease({size_lease}) = {size_branch - size_lease}" + ) + + assert size_lease == pytest.approx(size_branch, abs=threshold) + + conf = { + "pitr_interval": "0s" if zero_gc else "3600s", + "gc_period": "0s", + "compaction_period": "0s", + } + + env = neon_env_builder.init_start(initial_tenant_conf=conf) + + ro_branch_res = insert_with_action( + env, env.initial_tenant, env.initial_timeline, test_output_dir, action="branch" + ) + + tenant, timeline = env.neon_cli.create_tenant(conf=conf) + lease_res = insert_with_action(env, tenant, timeline, test_output_dir, action="lease") + + assert_size_approx_equal_for_lease_test(lease_res, ro_branch_res) + + # we are writing a lot, and flushing all of that to disk is not important for this test + env.stop(immediate=True) + + +def insert_with_action( + env: NeonEnv, + tenant: TenantId, + timeline: TimelineId, + test_output_dir: Path, + action: str, +) -> int: + """ + Inserts some data on the timeline, perform an action, and insert more data on the same timeline. + Returns the size at the end of the insertion. + + Valid actions: + - "lease": Acquires a lease. + - "branch": Creates a child branch but never writes to it. 
+ """ + + client = env.pageserver.http_client() + with env.endpoints.create_start( + "main", + tenant_id=tenant, + config_lines=["autovacuum=off"], + ) as ep: + initial_size = client.tenant_size(tenant) + log.info(f"initial size: {initial_size}") + + with ep.cursor() as cur: + cur.execute( + "CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" + ) + last_flush_lsn = wait_for_last_flush_lsn(env, ep, tenant, timeline) + + if action == "lease": + res = client.timeline_lsn_lease(tenant, timeline, last_flush_lsn) + log.info(f"result from lsn_lease api: {res}") + elif action == "branch": + ro_branch = env.neon_cli.create_branch( + "ro_branch", tenant_id=tenant, ancestor_start_lsn=last_flush_lsn + ) + log.info(f"{ro_branch=} created") + else: + raise AssertionError("Invalid action type, only `lease` and `branch`are accepted") + + with ep.cursor() as cur: + cur.execute( + "CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" + ) + cur.execute( + "CREATE TABLE t2 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" + ) + cur.execute( + "CREATE TABLE t3 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" + ) + + last_flush_lsn = wait_for_last_flush_lsn(env, ep, tenant, timeline) + + # Avoid flakiness when calculating logical size. 
+ flush_ep_to_pageserver(env, ep, tenant, timeline) + + size_after_action_and_insert = client.tenant_size(tenant) + log.info(f"{size_after_action_and_insert=}") + + size_debug_file = open(test_output_dir / f"size_debug_{action}.html", "w") + size_debug = client.tenant_size_debug(tenant) + size_debug_file.write(size_debug) + return size_after_action_and_insert diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 93e9ad3673..0ebf714de0 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -41,18 +41,28 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv): neon_simple_env.storage_controller.allowed_errors.extend(error_regexes) pageserver_http = neon_simple_env.pageserver.http_client() - pageserver_http.configure_failpoints(("tenant-config-before-write", "return")) - with pytest.raises(Exception, match="tenant-config-before-write"): - _ = neon_simple_env.neon_cli.create_tenant() + # Failure to write a config to local disk makes the pageserver assume that local disk is bad and abort the process + pageserver_http.configure_failpoints(("tenant-config-before-write", "return")) + + tenant_id = TenantId.generate() + + with pytest.raises(requests.exceptions.ConnectionError, match="Connection aborted"): + neon_simple_env.pageserver.http_client().tenant_attach(tenant_id=tenant_id, generation=1) + + # Any files left behind on disk during failed creation do not prevent + # a retry from succeeding. Restart pageserver with no failpoints. + neon_simple_env.pageserver.running = False + neon_simple_env.pageserver.start() + + # The failed creation should not be present in list of tenants, as when we start up we'll see + # an empty tenant dir with no config in it. 
+ neon_simple_env.pageserver.allowed_errors.append(".*Failed to load tenant config.*") new_tenants = sorted( map(lambda t: t.split()[0], neon_simple_env.neon_cli.list_tenants().stdout.splitlines()) ) assert initial_tenants == new_tenants, "should not create new tenants" - # Any files left behind on disk during failed creation do not prevent - # a retry from succeeding. - pageserver_http.configure_failpoints(("tenant-config-before-write", "off")) neon_simple_env.neon_cli.create_tenant() @@ -369,10 +379,6 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder): # generation nubmers out of order. env.pageserver.allowed_errors.append(".*Generation .+ is less than existing .+") - # Our multiple creation requests will advance generation quickly, and when we skip - # a generation number we can generate these warnings - env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates for tenant .+") - # Timeline::flush_and_shutdown cannot tell if it is hitting a failure because of # an incomplete attach, or some other problem. In the field this should be rare, # so we allow it to log at WARN, even if it is occasionally a false positive. 
diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py index 7bf49a0874..840c7159ad 100644 --- a/test_runner/regress/test_threshold_based_eviction.py +++ b/test_runner/regress/test_threshold_based_eviction.py @@ -48,13 +48,12 @@ def test_threshold_based_eviction( tenant_id, timeline_id = env.initial_tenant, env.initial_timeline ps_http = env.pageserver.http_client() - assert ps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == { - "kind": "NoEviction" - } + vps_http = env.storage_controller.pageserver_api() + assert vps_http.tenant_config(tenant_id).effective_config["eviction_policy"] is None - eviction_threshold = 5 - eviction_period = 1 - ps_http.set_tenant_config( + eviction_threshold = 10 + eviction_period = 2 + vps_http.set_tenant_config( tenant_id, { "eviction_policy": { @@ -64,7 +63,7 @@ def test_threshold_based_eviction( }, }, ) - assert ps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == { + assert vps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == { "kind": "LayerAccessThreshold", "threshold": f"{eviction_threshold}s", "period": f"{eviction_period}s", @@ -73,7 +72,7 @@ def test_threshold_based_eviction( # restart because changing tenant config is not instant env.pageserver.restart() - assert ps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == { + assert vps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == { "kind": "LayerAccessThreshold", "threshold": f"{eviction_threshold}s", "period": f"{eviction_period}s", @@ -81,7 +80,7 @@ def test_threshold_based_eviction( # create a bunch of L1s, only the least of which will need to be resident compaction_threshold = 3 # create L1 layers quickly - ps_http.patch_tenant_config_client_side( + vps_http.patch_tenant_config_client_side( tenant_id, inserts={ # Disable gc and compaction to avoid on-demand downloads from their side. 
@@ -154,7 +153,7 @@ def test_threshold_based_eviction( while time.time() - started_waiting_at < observation_window: current = ( time.time(), - MapInfoProjection(ps_http.layer_map_info(tenant_id, timeline_id)), + MapInfoProjection(vps_http.layer_map_info(tenant_id, timeline_id)), ) last = map_info_changes[-1] if map_info_changes else (0, None) if last[1] is None or current[1] != last[1]: diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index da37f469b3..6d96dda391 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -485,6 +485,9 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage), ) + # We deleted our only tenant, and the scrubber fails if it detects nothing + neon_env_builder.disable_scrub_on_exit() + @pytest.mark.parametrize( "stuck_failpoint", @@ -703,6 +706,9 @@ def test_timeline_delete_works_for_remote_smoke( # Assume it is mock server inconsistency and check twice. 
wait_until(2, 0.5, lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage)) + # We deleted our only tenant, and the scrubber fails if it detects nothing + neon_env_builder.disable_scrub_on_exit() + def test_delete_orphaned_objects( neon_env_builder: NeonEnvBuilder, diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index f0b2f7d733..d152d0f41f 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -1,22 +1,27 @@ import datetime import enum +import threading +import time from concurrent.futures import ThreadPoolExecutor from queue import Empty, Queue from threading import Barrier -from typing import List, Tuple +from typing import List, Set, Tuple import pytest from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( + LogCursor, NeonEnvBuilder, PgBin, + flush_ep_to_pageserver, wait_for_last_flush_lsn, ) from fixtures.pageserver.http import HistoricLayerInfo, PageserverApiException -from fixtures.pageserver.utils import wait_tenant_status_404, wait_timeline_detail_404 -from fixtures.remote_storage import LocalFsStorage -from fixtures.utils import assert_pageserver_backups_equal +from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_timeline_detail_404 +from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind +from fixtures.utils import assert_pageserver_backups_equal, wait_until +from requests import ReadTimeout def by_end_lsn(info: HistoricLayerInfo) -> Lsn: @@ -92,7 +97,7 @@ def test_ancestor_detach_branched_from( client.timeline_checkpoint(env.initial_tenant, env.initial_timeline) ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(8192, 16383) g(i);") - wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + flush_ep_to_pageserver(env, ep, env.initial_tenant, 
env.initial_timeline) deltas = client.layer_map_info(env.initial_tenant, env.initial_timeline).delta_layers() # there is also the in-mem layer, but ignore it for now @@ -269,7 +274,7 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder): after = env.neon_cli.create_branch("after", "main", env.initial_tenant, ancestor_start_lsn=None) all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id) - assert all_reparented == {reparented, same_branchpoint} + assert set(all_reparented) == {reparented, same_branchpoint} env.pageserver.quiesce_tenants() @@ -406,7 +411,7 @@ def test_detached_receives_flushes_while_being_detached(neon_env_builder: NeonEn assert client.timeline_detail(env.initial_tenant, timeline_id)["ancestor_timeline_id"] is None - assert ep.safe_psql("SELECT clear_buffer_cache();") + ep.clear_shared_buffers() assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows assert ep.safe_psql("SELECT SUM(LENGTH(aux)) FROM foo")[0][0] != 0 ep.stop() @@ -447,6 +452,9 @@ def test_compaction_induced_by_detaches_in_history( } ) env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + env.pageserver.allowed_errors.append( + ".*await_initial_logical_size: can't get semaphore cancel token, skipping" + ) client = env.pageserver.http_client() def delta_layers(timeline_id: TimelineId): @@ -519,6 +527,7 @@ def test_compaction_induced_by_detaches_in_history( assert len([filter(lambda x: x.l0, delta_layers(branch_timeline_id))]) == 1 skip_main = branches[1:] + branch_lsn = client.timeline_detail(env.initial_tenant, branch_timeline_id)["ancestor_lsn"] # take the fullbackup before and after inheriting the new L0s @@ -527,6 +536,13 @@ def test_compaction_induced_by_detaches_in_history( env.pageserver, env.initial_tenant, branch_timeline_id, branch_lsn, fullbackup_before ) + # force initial logical sizes, so we can evict all layers from all + # timelines and exercise on-demand download for copy lsn prefix + client.timeline_detail( + 
env.initial_tenant, env.initial_timeline, force_await_initial_logical_size=True + ) + client.evict_all_layers(env.initial_tenant, env.initial_timeline) + for _, timeline_id in skip_main: reparented = client.detach_ancestor(env.initial_tenant, timeline_id) assert reparented == set(), "we have no earlier branches at any level" @@ -559,17 +575,88 @@ def test_compaction_induced_by_detaches_in_history( assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, set()) -def test_timeline_ancestor_errors(neon_env_builder: NeonEnvBuilder): - env = neon_env_builder.init_start() - env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) +@pytest.mark.parametrize("sharded", [True, False]) +def test_timeline_ancestor_detach_idempotent_success( + neon_env_builder: NeonEnvBuilder, sharded: bool +): + shards = 2 if sharded else 1 - client = env.pageserver.http_client() + neon_env_builder.num_pageservers = shards + env = neon_env_builder.init_start(initial_tenant_shard_count=shards if sharded else None) + + pageservers = dict((int(p.id), p) for p in env.pageservers) + + for ps in pageservers.values(): + ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + if sharded: + # FIXME: should this be in the neon_env_builder.init_start? 
+ env.storage_controller.reconcile_until_idle() + client = env.storage_controller.pageserver_api() + else: + client = env.pageserver.http_client() + + first_branch = env.neon_cli.create_branch("first_branch") + + _ = env.neon_cli.create_branch("second_branch", ancestor_branch_name="first_branch") + + # these two will be reparented, and they should be returned in stable order + # from pageservers OR otherwise there will be an `error!` logging from + # storage controller + reparented1 = env.neon_cli.create_branch("first_reparented", ancestor_branch_name="main") + reparented2 = env.neon_cli.create_branch("second_reparented", ancestor_branch_name="main") + + first_reparenting_response = client.detach_ancestor(env.initial_tenant, first_branch) + assert set(first_reparenting_response) == {reparented1, reparented2} + + # FIXME: this should be done by the http req handler + for ps in pageservers.values(): + ps.quiesce_tenants() + + for _ in range(5): + # once completed, we can retry this how many times + assert ( + client.detach_ancestor(env.initial_tenant, first_branch) == first_reparenting_response + ) + + client.tenant_delete(env.initial_tenant) + + with pytest.raises(PageserverApiException) as e: + client.detach_ancestor(env.initial_tenant, first_branch) + assert e.value.status_code == 404 + + +@pytest.mark.parametrize("sharded", [True, False]) +def test_timeline_ancestor_detach_errors(neon_env_builder: NeonEnvBuilder, sharded: bool): + # the test is split from test_timeline_ancestor_detach_idempotent_success as only these error cases should create "request was dropped before completing", + # given the current first error handling + shards = 2 if sharded else 1 + + neon_env_builder.num_pageservers = shards + env = neon_env_builder.init_start(initial_tenant_shard_count=shards if sharded else None) + + pageservers = dict((int(p.id), p) for p in env.pageservers) + + for ps in pageservers.values(): + ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + 
ps.allowed_errors.extend( + [ + ".* WARN .* path=/v1/tenant/.*/timeline/.*/detach_ancestor request_id=.*: request was dropped before completing", + # rare error logging, which is hard to reproduce without instrumenting responding with random sleep + '.* ERROR .* path=/v1/tenant/.*/timeline/.*/detach_ancestor request_id=.*: Cancelled request finished with an error: Conflict\\("no ancestors"\\)', + ] + ) + + client = ( + env.pageserver.http_client() if not sharded else env.storage_controller.pageserver_api() + ) with pytest.raises(PageserverApiException, match=".* no ancestors") as info: client.detach_ancestor(env.initial_tenant, env.initial_timeline) assert info.value.status_code == 409 - first_branch = env.neon_cli.create_branch("first_branch") + _ = env.neon_cli.create_branch("first_branch") + second_branch = env.neon_cli.create_branch("second_branch", ancestor_branch_name="first_branch") # funnily enough this does not have a prefix @@ -577,18 +664,830 @@ def test_timeline_ancestor_errors(neon_env_builder: NeonEnvBuilder): client.detach_ancestor(env.initial_tenant, second_branch) assert info.value.status_code == 400 - client.tenant_delete(env.initial_tenant) - wait_tenant_status_404(client, env.initial_tenant, 10, 1) - with pytest.raises(PageserverApiException) as e: - client.detach_ancestor(env.initial_tenant, first_branch) - assert e.value.status_code == 404 +def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): + """ + Sharded timeline detach ancestor; 4 nodes: 1 stuck, 1 restarted, 2 normal. + + Stuck node gets stuck on a pause failpoint for first storage controller request. + Restarted node remains stuck until explicit restart from test code. + + We retry the request until storage controller gets 200 OK from all nodes. 
+ """ + branch_name = "soon_detached" + shard_count = 4 + neon_env_builder.num_pageservers = shard_count + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) + + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + for ps in env.pageservers: + ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + # FIXME: should this be in the neon_env_builder.init_start? + env.storage_controller.reconcile_until_idle() + # as we will stop a node, make sure there is no clever rebalancing + env.storage_controller.tenant_policy_update(env.initial_tenant, body={"scheduling": "Stop"}) + env.storage_controller.allowed_errors.append(".*: Scheduling is disabled by policy Stop .*") + + shards = env.storage_controller.locate(env.initial_tenant) + + utilized_pageservers = {x["node_id"] for x in shards} + assert len(utilized_pageservers) > 1, "all shards got placed on single pageserver?" + + branch_timeline_id = env.neon_cli.create_branch(branch_name, tenant_id=env.initial_tenant) + + with env.endpoints.create_start(branch_name, tenant_id=env.initial_tenant) as ep: + ep.safe_psql( + "create table foo as select 1::bigint, i::bigint from generate_series(1, 10000) v(i)" + ) + lsn = flush_ep_to_pageserver(env, ep, env.initial_tenant, branch_timeline_id) + + pageservers = dict((int(p.id), p) for p in env.pageservers) + + for shard_info in shards: + node_id = int(shard_info["node_id"]) + shard_id = shard_info["shard_id"] + detail = pageservers[node_id].http_client().timeline_detail(shard_id, branch_timeline_id) + + assert Lsn(detail["last_record_lsn"]) >= lsn + assert Lsn(detail["initdb_lsn"]) < lsn + assert TimelineId(detail["ancestor_timeline_id"]) == env.initial_timeline + + # make one of the nodes get stuck, but continue the initial operation + # make another of the nodes get stuck, then restart + + stuck = pageservers[int(shards[0]["node_id"])] + log.info(f"stuck pageserver is id={stuck.id}") + stuck_http = stuck.http_client() + 
stuck_http.configure_failpoints( + ("timeline-detach-ancestor::before_starting_after_locking-pausable", "pause") + ) + + restarted = pageservers[int(shards[1]["node_id"])] + log.info(f"restarted pageserver is id={restarted.id}") + # this might be hit; see `restart_restarted` + restarted.allowed_errors.append(".*: Cancelled request finished with an error: ShuttingDown") + assert restarted.id != stuck.id + restarted_http = restarted.http_client() + restarted_http.configure_failpoints( + [ + ("timeline-detach-ancestor::before_starting_after_locking-pausable", "pause"), + ] + ) + + for info in shards: + pageserver = pageservers[int(info["node_id"])] + # the first request can cause these, but does not repeatedly + pageserver.allowed_errors.append(".*: request was dropped before completing") + + # first request again + env.storage_controller.allowed_errors.append(".*: request was dropped before completing") + + target = env.storage_controller.pageserver_api() + + with pytest.raises(ReadTimeout): + target.detach_ancestor(env.initial_tenant, branch_timeline_id, timeout=1) + + stuck_http.configure_failpoints( + ("timeline-detach-ancestor::before_starting_after_locking-pausable", "off") + ) + + barrier = threading.Barrier(2) + + def restart_restarted(): + barrier.wait() + # graceful shutdown should just work, because simultaneously unpaused + restarted.stop() + # this does not happen always, depends how fast we exit after unpausing + # restarted.assert_log_contains("Cancelled request finished with an error: ShuttingDown") + restarted.start() + + with ThreadPoolExecutor(max_workers=1) as pool: + fut = pool.submit(restart_restarted) + barrier.wait() + # we have 10s, lets use 1/2 of that to help the shutdown start + time.sleep(5) + restarted_http.configure_failpoints( + ("timeline-detach-ancestor::before_starting_after_locking-pausable", "off") + ) + fut.result() + + # detach ancestor request handling is not sensitive to http cancellation. 
+ # this means that the "stuck" is on its way to complete the detach, but the restarted is off + # now it can either be complete on all nodes, or still in progress with + # one. + without_retrying = target.without_status_retrying() + + # this retry loop will be long enough that the tenant can always activate + reparented = None + for _ in range(10): + try: + reparented = without_retrying.detach_ancestor(env.initial_tenant, branch_timeline_id) + except PageserverApiException as info: + assert info.status_code == 503 + time.sleep(2) + else: + break + + assert reparented == set(), "too many retries (None) or unexpected reparentings" + + for shard_info in shards: + node_id = int(shard_info["node_id"]) + shard_id = shard_info["shard_id"] + + # TODO: ensure quescing is done on pageserver? + pageservers[node_id].quiesce_tenants() + detail = pageservers[node_id].http_client().timeline_detail(shard_id, branch_timeline_id) + wait_for_last_record_lsn( + pageservers[node_id].http_client(), shard_id, branch_timeline_id, lsn + ) + assert detail.get("ancestor_timeline_id") is None + + with env.endpoints.create_start(branch_name, tenant_id=env.initial_tenant) as ep: + count = int(ep.safe_psql("select count(*) from foo")[0][0]) + assert count == 10000 + + +@pytest.mark.parametrize("mode", ["delete_timeline", "delete_tenant"]) +@pytest.mark.parametrize("sharded", [False, True]) +def test_timeline_detach_ancestor_interrupted_by_deletion( + neon_env_builder: NeonEnvBuilder, mode: str, sharded: bool +): + """ + Timeline ancestor detach interrupted by deleting either: + - the detached timeline + - the whole tenant + + after starting the detach. 
+ + What remains not tested by this: + - shutdown winning over complete, see test_timeline_is_deleted_before_timeline_detach_ancestor_completes + """ + + if sharded and mode == "delete_tenant": + # the shared/exclusive lock for tenant is blocking this: + # timeline detach ancestor takes shared, delete tenant takes exclusive + pytest.skip("tenant deletion while timeline ancestor detach is underway cannot happen") + + shard_count = 2 if sharded else 1 + + neon_env_builder.num_pageservers = shard_count + + env = neon_env_builder.init_start( + initial_tenant_shard_count=shard_count if sharded else None, + initial_tenant_conf={ + "gc_period": "1s", + "lsn_lease_length": "0s", + }, + ) + + for ps in env.pageservers: + ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + pageservers = dict((int(p.id), p) for p in env.pageservers) + + detached_timeline = env.neon_cli.create_branch("detached soon", "main") + + pausepoint = "timeline-detach-ancestor::before_starting_after_locking-pausable" + + env.storage_controller.reconcile_until_idle() + shards = env.storage_controller.locate(env.initial_tenant) + + assert len(set(info["node_id"] for info in shards)) == shard_count + + target = env.storage_controller.pageserver_api() if sharded else env.pageserver.http_client() + target = target.without_status_retrying() + + victim = pageservers[int(shards[-1]["node_id"])] + victim_http = victim.http_client() + victim_http.configure_failpoints((pausepoint, "pause")) + + def detach_ancestor(): + target.detach_ancestor(env.initial_tenant, detached_timeline) + + def at_failpoint() -> LogCursor: + msg, offset = victim.assert_log_contains(f"at failpoint {pausepoint}") + log.info(f"found {msg}") + msg, offset = victim.assert_log_contains( + ".* gc_loop.*: Skipping GC: .*", + offset, + ) + log.info(f"found {msg}") + return offset + + def start_delete(): + if mode == "delete_timeline": + target.timeline_delete(env.initial_tenant, detached_timeline) + elif mode == "delete_tenant": + 
target.tenant_delete(env.initial_tenant) + else: + raise RuntimeError(f"unimplemented mode {mode}") + + def at_waiting_on_gate_close(start_offset: LogCursor) -> LogCursor: + _, offset = victim.assert_log_contains( + "closing is taking longer than expected", offset=start_offset + ) + return offset + + def is_deleted(): + try: + if mode == "delete_timeline": + target.timeline_detail(env.initial_tenant, detached_timeline) + elif mode == "delete_tenant": + target.tenant_status(env.initial_tenant) + else: + return False + except PageserverApiException as e: + assert e.status_code == 404 + return True + else: + raise RuntimeError("waiting for 404") + + with ThreadPoolExecutor(max_workers=2) as pool: + try: + fut = pool.submit(detach_ancestor) + offset = wait_until(10, 1.0, at_failpoint) + + delete = pool.submit(start_delete) + + offset = wait_until(10, 1.0, lambda: at_waiting_on_gate_close(offset)) + + victim_http.configure_failpoints((pausepoint, "off")) + + delete.result() + + assert wait_until(10, 1.0, is_deleted), f"unimplemented mode {mode}" + + # TODO: match the error + with pytest.raises(PageserverApiException) as exc: + fut.result() + log.info(f"TODO: match this error: {exc.value}") + assert exc.value.status_code == 503 + finally: + victim_http.configure_failpoints((pausepoint, "off")) + + if mode != "delete_timeline": + return + + # make sure the gc is unblocked + time.sleep(2) + victim.assert_log_contains(".* gc_loop.*: 1 timelines need GC", offset) + + if not sharded: + # we have the other node only while sharded + return + + other = pageservers[int(shards[0]["node_id"])] + log.info(f"other is {other.id}") + _, offset = other.assert_log_contains( + ".*INFO request\\{method=PUT path=/v1/tenant/\\S+/timeline/\\S+/detach_ancestor .*\\}: Request handled, status: 200 OK", + ) + # this might be a lot earlier than the victims line, but that is okay. 
+ _, offset = other.assert_log_contains(".* gc_loop.*: 1 timelines need GC", offset) + + +@pytest.mark.parametrize("mode", ["delete_reparentable_timeline", "create_reparentable_timeline"]) +def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnvBuilder, mode: str): + """ + Technically possible storage controller concurrent interleaving timeline + deletion with timeline detach. + + Deletion is fine, as any sharded pageservers reach the same end state, but + creating reparentable timeline would create an issue as the two nodes would + never agree. There is a solution though: the created reparentable timeline + must be detached. + """ + + shard_count = 2 + neon_env_builder.num_pageservers = shard_count + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + + for ps in env.pageservers: + ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + pageservers = dict((int(p.id), p) for p in env.pageservers) + + env.storage_controller.reconcile_until_idle() + shards = env.storage_controller.locate(env.initial_tenant) + assert len(set(x["node_id"] for x in shards)) == shard_count + + with env.endpoints.create_start("main") as ep: + ep.safe_psql("create table foo as select i::bigint from generate_series(1, 1000) t(i)") + + # as the interleaved operation, we will delete this timeline, which was reparenting candidate + first_branch_lsn = wait_for_last_flush_lsn( + env, ep, env.initial_tenant, env.initial_timeline + ) + for ps, shard_id in [(pageservers[int(x["node_id"])], x["shard_id"]) for x in shards]: + ps.http_client().timeline_checkpoint(shard_id, env.initial_timeline) + + ep.safe_psql("create table bar as select i::bigint from generate_series(1, 2000) t(i)") + detached_branch_lsn = flush_ep_to_pageserver( + env, ep, env.initial_tenant, env.initial_timeline + ) + + for ps, shard_id in [(pageservers[int(x["node_id"])], x["shard_id"]) for x in shards]: + ps.http_client().timeline_checkpoint(shard_id, env.initial_timeline) + + def 
create_reparentable_timeline() -> TimelineId: + return env.neon_cli.create_branch( + "first_branch", ancestor_branch_name="main", ancestor_start_lsn=first_branch_lsn + ) + + if mode == "delete_reparentable_timeline": + first_branch = create_reparentable_timeline() + else: + first_branch = None + + detached_branch = env.neon_cli.create_branch( + "detached_branch", ancestor_branch_name="main", ancestor_start_lsn=detached_branch_lsn + ) + + pausepoint = "timeline-detach-ancestor::before_starting_after_locking-pausable" + + stuck = pageservers[int(shards[0]["node_id"])] + stuck_http = stuck.http_client().without_status_retrying() + stuck_http.configure_failpoints((pausepoint, "pause")) + + victim = pageservers[int(shards[-1]["node_id"])] + victim_http = victim.http_client().without_status_retrying() + victim_http.configure_failpoints( + (pausepoint, "pause"), + ) + + # interleaving a create_timeline which could be reparented will produce two + # permanently different reparentings: one node has reparented, other has + # not + # + # with deletion there is no such problem + def detach_timeline(): + env.storage_controller.pageserver_api().detach_ancestor(env.initial_tenant, detached_branch) + + def paused_at_failpoint(): + stuck.assert_log_contains(f"at failpoint {pausepoint}") + victim.assert_log_contains(f"at failpoint {pausepoint}") + + def first_completed(): + detail = stuck_http.timeline_detail(shards[0]["shard_id"], detached_branch) + log.info(detail) + assert detail.get("ancestor_lsn") is None + + def first_branch_gone(): + assert first_branch is not None + try: + env.storage_controller.pageserver_api().timeline_detail( + env.initial_tenant, first_branch + ) + except PageserverApiException as e: + log.info(f"error {e}") + assert e.status_code == 404 + else: + log.info("still ok") + raise RuntimeError("not done yet") + + with ThreadPoolExecutor(max_workers=1) as pool: + try: + fut = pool.submit(detach_timeline) + wait_until(10, 1.0, paused_at_failpoint) + + # let 
stuck complete + stuck_http.configure_failpoints((pausepoint, "off")) + wait_until(10, 1.0, first_completed) + + if mode == "delete_reparentable_timeline": + assert first_branch is not None + env.storage_controller.pageserver_api().timeline_delete( + env.initial_tenant, first_branch + ) + victim_http.configure_failpoints((pausepoint, "off")) + wait_until(10, 1.0, first_branch_gone) + elif mode == "create_reparentable_timeline": + first_branch = create_reparentable_timeline() + victim_http.configure_failpoints((pausepoint, "off")) + else: + raise RuntimeError("{mode}") + + # it now passes, and we should get an error messages about mixed reparenting as the stuck still had something to reparent + mixed_results = "pageservers returned mixed results for ancestor detach; manual intervention is required." + with pytest.raises(PageserverApiException, match=mixed_results): + fut.result() + + msg, offset = env.storage_controller.assert_log_contains( + ".*/timeline/\\S+/detach_ancestor.*: shards returned different results matching=0 .*" + ) + log.info(f"expected error message: {msg.rstrip()}") + env.storage_controller.allowed_errors.extend( + [ + ".*: shards returned different results matching=0 .*", + f".*: InternalServerError\\({mixed_results}", + ] + ) + + if mode == "create_reparentable_timeline": + with pytest.raises(PageserverApiException, match=mixed_results): + detach_timeline() + else: + # it is a bit shame to flag it and then it suceeds, but most + # likely there would be a retry loop which would take care of + # this in cplane + detach_timeline() + + retried = env.storage_controller.log_contains( + ".*/timeline/\\S+/detach_ancestor.*: shards returned different results matching=0 .*", + offset, + ) + if mode == "delete_reparentable_timeline": + assert ( + retried is None + ), "detaching should had converged after both nodes saw the deletion" + elif mode == "create_reparentable_timeline": + assert retried is not None, "detaching should not have converged" + _, offset 
= retried + finally: + stuck_http.configure_failpoints((pausepoint, "off")) + victim_http.configure_failpoints((pausepoint, "off")) + + if mode == "create_reparentable_timeline": + assert first_branch is not None + # now we have mixed ancestry + assert ( + TimelineId( + stuck_http.timeline_detail(shards[0]["shard_id"], first_branch)[ + "ancestor_timeline_id" + ] + ) + == env.initial_timeline + ) + assert ( + TimelineId( + victim_http.timeline_detail(shards[-1]["shard_id"], first_branch)[ + "ancestor_timeline_id" + ] + ) + == detached_branch + ) + + # make sure we are still able to repair this by detaching the ancestor on the storage controller in case it ever happens + # if the ancestor would be deleted, we would partially fail, making deletion stuck. + env.storage_controller.pageserver_api().detach_ancestor(env.initial_tenant, first_branch) + + # and we should now have good results + not_found = env.storage_controller.log_contains( + ".*/timeline/\\S+/detach_ancestor.*: shards returned different results matching=0 .*", + offset, + ) + + assert not_found is None + assert ( + stuck_http.timeline_detail(shards[0]["shard_id"], first_branch)["ancestor_timeline_id"] + is None + ) + assert ( + victim_http.timeline_detail(shards[-1]["shard_id"], first_branch)[ + "ancestor_timeline_id" + ] + is None + ) + + +def test_retryable_500_hit_through_storcon_during_timeline_detach_ancestor( + neon_env_builder: NeonEnvBuilder, +): + shard_count = 2 + neon_env_builder.num_pageservers = shard_count + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + + for ps in env.pageservers: + ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + pageservers = dict((int(p.id), p) for p in env.pageservers) + + env.storage_controller.reconcile_until_idle() + shards = env.storage_controller.locate(env.initial_tenant) + assert len(set(x["node_id"] for x in shards)) == shard_count + + detached_branch = env.neon_cli.create_branch("detached_branch", ancestor_branch_name="main") 
+ + pausepoint = "timeline-detach-ancestor::before_starting_after_locking-pausable" + failpoint = "timeline-detach-ancestor::before_starting_after_locking" + + stuck = pageservers[int(shards[0]["node_id"])] + stuck_http = stuck.http_client().without_status_retrying() + stuck_http.configure_failpoints( + (pausepoint, "pause"), + ) + + env.storage_controller.allowed_errors.append( + f".*Error processing HTTP request: .* failpoint: {failpoint}" + ) + http = env.storage_controller.pageserver_api() + + victim = pageservers[int(shards[-1]["node_id"])] + victim.allowed_errors.append( + f".*Error processing HTTP request: InternalServerError\\(failpoint: {failpoint}" + ) + victim_http = victim.http_client().without_status_retrying() + victim_http.configure_failpoints([(pausepoint, "pause"), (failpoint, "return")]) + + def detach_timeline(): + http.detach_ancestor(env.initial_tenant, detached_branch) + + def paused_at_failpoint(): + stuck.assert_log_contains(f"at failpoint {pausepoint}") + victim.assert_log_contains(f"at failpoint {pausepoint}") + + def first_completed(): + detail = stuck_http.timeline_detail(shards[0]["shard_id"], detached_branch) + log.info(detail) + assert detail.get("ancestor_lsn") is None + + with ThreadPoolExecutor(max_workers=1) as pool: + try: + fut = pool.submit(detach_timeline) + wait_until(10, 1.0, paused_at_failpoint) + + # let stuck complete + stuck_http.configure_failpoints((pausepoint, "off")) + wait_until(10, 1.0, first_completed) + + victim_http.configure_failpoints((pausepoint, "off")) + + with pytest.raises( + PageserverApiException, + match=f".*failpoint: {failpoint}", + ) as exc: + fut.result() + assert exc.value.status_code == 500 + + finally: + stuck_http.configure_failpoints((pausepoint, "off")) + victim_http.configure_failpoints((pausepoint, "off")) + + victim_http.configure_failpoints((failpoint, "off")) + detach_timeline() + + +def test_retried_detach_ancestor_after_failed_reparenting(neon_env_builder: NeonEnvBuilder): + """ + 
Using a failpoint, force the completion step of timeline ancestor detach to + fail after reparenting a single timeline. + + Retrying should try reparenting until all reparentings are done, all the + time blocking gc even across restarts (first round). + + A completion failpoint is used to inhibit completion on second to last + round. + + On last round, the completion uses a path where no reparentings can happen + because original ancestor is deleted, and there is a completion to unblock + gc without restart. + """ + + # to get the remote storage metrics + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) + env = neon_env_builder.init_start( + initial_tenant_conf={ + "gc_period": "1s", + "lsn_lease_length": "0s", + } + ) + + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + env.pageserver.allowed_errors.extend( + [ + ".* reparenting failed: failpoint: timeline-detach-ancestor::allow_one_reparented", + ".* Error processing HTTP request: InternalServerError\\(failed to reparent all candidate timelines, please retry", + ".* Error processing HTTP request: InternalServerError\\(failpoint: timeline-detach-ancestor::complete_before_uploading", + ] + ) + + http = env.pageserver.http_client() + + def remote_storage_copy_requests(): + return http.get_metric_value( + "remote_storage_s3_request_seconds_count", + {"request_type": "copy_object", "result": "ok"}, + ) + + def reparenting_progress(timelines: List[TimelineId]) -> Tuple[int, Set[TimelineId]]: + reparented = 0 + not_reparented = set() + for timeline in timelines: + detail = http.timeline_detail(env.initial_tenant, timeline) + ancestor = TimelineId(detail["ancestor_timeline_id"]) + if ancestor == detached: + reparented += 1 + else: + not_reparented.add(timeline) + return (reparented, not_reparented) + + # main ------A-----B-----C-----D-----E> lsn + timelines = [] + with env.endpoints.create_start("main") as ep: + for counter in range(5): + ep.safe_psql( + f"create table 
foo_{counter} as select i::bigint from generate_series(1, 10000) t(i)" + ) + branch_lsn = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + http.timeline_checkpoint(env.initial_tenant, env.initial_timeline) + branch = env.neon_cli.create_branch( + f"branch_{counter}", "main", ancestor_start_lsn=branch_lsn + ) + timelines.append(branch) + + flush_ep_to_pageserver(env, ep, env.initial_tenant, env.initial_timeline) + + # detach "E" which has most reparentable timelines under it + detached = timelines.pop() + assert len(timelines) == 4 + + http = http.without_status_retrying() + + http.configure_failpoints(("timeline-detach-ancestor::allow_one_reparented", "return")) + + not_reparented: Set[TimelineId] = set() + # tracked offset in the pageserver log which is at least at the most recent activation + offset = None + + def try_detach(): + with pytest.raises( + PageserverApiException, + match=".*failed to reparent all candidate timelines, please retry", + ) as exc: + http.detach_ancestor(env.initial_tenant, detached) + assert exc.value.status_code == 503 + + # first round -- do more checking to make sure the gc gets paused + try_detach() + + assert ( + http.timeline_detail(env.initial_tenant, detached)["ancestor_timeline_id"] is None + ), "first round should had detached 'detached'" + + reparented, not_reparented = reparenting_progress(timelines) + assert reparented == 1 + + time.sleep(2) + _, offset = env.pageserver.assert_log_contains( + ".*INFO request\\{method=PUT path=/v1/tenant/[0-9a-f]{32}/timeline/[0-9a-f]{32}/detach_ancestor .*\\}: Handling request", + offset, + ) + _, offset = env.pageserver.assert_log_contains(".*: attach finished, activating", offset) + _, offset = env.pageserver.assert_log_contains( + ".* gc_loop.*: Skipping GC: .*", + offset, + ) + metric = remote_storage_copy_requests() + assert metric != 0 + # make sure the gc blocking is persistent over a restart + env.pageserver.restart() + env.pageserver.quiesce_tenants() + 
time.sleep(2) + _, offset = env.pageserver.assert_log_contains(".*: attach finished, activating", offset) + assert env.pageserver.log_contains(".* gc_loop.*: [0-9] timelines need GC", offset) is None + _, offset = env.pageserver.assert_log_contains( + ".* gc_loop.*: Skipping GC: .*", + offset, + ) + # restore failpoint for the next reparented + http.configure_failpoints(("timeline-detach-ancestor::allow_one_reparented", "return")) + + reparented_before = reparented + + # do two more rounds + for _ in range(2): + try_detach() + + assert ( + http.timeline_detail(env.initial_tenant, detached)["ancestor_timeline_id"] is None + ), "first round should had detached 'detached'" + + reparented, not_reparented = reparenting_progress(timelines) + assert reparented == reparented_before + 1 + reparented_before = reparented + + _, offset = env.pageserver.assert_log_contains(".*: attach finished, activating", offset) + metric = remote_storage_copy_requests() + assert metric == 0, "copies happen in the first round" + + assert offset is not None + assert len(not_reparented) == 1 + + http.configure_failpoints(("timeline-detach-ancestor::complete_before_uploading", "return")) + + # almost final round, the failpoint is hit no longer as there is only one reparented and one always gets to succeed. + # the tenant is restarted once more, but we fail during completing. + with pytest.raises( + PageserverApiException, match=".* timeline-detach-ancestor::complete_before_uploading" + ) as exc: + http.detach_ancestor(env.initial_tenant, detached) + assert exc.value.status_code == 500 + _, offset = env.pageserver.assert_log_contains(".*: attach finished, activating", offset) + + # delete the previous ancestor to take a different path to completion. all + # other tests take the "detach? reparent complete", but this only hits + # "complete". 
+ http.timeline_delete(env.initial_tenant, env.initial_timeline) + wait_timeline_detail_404(http, env.initial_tenant, env.initial_timeline, 20) + + http.configure_failpoints(("timeline-detach-ancestor::complete_before_uploading", "off")) + + reparented_resp = http.detach_ancestor(env.initial_tenant, detached) + assert reparented_resp == set(timelines) + # no need to quiesce_tenants anymore, because completion does that + + reparented, not_reparented = reparenting_progress(timelines) + assert reparented == len(timelines) + + time.sleep(2) + assert ( + env.pageserver.log_contains(".*: attach finished, activating", offset) is None + ), "there should be no restart with the final detach_ancestor as it only completed" + + # gc is unblocked + env.pageserver.assert_log_contains(".* gc_loop.*: 5 timelines need GC", offset) + + metric = remote_storage_copy_requests() + assert metric == 0 + + +def test_timeline_is_deleted_before_timeline_detach_ancestor_completes( + neon_env_builder: NeonEnvBuilder, +): + """ + Make sure that a timeline deleted after restart will unpause gc blocking. 
+ """ + env = neon_env_builder.init_start( + initial_tenant_conf={ + "gc_period": "1s", + "lsn_lease_length": "0s", + } + ) + + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + http = env.pageserver.http_client() + + detached = env.neon_cli.create_branch("detached") + + failpoint = "timeline-detach-ancestor::after_activating_before_finding-pausable" + + http.configure_failpoints((failpoint, "pause")) + + def detach_and_get_stuck(): + return http.detach_ancestor(env.initial_tenant, detached) + + def request_processing_noted_in_log(): + _, offset = env.pageserver.assert_log_contains( + ".*INFO request\\{method=PUT path=/v1/tenant/[0-9a-f]{32}/timeline/[0-9a-f]{32}/detach_ancestor .*\\}: Handling request", + ) + return offset + + def delete_detached(): + return http.timeline_delete(env.initial_tenant, detached) + + try: + with ThreadPoolExecutor(max_workers=1) as pool: + detach = pool.submit(detach_and_get_stuck) + + offset = wait_until(10, 1.0, request_processing_noted_in_log) + + # make this named fn tor more clear failure test output logging + def pausepoint_hit_with_gc_paused() -> LogCursor: + env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + _, at = env.pageserver.assert_log_contains( + ".* gc_loop.*: Skipping GC: .*", + offset, + ) + return at + + offset = wait_until(10, 1.0, pausepoint_hit_with_gc_paused) + + delete_detached() + + wait_timeline_detail_404(http, env.initial_tenant, detached, 10, 1.0) + + http.configure_failpoints((failpoint, "off")) + + with pytest.raises( + PageserverApiException, match="NotFound: Timeline .* was not found" + ) as exc: + detach.result() + assert exc.value.status_code == 404 + finally: + http.configure_failpoints((failpoint, "off")) + + # make sure gc has been unblocked + time.sleep(2) + + env.pageserver.assert_log_contains(".* gc_loop.*: 1 timelines need GC", offset) # TODO: -# - after starting the operation, tenant is deleted -# - after starting the operation, pageserver is shutdown, restarted 
-# - after starting the operation, bottom-most timeline is deleted, pageserver is restarted, gc is inhibited -# - deletion of reparented while reparenting should fail once, then succeed (?) # - branch near existing L1 boundary, image layers? # - investigate: why are layers started at uneven lsn? not just after branching, but in general. +# +# TEST: 1. tad which partially succeeds, one returns 500 +# 2. create branch below timeline? ~or delete reparented timeline~ (done) +# 3. on retry all should report the same reparented timelines diff --git a/test_runner/regress/test_timeline_gc_blocking.py b/test_runner/regress/test_timeline_gc_blocking.py new file mode 100644 index 0000000000..24de894687 --- /dev/null +++ b/test_runner/regress/test_timeline_gc_blocking.py @@ -0,0 +1,67 @@ +import time + +from fixtures.neon_fixtures import ( + NeonEnvBuilder, +) +from fixtures.pageserver.utils import wait_timeline_detail_404 + + +def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start( + initial_tenant_conf={"gc_period": "1s", "lsn_lease_length": "0s"} + ) + ps = env.pageserver + http = ps.http_client() + + foo_branch = env.neon_cli.create_branch("foo", "main", env.initial_tenant) + + gc_active_line = ".* gc_loop.*: [12] timelines need GC" + gc_skipped_line = ".* gc_loop.*: Skipping GC: .*" + init_gc_skipped = ".*: initialized with gc blocked.*" + + tenant_before = http.tenant_status(env.initial_tenant) + + wait_for_another_gc_round() + _, offset = ps.assert_log_contains(gc_active_line) + + assert ps.log_contains(gc_skipped_line, offset) is None + + http.timeline_block_gc(env.initial_tenant, foo_branch) + + tenant_after = http.tenant_status(env.initial_tenant) + assert tenant_before != tenant_after + gc_blocking = tenant_after["gc_blocking"] + assert gc_blocking == "BlockingReasons { timelines: 1, reasons: EnumSet(Manual) }" + + wait_for_another_gc_round() + _, offset = ps.assert_log_contains(gc_skipped_line, offset) + + 
ps.restart() + ps.quiesce_tenants() + + _, offset = env.pageserver.assert_log_contains(init_gc_skipped, offset) + + wait_for_another_gc_round() + _, offset = ps.assert_log_contains(gc_skipped_line, offset) + + # deletion unblocks gc + http.timeline_delete(env.initial_tenant, foo_branch) + wait_timeline_detail_404(http, env.initial_tenant, foo_branch, 10, 1.0) + + wait_for_another_gc_round() + _, offset = ps.assert_log_contains(gc_active_line, offset) + + http.timeline_block_gc(env.initial_tenant, env.initial_timeline) + + wait_for_another_gc_round() + _, offset = ps.assert_log_contains(gc_skipped_line, offset) + + # removing the manual block also unblocks gc + http.timeline_unblock_gc(env.initial_tenant, env.initial_timeline) + + wait_for_another_gc_round() + _, offset = ps.assert_log_contains(gc_active_line, offset) + + +def wait_for_another_gc_round(): + time.sleep(2) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index db5297870e..642b9e449b 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -26,7 +26,6 @@ from fixtures.pageserver.utils import ( assert_tenant_state, timeline_delete_wait_completed, wait_for_upload_queue_empty, - wait_tenant_status_404, wait_until_tenant_active, ) from fixtures.pg_version import PgVersion @@ -153,10 +152,12 @@ def test_timeline_size_quota_on_startup(neon_env_builder: NeonEnvBuilder): client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id) + size_limit_mb = 30 + endpoint_main = env.endpoints.create( "test_timeline_size_quota_on_startup", # Set small limit for the test - config_lines=["neon.max_cluster_size=30MB"], + config_lines=[f"neon.max_cluster_size={size_limit_mb}MB"], ) endpoint_main.start() @@ -166,17 +167,39 @@ def test_timeline_size_quota_on_startup(neon_env_builder: NeonEnvBuilder): # Insert many rows. 
This query must fail because of space limit try: - for _i in range(5000): - cur.execute( - """ - INSERT INTO foo - SELECT 'long string to consume some space' || g - FROM generate_series(1, 100) g - """ - ) - # If we get here, the timeline size limit failed - log.error("Query unexpectedly succeeded") + def write_rows(count): + for _i in range(count): + cur.execute( + """ + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100) g + """ + ) + + # Write some data that exceeds limit, then let the pageserver ingest it to guarantee that some feedback has made it to + # the safekeeper, then try to write some more. We expect either the initial writes or the ones after + # the wait_for_last_flush_lsn to generate an exception. + # + # Without the wait_for_last_flush_lsn, the size limit sometimes isn't enforced (see https://github.com/neondatabase/neon/issues/6562) + write_rows(2500) + wait_for_last_flush_lsn(env, endpoint_main, env.initial_tenant, new_timeline_id) + logical_size = env.pageserver.http_client().timeline_detail( + env.initial_tenant, new_timeline_id + )["current_logical_size"] + assert logical_size > size_limit_mb * 1024 * 1024 + write_rows(2500) + + # If we get here, the timeline size limit failed. Find out from the pageserver how large it + # thinks the timeline is. 
+ wait_for_last_flush_lsn(env, endpoint_main, env.initial_tenant, new_timeline_id) + logical_size = env.pageserver.http_client().timeline_detail( + env.initial_tenant, new_timeline_id + )["current_logical_size"] + log.error( + f"Query unexpectedly succeeded, pageserver logical size is {logical_size}" + ) raise AssertionError() except psycopg2.errors.DiskFull as err: @@ -841,7 +864,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): # Detaching a stuck tenant should proceed promptly # (reproducer for https://github.com/neondatabase/neon/pull/6430) - env.pageserver.http_client().tenant_detach(detach_tenant_id, timeout_secs=10) + env.pageserver.http_client().tenant_detach(detach_tenant_id) tenant_ids.remove(detach_tenant_id) # FIXME: currently the mechanism for cancelling attach is to set state to broken, which is reported spuriously at error level env.pageserver.allowed_errors.append( @@ -864,39 +887,33 @@ def delete_lazy_activating( ): pageserver_http = pageserver.http_client() - # Deletion itself won't complete due to our failpoint: Tenant::shutdown can't complete while calculating - # logical size is paused in a failpoint. 
So instead we will use a log observation to check that - # on-demand activation was triggered by the tenant deletion - log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000 gen=[0-9a-f]+}}: Activating tenant \\(on-demand\\).*" - if expect_attaching: assert pageserver_http.tenant_status(delete_tenant_id)["state"]["slug"] == "Attaching" with concurrent.futures.ThreadPoolExecutor() as executor: log.info("Starting background delete") - def activated_on_demand(): - assert pageserver.log_contains(log_match) is not None + def shutting_down(): + assert pageserver.log_contains(".*Waiting for timelines.*") is not None def delete_tenant(): pageserver_http.tenant_delete(delete_tenant_id) background_delete = executor.submit(delete_tenant) - log.info(f"Waiting for activation message '{log_match}'") + # We expect deletion to enter shutdown of the tenant even though it's in the attaching state try: - wait_until(10, 1, activated_on_demand) + # Deletion will get to the point in shutdown where it's waiting for timeline shutdown, then + # hang because of our failpoint blocking activation. 
+ wait_until(10, 1, shutting_down) finally: log.info("Clearing failpoint") pageserver_http.configure_failpoints(("timeline-calculate-logical-size-pause", "off")) - # Deletion should complete successfully now that failpoint is unblocked + # Deletion should complete successfully now that failpoint is unblocked and shutdown can complete log.info("Joining background delete") background_delete.result(timeout=10) - # Poll for deletion to complete - wait_tenant_status_404(pageserver_http, tenant_id=delete_tenant_id, iterations=40) - def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder): """ @@ -919,6 +936,9 @@ def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder): tenant_id = env.initial_tenant timeline_id = env.initial_timeline + # just make sure this doesn't hit an assertion + client.timeline_detail(tenant_id, timeline_id, force_await_initial_logical_size=True) + # load in some data endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) endpoint.safe_psql_many( @@ -1117,3 +1137,10 @@ def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_met delete_lazy_activating(lazy_tenant, env.pageserver, expect_attaching=True) else: raise RuntimeError(activation_method) + + client.configure_failpoints( + [ + ("timeline-calculate-logical-size-pause", "off"), + ("walreceiver-after-ingest", "off"), + ] + ) diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index 225b952e73..7272979c4a 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -62,7 +62,7 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv): # Clear the buffer cache, to force the VM page to be re-fetched from # the page server - cur.execute("SELECT clear_buffer_cache()") + endpoint.clear_shared_buffers(cursor=cur) # Check that an index-only scan doesn't see the deleted row. 
If the # clearing of the VM bit was not replayed correctly, this would incorrectly diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 7bf208db54..19df834b81 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1,4 +1,5 @@ import filecmp +import logging import os import random import shutil @@ -48,7 +49,13 @@ from fixtures.remote_storage import ( ) from fixtures.safekeeper.http import SafekeeperHttpClient from fixtures.safekeeper.utils import are_walreceivers_absent -from fixtures.utils import PropagatingThread, get_dir_size, query_scalar, start_in_background +from fixtures.utils import ( + PropagatingThread, + get_dir_size, + query_scalar, + start_in_background, + wait_until, +) def wait_lsn_force_checkpoint( @@ -62,6 +69,18 @@ def wait_lsn_force_checkpoint( lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) log.info(f"pg_current_wal_flush_lsn is {lsn}, waiting for it on pageserver") + wait_lsn_force_checkpoint_at(lsn, tenant_id, timeline_id, ps, pageserver_conn_options) + + +def wait_lsn_force_checkpoint_at( + lsn: Lsn, + tenant_id: TenantId, + timeline_id: TimelineId, + ps: NeonPageserver, + pageserver_conn_options=None, +): + pageserver_conn_options = pageserver_conn_options or {} + auth_token = None if "password" in pageserver_conn_options: auth_token = pageserver_conn_options["password"] @@ -146,8 +165,8 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): last_record_lsn=Lsn(timeline_detail["last_record_lsn"]), ) for sk_m in sk_metrics: - m.flush_lsns.append(Lsn(sk_m.flush_lsn_inexact[(tenant_id, timeline_id)])) - m.commit_lsns.append(Lsn(sk_m.commit_lsn_inexact[(tenant_id, timeline_id)])) + m.flush_lsns.append(Lsn(int(sk_m.flush_lsn_inexact(tenant_id, timeline_id)))) + m.commit_lsns.append(Lsn(int(sk_m.commit_lsn_inexact(tenant_id, timeline_id)))) for flush_lsn, commit_lsn in zip(m.flush_lsns, m.commit_lsns): # Invariant. 
May be < when transaction is in progress. @@ -235,6 +254,10 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): assert max(init_m[2].flush_lsns) <= min(final_m[2].flush_lsns) < middle_lsn assert max(init_m[2].commit_lsns) <= min(final_m[2].commit_lsns) < middle_lsn + # Test timeline_list endpoint. + http_cli = env.safekeepers[0].http_client() + assert len(http_cli.timeline_list()) == 3 + # Check that dead minority doesn't prevent the commits: execute insert n_inserts # times, with fault_probability chance of getting a wal acceptor down or up @@ -1277,6 +1300,8 @@ def test_lagging_sk(neon_env_builder: NeonEnvBuilder): # Check that WALs are the same. cmp_sk_wal([sk1, sk2, sk3], tenant_id, timeline_id) + env.stop(immediate=True) + # Smaller version of test_one_sk_down testing peer recovery in isolation: that # it works without compute at all. @@ -1724,7 +1749,10 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): # Basic pull_timeline test. -def test_pull_timeline(neon_env_builder: NeonEnvBuilder): +# When live_sk_change is False, compute is restarted to change set of +# safekeepers; otherwise it is live reload. 
+@pytest.mark.parametrize("live_sk_change", [False, True]) +def test_pull_timeline(neon_env_builder: NeonEnvBuilder, live_sk_change: bool): neon_env_builder.auth_enabled = True def execute_payload(endpoint: Endpoint): @@ -1757,8 +1785,7 @@ def test_pull_timeline(neon_env_builder: NeonEnvBuilder): log.info("Use only first 3 safekeepers") env.safekeepers[3].stop() endpoint = env.endpoints.create("main") - endpoint.active_safekeepers = [1, 2, 3] - endpoint.start() + endpoint.start(safekeepers=[1, 2, 3]) execute_payload(endpoint) show_statuses(env.safekeepers, tenant_id, timeline_id) @@ -1770,29 +1797,22 @@ def test_pull_timeline(neon_env_builder: NeonEnvBuilder): log.info("Initialize new safekeeper 4, pull data from 1 & 3") env.safekeepers[3].start() - res = ( - env.safekeepers[3] - .http_client(auth_token=env.auth_keys.generate_safekeeper_token()) - .pull_timeline( - { - "tenant_id": str(tenant_id), - "timeline_id": str(timeline_id), - "http_hosts": [ - f"http://localhost:{env.safekeepers[0].port.http}", - f"http://localhost:{env.safekeepers[2].port.http}", - ], - } - ) + res = env.safekeepers[3].pull_timeline( + [env.safekeepers[0], env.safekeepers[2]], tenant_id, timeline_id ) log.info("Finished pulling timeline") log.info(res) show_statuses(env.safekeepers, tenant_id, timeline_id) - log.info("Restarting compute with new config to verify that it works") - endpoint.stop_and_destroy().create("main") - endpoint.active_safekeepers = [1, 3, 4] - endpoint.start() + action = "reconfiguing" if live_sk_change else "restarting" + log.info(f"{action} compute with new config to verify that it works") + new_sks = [1, 3, 4] + if not live_sk_change: + endpoint.stop_and_destroy().create("main") + endpoint.start(safekeepers=new_sks) + else: + endpoint.reconfigure(safekeepers=new_sks) execute_payload(endpoint) show_statuses(env.safekeepers, tenant_id, timeline_id) @@ -2069,6 +2089,11 @@ def test_timeline_copy(neon_env_builder: NeonEnvBuilder, insert_rows: int): log.info(f"Original 
digest: {orig_digest}") for sk in env.safekeepers: + wait( + partial(is_flush_lsn_caught_up, sk, tenant_id, timeline_id, lsn), + f"sk_id={sk.id} to flush {lsn}", + ) + sk.http_client().copy_timeline( tenant_id, timeline_id, @@ -2158,7 +2183,7 @@ def test_broker_discovery(neon_env_builder: NeonEnvBuilder): # generate some data to commit WAL on safekeepers endpoint.safe_psql("insert into t select generate_series(1,100), 'action'") # clear the buffers - endpoint.safe_psql("select clear_buffer_cache()") + endpoint.clear_shared_buffers() # read data to fetch pages from pageserver endpoint.safe_psql("select sum(i) from t") @@ -2178,3 +2203,263 @@ def test_broker_discovery(neon_env_builder: NeonEnvBuilder): do_something() do_something() + + +# Test creates 5 endpoints and tries to wake them up randomly. All timeouts are +# configured to be very short, so that we expect that: +# - pageserver will update remote_consistent_lsn very often +# - safekeepers will upload partial WAL segments very often +# - safekeeper will try to evict and unevict timelines +# +# Test checks that there are no critical errors while doing this. Also it checks +# that every safekeeper has at least one successful eviction. +@pytest.mark.parametrize("delete_offloaded_wal", [False, True]) +@pytest.mark.parametrize("restart_chance", [0.0, 0.2]) +def test_s3_eviction( + neon_env_builder: NeonEnvBuilder, delete_offloaded_wal: bool, restart_chance: float +): + neon_env_builder.num_safekeepers = 3 + neon_env_builder.enable_safekeeper_remote_storage(RemoteStorageKind.LOCAL_FS) + + neon_env_builder.safekeeper_extra_opts = [ + "--enable-offload", + "--partial-backup-timeout", + "50ms", + "--control-file-save-interval", + "1s", + # Safekeepers usually wait a while before evicting something: for this test we want them to + # evict things as soon as they are inactive. 
+ "--eviction-min-resident=100ms", + ] + if delete_offloaded_wal: + neon_env_builder.safekeeper_extra_opts.append("--delete-offloaded-wal") + + env = neon_env_builder.init_start( + initial_tenant_conf={ + "checkpoint_timeout": "100ms", + } + ) + + n_timelines = 5 + + branch_names = [f"branch{tlin}" for tlin in range(n_timelines)] + timelines = [] + ps_client = env.pageservers[0].http_client() + + # start postgres on each timeline + endpoints: list[Endpoint] = [] + for branch_name in branch_names: + timeline_id = env.neon_cli.create_branch(branch_name) + timelines.append(timeline_id) + + endpoints.append(env.endpoints.create_start(branch_name)) + endpoints[-1].safe_psql("CREATE TABLE t(i int)") + endpoints[-1].safe_psql("INSERT INTO t VALUES (0)") + + lsn = endpoints[-1].safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0] + log.info(f"{branch_name}: LSN={lsn}") + + endpoints[-1].stop() + + # update remote_consistent_lsn on pageserver + ps_client.timeline_checkpoint(env.initial_tenant, timelines[-1], wait_until_uploaded=True) + + check_values = [0] * n_timelines + + event_metrics_seen = False + + n_iters = 20 + for _ in range(n_iters): + if log.isEnabledFor(logging.DEBUG): + for j in range(n_timelines): + detail = ps_client.timeline_detail(env.initial_tenant, timelines[j]) + log.debug( + f'{branch_names[j]}: RCL={detail["remote_consistent_lsn"]}, LRL={detail["last_record_lsn"]}' + ) + + i = random.randint(0, n_timelines - 1) + log.info(f"Starting endpoint {i}") + endpoints[i].start() + check_values[i] += 1 + res = endpoints[i].safe_psql("UPDATE t SET i = i + 1 RETURNING i") + assert res[0][0] == check_values[i] + + lsn = endpoints[i].safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0] + log.info(f"{branch_names[i]}: LSN={lsn}") + + endpoints[i].stop() + + # update remote_consistent_lsn on pageserver + ps_client.timeline_checkpoint(env.initial_tenant, timelines[i], wait_until_uploaded=True) + + # Do metrics check before restarts, since these will reset to zero 
across a restart + event_metrics_seen |= any( + sk.http_client().get_metric_value( + "safekeeper_eviction_events_started_total", {"kind": "evict"} + ) + or 0 > 0 + and sk.http_client().get_metric_value( + "safekeeper_eviction_events_completed_total", {"kind": "evict"} + ) + or 0 > 0 + and sk.http_client().get_metric_value( + "safekeeper_eviction_events_started_total", {"kind": "restore"} + ) + or 0 > 0 + and sk.http_client().get_metric_value( + "safekeeper_eviction_events_completed_total", {"kind": "restore"} + ) + or 0 > 0 + for sk in env.safekeepers + ) + + # restarting random safekeepers + for sk in env.safekeepers: + if random.random() < restart_chance: + sk.stop().start() + time.sleep(0.5) + + # require at least one successful eviction in at least one safekeeper + # TODO: require eviction in each safekeeper after https://github.com/neondatabase/neon/issues/8148 is fixed + assert any( + sk.log_contains("successfully evicted timeline") + and sk.log_contains("successfully restored evicted timeline") + for sk in env.safekeepers + ) + + assert event_metrics_seen + + +def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilder): + """ + Verify that pulling timeline from a SK with an uploaded partial segment + does not lead to consistency issues: + 1. Start 3 SKs - only use two + 2. Ingest a bit of WAL + 3. Wait for partial to be uploaded + 4. Pull timeline to the third SK + 6. Replace source with destination SK and start compute + 5. Wait for source SK to evict timeline + 6. 
Go back to initial compute SK config and validate that + source SK can unevict the timeline (S3 state is consistent) + """ + neon_env_builder.auth_enabled = True + neon_env_builder.num_safekeepers = 3 + neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage()) + + neon_env_builder.safekeeper_extra_opts = [ + "--enable-offload", + "--delete-offloaded-wal", + "--partial-backup-timeout", + "500ms", + "--control-file-save-interval", + "500ms", + "--eviction-min-resident=500ms", + ] + + env = neon_env_builder.init_start(initial_tenant_conf={"checkpoint_timeout": "100ms"}) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + (src_sk, dst_sk) = (env.safekeepers[0], env.safekeepers[2]) + + log.info("use only first 2 safekeepers, 3rd will be seeded") + endpoint = env.endpoints.create("main") + endpoint.active_safekeepers = [1, 2] + endpoint.start() + endpoint.safe_psql("create table t(key int, value text)") + endpoint.safe_psql("insert into t select generate_series(1, 180000), 'papaya'") + + endpoint.stop() + + def source_partial_segment_uploaded(): + first_segment_name = "000000010000000000000001" + segs = src_sk.list_uploaded_segments(tenant_id, timeline_id) + + candidate_seg = None + for seg in segs: + if "partial" in seg and "sk1" in seg and not seg.startswith(first_segment_name): + candidate_seg = seg + + if candidate_seg is not None: + # The term might change, causing the segment to be gc-ed shortly after, + # so give it a bit of time to make sure it's stable. 
+ time.sleep(2) + + segs = src_sk.list_uploaded_segments(tenant_id, timeline_id) + assert candidate_seg in segs + return candidate_seg + + raise Exception("Partial segment not uploaded yet") + + source_partial_segment = wait_until(15, 1, source_partial_segment_uploaded) + log.info( + f"Uploaded segments before pull are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}" + ) + log.info(f"Tracking source partial segment: {source_partial_segment}") + + src_flush_lsn = src_sk.get_flush_lsn(tenant_id, timeline_id) + log.info(f"flush_lsn on src before pull_timeline: {src_flush_lsn}") + + pageserver_conn_options = {"password": env.auth_keys.generate_tenant_token(tenant_id)} + wait_lsn_force_checkpoint_at( + src_flush_lsn, tenant_id, timeline_id, env.pageserver, pageserver_conn_options + ) + + dst_sk.pull_timeline([src_sk], tenant_id, timeline_id) + + def evicted(): + evictions = src_sk.http_client().get_metric_value( + "safekeeper_eviction_events_completed_total", {"kind": "evict"} + ) + + if evictions is None or evictions == 0: + raise Exception("Eviction did not happen on source safekeeper yet") + + wait_until(30, 1, evicted) + + endpoint.start(safekeepers=[2, 3]) + + def new_partial_segment_uploaded(): + segs = src_sk.list_uploaded_segments(tenant_id, timeline_id) + for seg in segs: + if "partial" in seg and "sk3" in seg: + return seg + + raise Exception("Partial segment not uploaded yet") + + log.info( + f"Uploaded segments before post-pull ingest are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}" + ) + + endpoint.safe_psql("insert into t select generate_series(1, 1000), 'pear'") + wait_until(15, 1, new_partial_segment_uploaded) + + log.info( + f"Uploaded segments after post-pull ingest are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}" + ) + + # Allow for some gc iterations to happen and assert that the original + # uploaded partial segment remains in place. 
+ time.sleep(5) + segs = src_sk.list_uploaded_segments(tenant_id, timeline_id) + assert source_partial_segment in segs + + log.info( + f"Uploaded segments at the end are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}" + ) + + # Restart the endpoint in order to check that the source safekeeper + # can unevict the timeline + endpoint.stop() + endpoint.start(safekeepers=[1, 2]) + + def unevicted(): + unevictions = src_sk.http_client().get_metric_value( + "safekeeper_eviction_events_completed_total", {"kind": "restore"} + ) + + if unevictions is None or unevictions == 0: + raise Exception("Uneviction did not happen on source safekeeper yet") + + wait_until(10, 1, unevicted) diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index 971fad787a..3f0a4a2ff8 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -200,9 +200,8 @@ async def run_restarts_under_load( # assert that at least one transaction has completed in every worker stats.check_progress() - # testing #6530, temporary here - # TODO: remove afer partial backup is enabled by default - victim.start(extra_opts=["--partial-backup-enabled", "--partial-backup-timeout=2s"]) + # testing #6530 + victim.start(extra_opts=["--partial-backup-timeout=2s"]) log.info("Iterations are finished, exiting coroutines...") stats.running = False diff --git a/test_runner/regress/test_walredo_not_left_behind_on_detach.py b/test_runner/regress/test_walredo_not_left_behind_on_detach.py index ad37807dba..375cfcb4fe 100644 --- a/test_runner/regress/test_walredo_not_left_behind_on_detach.py +++ b/test_runner/regress/test_walredo_not_left_behind_on_detach.py @@ -37,7 +37,7 @@ def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder): expected_exception=PageserverApiException, match=f"NotFound: tenant {tenant_id}", ): - pageserver_http.tenant_detach(tenant_id) + 
pageserver_http.tenant_status(tenant_id) # create new nenant tenant_id, _ = env.neon_cli.create_tenant() diff --git a/trace/Cargo.toml b/trace/Cargo.toml deleted file mode 100644 index d6eed3f49c..0000000000 --- a/trace/Cargo.toml +++ /dev/null @@ -1,13 +0,0 @@ -[package] -name = "trace" -version = "0.1.0" -edition.workspace = true -license.workspace = true - -[dependencies] -clap.workspace = true -anyhow.workspace = true - -pageserver_api.workspace = true -utils.workspace = true -workspace_hack.workspace = true diff --git a/trace/src/main.rs b/trace/src/main.rs deleted file mode 100644 index 049f922b6f..0000000000 --- a/trace/src/main.rs +++ /dev/null @@ -1,175 +0,0 @@ -//! A tool for working with read traces generated by the pageserver. -use std::collections::HashMap; -use std::path::PathBuf; -use std::str::FromStr; -use std::{ - fs::{read_dir, File}, - io::BufReader, -}; - -use pageserver_api::models::{ - PagestreamFeMessage, PagestreamGetPageRequest, PagestreamProtocolVersion, -}; -use utils::id::{ConnectionId, TenantId, TimelineId}; - -use clap::{Parser, Subcommand}; - -/// Utils for working with pageserver read traces. For generating -/// traces, see the `trace_read_requests` tenant config option. -#[derive(Parser, Debug)] -#[command(author, version, about, long_about = None)] -struct Args { - /// Path of trace directory - #[arg(short, long)] - path: PathBuf, - - #[command(subcommand)] - command: Command, -} - -/// What to do with the read trace -#[derive(Subcommand, Debug)] -enum Command { - /// List traces in the directory - List, - - /// Print the traces in text format - Dump, - - /// Print stats and anomalies about the traces - Analyze, - - /// Draw the traces in svg format - Draw, - - /// Send the read requests to a pageserver - Replay, -} - -// HACK This function will change and improve as we see what kind of analysis is useful. -// Currently it collects the difference in blkno of consecutive GetPage requests, -// and counts the frequency of each value. 
This information is useful in order to: -// - see how sequential a workload is by seeing how often the delta is 1 -// - detect any prefetching anomalies by looking for negative deltas during seqscan -fn analyze_trace(mut reader: R) { - let mut total = 0; // Total requests traced - let mut cross_rel = 0; // Requests that ask for different rel than previous request - let mut deltas = HashMap::::new(); // Consecutive blkno differences - let mut prev: Option = None; - - // Compute stats - while let Ok(msg) = PagestreamFeMessage::parse(&mut reader, PagestreamProtocolVersion::V2) { - match msg { - PagestreamFeMessage::Exists(_) => {} - PagestreamFeMessage::Nblocks(_) => {} - PagestreamFeMessage::GetSlruSegment(_) => {} - PagestreamFeMessage::GetPage(req) => { - total += 1; - - if let Some(prev) = prev { - if prev.rel == req.rel { - let delta = (req.blkno as i32) - (prev.blkno as i32); - deltas.entry(delta).and_modify(|c| *c += 1).or_insert(1); - } else { - cross_rel += 1; - } - } - prev = Some(req); - } - PagestreamFeMessage::DbSize(_) => {} - }; - } - - // Print stats. - let mut other = deltas.len(); - deltas.retain(|_, count| *count > 300); - other -= deltas.len(); - dbg!(total); - dbg!(cross_rel); - dbg!(other); - dbg!(deltas); -} - -fn dump_trace(mut reader: R) { - while let Ok(msg) = PagestreamFeMessage::parse(&mut reader, PagestreamProtocolVersion::V2) { - println!("{msg:?}"); - } -} - -#[derive(Debug)] -struct TraceFile { - #[allow(dead_code)] - pub tenant_id: TenantId, - - #[allow(dead_code)] - pub timeline_id: TimelineId, - - #[allow(dead_code)] - pub connection_id: ConnectionId, - - pub path: PathBuf, -} - -fn get_trace_files(traces_dir: &PathBuf) -> anyhow::Result> { - let mut trace_files = Vec::::new(); - - // Trace files are organized as {tenant_id}/{timeline_id}/{connection_id} - for tenant_dir in read_dir(traces_dir)? 
{ - let entry = tenant_dir?; - let path = entry.path(); - let tenant_id = TenantId::from_str(path.file_name().unwrap().to_str().unwrap())?; - - for timeline_dir in read_dir(path)? { - let entry = timeline_dir?; - let path = entry.path(); - let timeline_id = TimelineId::from_str(path.file_name().unwrap().to_str().unwrap())?; - - for trace_dir in read_dir(path)? { - let entry = trace_dir?; - let path = entry.path(); - let connection_id = - ConnectionId::from_str(path.file_name().unwrap().to_str().unwrap())?; - - trace_files.push(TraceFile { - tenant_id, - timeline_id, - connection_id, - path, - }); - } - } - } - - Ok(trace_files) -} - -fn main() -> anyhow::Result<()> { - let args = Args::parse(); - - match args.command { - Command::List => { - for trace_file in get_trace_files(&args.path)? { - println!("{trace_file:?}"); - } - } - Command::Dump => { - for trace_file in get_trace_files(&args.path)? { - let file = File::open(trace_file.path.clone())?; - let reader = BufReader::new(file); - dump_trace(reader); - } - } - Command::Analyze => { - for trace_file in get_trace_files(&args.path)? 
{ - println!("analyzing {trace_file:?}"); - let file = File::open(trace_file.path.clone())?; - let reader = BufReader::new(file); - analyze_trace(reader); - } - } - Command::Draw => todo!(), - Command::Replay => todo!(), - } - - Ok(()) -} diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 4c51945a61..3fd7a45f8a 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 4c51945a6167ca06c0169e7a4ca5a8e7ffa3faba +Subproject commit 3fd7a45f8aae85c080df6329e3c85887b7f3a737 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index e22098d86d..46b4b235f3 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit e22098d86d6c40276b6bd75c29133a33fb283ab6 +Subproject commit 46b4b235f38413ab5974bb22c022f9b829257674 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 9837db1578..47a9122a5a 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 9837db157837fcf43ef7348be0017d3a2238cd27 +Subproject commit 47a9122a5a150a3217fafd3f3d4fe8e020ea718a diff --git a/vendor/revisions.json b/vendor/revisions.json index f945ea6d73..6e3e489b5d 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,14 @@ { - "v16": ["16.3", "9837db157837fcf43ef7348be0017d3a2238cd27"], - "v15": ["15.7", "e22098d86d6c40276b6bd75c29133a33fb283ab6"], - "v14": ["14.12", "4c51945a6167ca06c0169e7a4ca5a8e7ffa3faba"] + "v16": [ + "16.3", + "47a9122a5a150a3217fafd3f3d4fe8e020ea718a" + ], + "v15": [ + "15.7", + "46b4b235f38413ab5974bb22c022f9b829257674" + ], + "v14": [ + "14.12", + "3fd7a45f8aae85c080df6329e3c85887b7f3a737" + ] } diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 3c446ecdea..622004b931 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -236,6 +236,7 @@ files: query: | select sum(pg_database_size(datname)) as total from pg_database; + # DEPRECATED - metric_name: lfc_approximate_working_set_size type: gauge help: 'Approximate working set size in pages 
of 8192 bytes' @@ -244,7 +245,21 @@ files: query: | select neon.approximate_working_set_size(false) as approximate_working_set_size; - - metric_name: current_lsn + - metric_name: lfc_approximate_working_set_size_windows + type: gauge + help: 'Approximate working set size in pages of 8192 bytes' + key_labels: [duration] + values: [size] + # NOTE: This is the "public" / "human-readable" version. Here, we supply a small selection + # of durations in a pretty-printed form. + query: | + select + x as duration, + neon.approximate_working_set_size_seconds(extract('epoch' from x::interval)::int) as size + from + (values ('5m'),('15m'),('1h')) as t (x); + + - metric_name: compute_current_lsn type: gauge help: 'Current LSN of the database' key_labels: @@ -257,13 +272,30 @@ files: else (pg_current_wal_lsn() - '0/0')::FLOAT8 end as lsn; + - metric_name: compute_receive_lsn + type: gauge + help: 'Returns the last write-ahead log location that has been received and synced to disk by streaming replication' + key_labels: + values: [lsn] + query: | + SELECT + CASE + WHEN pg_catalog.pg_is_in_recovery() + THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8 + ELSE 0 + END AS lsn; + - metric_name: replication_delay_bytes type: gauge help: 'Bytes between received and replayed LSN' key_labels: values: [replication_delay_bytes] + # We use a GREATEST call here because this calculation can be negative. + # The calculation is not atomic, meaning after we've gotten the receive + # LSN, the replay LSN may have advanced past the receive LSN we + # are using for the calculation. 
query: | - SELECT pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn()) AS replication_delay_bytes; + SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes; - metric_name: replication_delay_seconds type: gauge @@ -293,6 +325,22 @@ files: query: | SELECT checkpoints_timed FROM pg_stat_bgwriter; + - metric_name: compute_logical_snapshot_files + type: gauge + help: 'Number of snapshot files in pg_logical/snapshot' + key_labels: + - tenant_id + - timeline_id + values: [num_logical_snapshot_files] + query: | + SELECT + (SELECT setting FROM pg_settings WHERE name = 'neon.tenant_id') AS tenant_id, + (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, + -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. These + -- temporary snapshot files are renamed to the actual snapshot files after they are + -- completely built. We only WAL-log the completely built snapshot files. + (SELECT COUNT(*) FROM pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS num_logical_snapshot_files; + # In all the below metrics, we cast LSNs to floats because Prometheus only supports floats. # It's probably fine because float64 can store integers from -2^53 to +2^53 exactly. @@ -377,13 +425,19 @@ files: query: | select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit; - - metric_name: lfc_approximate_working_set_size + - metric_name: lfc_approximate_working_set_size_windows type: gauge help: 'Approximate working set size in pages of 8192 bytes' - key_labels: - values: [approximate_working_set_size] + key_labels: [duration_seconds] + values: [size] + # NOTE: This is the "internal" / "machine-readable" version. This outputs the working set + # size looking back 1..60 minutes, labeled with the window length in seconds.
query: | - select neon.approximate_working_set_size(false) as approximate_working_set_size; + select + x::text as duration_seconds, + neon.approximate_working_set_size_seconds(x) as size + from + (select generate_series * 60 as x from generate_series(1, 60)) as t (x); build: | # Build cgroup-tools # @@ -391,7 +445,7 @@ build: | # libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-monitor # requires cgroup v2, so we'll build cgroup-tools ourselves. FROM debian:bullseye-slim as libcgroup-builder - ENV LIBCGROUP_VERSION v2.0.3 + ENV LIBCGROUP_VERSION=v2.0.3 RUN set -exu \ && apt update \ @@ -435,7 +489,7 @@ build: | pkg-config # Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc) - ENV PGBOUNCER_TAG pgbouncer_1_22_1 + ENV PGBOUNCER_TAG=pgbouncer_1_22_1 RUN set -e \ && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \ && cd pgbouncer \ diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index df16c71789..20693ad63d 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -25,18 +25,22 @@ axum = { version = "0.6", features = ["ws"] } base64 = { version = "0.21", features = ["alloc"] } base64ct = { version = "1", default-features = false, features = ["std"] } bytes = { version = "1", features = ["serde"] } +camino = { version = "1", default-features = false, features = ["serde1"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } clap = { version = "4", features = ["derive", "string"] } clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] } crossbeam-utils = { version = "0.8" } +crypto-bigint = { version = "0.5", features = ["generic-array", "zeroize"] } +der = { version = "0.7", default-features = false, features = ["oid", "pem", "std"] } +deranged = { version = "0.3", default-features = 
false, features = ["powerfmt", "serde", "std"] } +digest = { version = "0.10", features = ["mac", "oid", "std"] } either = { version = "1" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } futures-channel = { version = "0.3", features = ["sink"] } -futures-core = { version = "0.3" } futures-executor = { version = "0.3" } futures-io = { version = "0.3" } -futures-sink = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } +generic-array = { version = "0.14", default-features = false, features = ["more_lengths", "zeroize"] } getrandom = { version = "0.2", default-features = false, features = ["std"] } hashbrown = { version = "0.14", features = ["raw"] } hex = { version = "0.4", features = ["serde"] } @@ -44,6 +48,7 @@ hmac = { version = "0.12", default-features = false, features = ["reset"] } hyper = { version = "0.14", features = ["full"] } indexmap = { version = "1", default-features = false, features = ["std"] } itertools = { version = "0.10" } +lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } @@ -64,23 +69,24 @@ rustls = { version = "0.21", features = ["dangerous_configuration"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["raw_value"] } -sha2 = { version = "0.10", features = ["asm"] } +sha2 = { version = "0.10", features = ["asm", "oid"] } +signature = { version = "2", default-features = false, features = ["digest", "rand_core", "std"] } smallvec = { version = "1", default-features = false, features = ["const_new", "write"] } +spki = { version = "0.7", default-features = false, features = ["pem", "std"] } subtle = { version = "2" } sync_wrapper = { version = "0.1", default-features = false, features = 
["futures"] } +tikv-jemalloc-sys = { version = "0.5" } time = { version = "0.3", features = ["macros", "serde-well-known"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] } tokio-rustls = { version = "0.24" } tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] } -toml_datetime = { version = "0.6", default-features = false, features = ["serde"] } -toml_edit = { version = "0.19", features = ["serde"] } tonic = { version = "0.9", features = ["tls-roots"] } tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } url = { version = "2", features = ["serde"] } uuid = { version = "1", features = ["serde", "v4", "v7"] } -zeroize = { version = "1", features = ["derive"] } +zeroize = { version = "1", features = ["derive", "serde"] } zstd = { version = "0.13" } zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] } zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] } @@ -96,6 +102,7 @@ getrandom = { version = "0.2", default-features = false, features = ["std"] } hashbrown = { version = "0.14", features = ["raw"] } indexmap = { version = "1", default-features = false, features = ["std"] } itertools = { version = "0.10" } +lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } @@ -105,7 +112,9 @@ num-integer = { version = "0.1", features = ["i128"] } num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } parquet = { git = "https://github.com/apache/arrow-rs", branch = "master", 
default-features = false, features = ["zstd"] } +proc-macro2 = { version = "1" } prost = { version = "0.11" } +quote = { version = "1" } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" } @@ -113,8 +122,6 @@ serde = { version = "1", features = ["alloc", "derive"] } syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit"] } syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } time-macros = { version = "0.2", default-features = false, features = ["formatting", "parsing", "serde"] } -toml_datetime = { version = "0.6", default-features = false, features = ["serde"] } -toml_edit = { version = "0.19", features = ["serde"] } zstd = { version = "0.13" } zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] } zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] }