diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000..6ba6b3c887 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +# allows for nicer hunk headers with git show +*.rs diff=rust diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index daaedf6d11..9d39ab6ad7 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -131,8 +131,8 @@ runs: exit 1 fi if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then - # -n16 uses sixteen processes to run tests via pytest-xdist - EXTRA_PARAMS="-n16 $EXTRA_PARAMS" + # -n sets the number of parallel processes that pytest-xdist will run + EXTRA_PARAMS="-n12 $EXTRA_PARAMS" # --dist=loadgroup points tests marked with @pytest.mark.xdist_group # to the same worker to make @pytest.mark.order work with xdist diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml new file mode 100644 index 0000000000..26e234a04d --- /dev/null +++ b/.github/workflows/_build-and-test-locally.yml @@ -0,0 +1,288 @@ +name: Build and Test Locally + +on: + workflow_call: + inputs: + arch: + description: 'x64 or arm64' + required: true + type: string + build-tag: + description: 'build tag' + required: true + type: string + build-tools-image: + description: 'build-tools image' + required: true + type: string + build-type: + description: 'debug or release' + required: true + type: string + +defaults: + run: + shell: bash -euxo pipefail {0} + +env: + RUST_BACKTRACE: 1 + COPT: '-Werror' + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + +jobs: + build-neon: + runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }} + container: + image: ${{ inputs.build-tools-image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ 
secrets.NEON_DOCKERHUB_PASSWORD }} + # Raise locked memory limit for tokio-epoll-uring. + # On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12), + # io_uring will account the memory of the CQ and SQ as locked. + # More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391 + options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 + env: + BUILD_TYPE: ${{ inputs.build-type }} + GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }} + BUILD_TAG: ${{ inputs.build-tag }} + + steps: + - name: Fix git ownership + run: | + # Workaround for `fatal: detected dubious ownership in repository at ...` + # + # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers + # Ref https://github.com/actions/checkout/issues/785 + # + git config --global --add safe.directory ${{ github.workspace }} + git config --global --add safe.directory ${GITHUB_WORKSPACE} + for r in 14 15 16; do + git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" + git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" + done + + - uses: actions/checkout@v4 + with: + submodules: true + fetch-depth: 1 + + - name: Set pg 14 revision for caching + id: pg_v14_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT + + - name: Set pg 15 revision for caching + id: pg_v15_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT + + - name: Set pg 16 revision for caching + id: pg_v16_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT + + # Set some environment variables used by all the steps. + # + # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc. + # It also includes --features, if any + # + # CARGO_FEATURES is passed to "cargo metadata". 
It is separate from CARGO_FLAGS, + # because "cargo metadata" doesn't accept --release or --debug options + # + # We run tests with additional features, that are turned off by default (e.g. in release builds), see + # corresponding Cargo.toml files for their descriptions. + - name: Set env variables + run: | + CARGO_FEATURES="--features testing" + if [[ $BUILD_TYPE == "debug" ]]; then + cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run" + CARGO_FLAGS="--locked" + elif [[ $BUILD_TYPE == "release" ]]; then + cov_prefix="" + CARGO_FLAGS="--locked --release" + fi + { + echo "cov_prefix=${cov_prefix}" + echo "CARGO_FEATURES=${CARGO_FEATURES}" + echo "CARGO_FLAGS=${CARGO_FLAGS}" + echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" + } >> $GITHUB_ENV + + - name: Cache postgres v14 build + id: cache_pg_14 + uses: actions/cache@v4 + with: + path: pg_install/v14 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + + - name: Cache postgres v15 build + id: cache_pg_15 + uses: actions/cache@v4 + with: + path: pg_install/v15 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + + - name: Cache postgres v16 build + id: cache_pg_16 + uses: actions/cache@v4 + with: + path: pg_install/v16 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + + - name: Build postgres v14 + if: steps.cache_pg_14.outputs.cache-hit != 'true' + run: mold -run make postgres-v14 -j$(nproc) + + - name: Build postgres v15 + if: steps.cache_pg_15.outputs.cache-hit != 'true' + run: mold -run make postgres-v15 -j$(nproc) + + - name: Build postgres v16 + if: steps.cache_pg_16.outputs.cache-hit != 'true' + run: mold -run make postgres-v16 
-j$(nproc) + + - name: Build neon extensions + run: mold -run make neon-pg-ext -j$(nproc) + + - name: Build walproposer-lib + run: mold -run make walproposer-lib -j$(nproc) + + - name: Run cargo build + run: | + PQ_LIB_DIR=$(pwd)/pg_install/v16/lib + export PQ_LIB_DIR + ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests + + # Do install *before* running rust tests because they might recompile the + # binaries with different features/flags. + - name: Install rust binaries + run: | + # Install target binaries + mkdir -p /tmp/neon/bin/ + binaries=$( + ${cov_prefix} cargo metadata $CARGO_FEATURES --format-version=1 --no-deps | + jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name' + ) + for bin in $binaries; do + SRC=target/$BUILD_TYPE/$bin + DST=/tmp/neon/bin/$bin + cp "$SRC" "$DST" + done + + # Install test executables and write list of all binaries (for code coverage) + if [[ $BUILD_TYPE == "debug" ]]; then + # Keep bloated coverage data files away from the rest of the artifact + mkdir -p /tmp/coverage/ + + mkdir -p /tmp/neon/test_bin/ + + test_exe_paths=$( + ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES --message-format=json --no-run | + jq -r '.executable | select(. != null)' + ) + for bin in $test_exe_paths; do + SRC=$bin + DST=/tmp/neon/test_bin/$(basename $bin) + + # We don't need debug symbols for code coverage, so strip them out to make + # the artifact smaller. 
+ strip "$SRC" -o "$DST" + echo "$DST" >> /tmp/coverage/binaries.list + done + + for bin in $binaries; do + echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list + done + fi + + - name: Run rust tests + env: + NEXTEST_RETRIES: 3 + run: | + PQ_LIB_DIR=$(pwd)/pg_install/v16/lib + export PQ_LIB_DIR + LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib + export LD_LIBRARY_PATH + + #nextest does not yet support running doctests + cargo test --doc $CARGO_FLAGS $CARGO_FEATURES + + for io_engine in std-fs tokio-epoll-uring ; do + NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES + done + + # Run separate tests for real S3 + export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty + export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests + export REMOTE_STORAGE_S3_REGION=eu-central-1 + ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_s3)' + + # Run separate tests for real Azure Blob Storage + # XXX: replace region with `eu-central-1`-like region + export ENABLE_REAL_AZURE_REMOTE_STORAGE=y + export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}" + export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}" + export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" + export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" + ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)' + + - name: Install postgres binaries + run: cp -a pg_install /tmp/neon/pg_install + + - name: Upload Neon artifact + uses: ./.github/actions/upload + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-artifact + path: /tmp/neon + + # XXX: keep this after the binaries.list is formed, so the coverage can properly work later + - name: Merge and upload coverage data + if: inputs.build-type == 'debug' + uses: 
./.github/actions/save-coverage-data + + regress-tests: + # Run test on x64 only + if: inputs.arch == 'x64' + needs: [ build-neon ] + runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }} + container: + image: ${{ inputs.build-tools-image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + # for changed limits, see comments on `options:` earlier in this file + options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 + strategy: + fail-fast: false + matrix: + pg_version: [ v14, v15, v16 ] + steps: + - uses: actions/checkout@v4 + with: + submodules: true + fetch-depth: 1 + + - name: Pytest regression tests + uses: ./.github/actions/run-python-test-set + timeout-minutes: 60 + with: + build_type: ${{ inputs.build-type }} + test_selection: regress + needs_postgres_source: true + run_with_real_s3: true + real_s3_bucket: neon-github-ci-tests + real_s3_region: eu-central-1 + rerun_flaky: true + pg_version: ${{ matrix.pg_version }} + env: + TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} + CHECK_ONDISK_DATA_COMPATIBILITY: nonempty + BUILD_TAG: ${{ inputs.build-tag }} + PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring + + # Temporarily disable this step until we figure out why it's so flaky + # Ref https://github.com/neondatabase/neon/issues/4540 + - name: Merge and upload coverage data + if: | + false && + inputs.build-type == 'debug' && matrix.pg_version == 'v14' + uses: ./.github/actions/save-coverage-data diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index c132b5b513..5ffdb29fe6 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -261,8 +261,7 @@ jobs: }' if [ "$(date +%A)" = "Saturday" ]; then - matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": 
"10gb"}, - { "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-aurora", "db_size": "50gb"}]') + matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb"}]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index a69686bf2a..76fc58151a 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -72,6 +72,12 @@ jobs: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + - uses: docker/login-action@v3 + with: + registry: cache.neon.build + username: ${{ secrets.NEON_CI_DOCKERCACHE_USERNAME }} + password: ${{ secrets.NEON_CI_DOCKERCACHE_PASSWORD }} + - uses: docker/build-push-action@v6 with: context: . @@ -79,8 +85,8 @@ jobs: push: true pull: true file: Dockerfile.build-tools - cache-from: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/build-tools:cache-{0},mode=max', matrix.arch) || '' }} + cache-from: type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0},mode=max', matrix.arch) || '' }} tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }} - name: Remove custom docker config directory diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index cb7655e039..872c1fbb39 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -125,7 +125,11 @@ jobs: check-codestyle-rust: needs: [ check-permissions, build-build-tools-image ] - runs-on: [ self-hosted, gen3, small ] + strategy: + matrix: + arch: [ x64, arm64 
] + runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} + container: image: ${{ needs.build-build-tools-image.outputs.image }} credentials: @@ -193,291 +197,27 @@ jobs: if: ${{ !cancelled() }} run: cargo deny check --hide-inclusion-graph - build-neon: - needs: [ check-permissions, tag, build-build-tools-image ] - runs-on: [ self-hosted, gen3, large ] - container: - image: ${{ needs.build-build-tools-image.outputs.image }} - credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - # Raise locked memory limit for tokio-epoll-uring. - # On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12), - # io_uring will account the memory of the CQ and SQ as locked. - # More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391 - options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 + build-and-test-locally: + needs: [ tag, build-build-tools-image ] strategy: fail-fast: false matrix: - build_type: [ debug, release ] - env: - BUILD_TYPE: ${{ matrix.build_type }} - GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }} - BUILD_TAG: ${{ needs.tag.outputs.build-tag }} - - steps: - - name: Fix git ownership - run: | - # Workaround for `fatal: detected dubious ownership in repository at ...` - # - # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers - # Ref https://github.com/actions/checkout/issues/785 - # - git config --global --add safe.directory ${{ github.workspace }} - git config --global --add safe.directory ${GITHUB_WORKSPACE} - for r in 14 15 16; do - git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" - git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" - done - - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: true - fetch-depth: 1 - - - name: 
Set pg 14 revision for caching - id: pg_v14_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT - - - name: Set pg 15 revision for caching - id: pg_v15_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT - - - name: Set pg 16 revision for caching - id: pg_v16_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT - - # Set some environment variables used by all the steps. - # - # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc. - # It also includes --features, if any - # - # CARGO_FEATURES is passed to "cargo metadata". It is separate from CARGO_FLAGS, - # because "cargo metadata" doesn't accept --release or --debug options - # - # We run tests with addtional features, that are turned off by default (e.g. in release builds), see - # corresponding Cargo.toml files for their descriptions. - - name: Set env variables - run: | - CARGO_FEATURES="--features testing" - if [[ $BUILD_TYPE == "debug" ]]; then - cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run" - CARGO_FLAGS="--locked" - elif [[ $BUILD_TYPE == "release" ]]; then - cov_prefix="" - CARGO_FLAGS="--locked --release" - fi - { - echo "cov_prefix=${cov_prefix}" - echo "CARGO_FEATURES=${CARGO_FEATURES}" - echo "CARGO_FLAGS=${CARGO_FLAGS}" - echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" - } >> $GITHUB_ENV - - # Disabled for now - # Don't include the ~/.cargo/registry/src directory. It contains just - # uncompressed versions of the crates in ~/.cargo/registry/cache - # directory, and it's faster to let 'cargo' to rebuild it from the - # compressed crates. 
-# - name: Cache cargo deps -# id: cache_cargo -# uses: actions/cache@v4 -# with: -# path: | -# ~/.cargo/registry/ -# !~/.cargo/registry/src -# ~/.cargo/git/ -# target/ -# # Fall back to older versions of the key, if no cache for current Cargo.lock was found -# key: | -# v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} -# v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}- - - - name: Cache postgres v14 build - id: cache_pg_14 - uses: actions/cache@v4 - with: - path: pg_install/v14 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - - - name: Cache postgres v15 build - id: cache_pg_15 - uses: actions/cache@v4 - with: - path: pg_install/v15 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - - - name: Cache postgres v16 build - id: cache_pg_16 - uses: actions/cache@v4 - with: - path: pg_install/v16 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - - - name: Build postgres v14 - if: steps.cache_pg_14.outputs.cache-hit != 'true' - run: mold -run make postgres-v14 -j$(nproc) - - - name: Build postgres v15 - if: steps.cache_pg_15.outputs.cache-hit != 'true' - run: mold -run make postgres-v15 -j$(nproc) - - - name: Build postgres v16 - if: steps.cache_pg_16.outputs.cache-hit != 'true' - run: mold -run make postgres-v16 -j$(nproc) - - - name: Build neon extensions - run: mold -run make neon-pg-ext -j$(nproc) - - - name: Build walproposer-lib - run: mold -run make walproposer-lib -j$(nproc) - - - name: Run cargo build - run: | - PQ_LIB_DIR=$(pwd)/pg_install/v16/lib - export 
PQ_LIB_DIR - ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests - - # Do install *before* running rust tests because they might recompile the - # binaries with different features/flags. - - name: Install rust binaries - run: | - # Install target binaries - mkdir -p /tmp/neon/bin/ - binaries=$( - ${cov_prefix} cargo metadata $CARGO_FEATURES --format-version=1 --no-deps | - jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name' - ) - for bin in $binaries; do - SRC=target/$BUILD_TYPE/$bin - DST=/tmp/neon/bin/$bin - cp "$SRC" "$DST" - done - - # Install test executables and write list of all binaries (for code coverage) - if [[ $BUILD_TYPE == "debug" ]]; then - # Keep bloated coverage data files away from the rest of the artifact - mkdir -p /tmp/coverage/ - - mkdir -p /tmp/neon/test_bin/ - - test_exe_paths=$( - ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES --message-format=json --no-run | - jq -r '.executable | select(. != null)' - ) - for bin in $test_exe_paths; do - SRC=$bin - DST=/tmp/neon/test_bin/$(basename $bin) - - # We don't need debug symbols for code coverage, so strip them out to make - # the artifact smaller. 
- strip "$SRC" -o "$DST" - echo "$DST" >> /tmp/coverage/binaries.list - done - - for bin in $binaries; do - echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list - done - fi - - - name: Run rust tests - env: - NEXTEST_RETRIES: 3 - run: | - PQ_LIB_DIR=$(pwd)/pg_install/v16/lib - export PQ_LIB_DIR - LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib - export LD_LIBRARY_PATH - - #nextest does not yet support running doctests - cargo test --doc $CARGO_FLAGS $CARGO_FEATURES - - for io_engine in std-fs tokio-epoll-uring ; do - NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES - done - - # Run separate tests for real S3 - export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty - export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests - export REMOTE_STORAGE_S3_REGION=eu-central-1 - ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_s3)' - - # Run separate tests for real Azure Blob Storage - # XXX: replace region with `eu-central-1`-like region - export ENABLE_REAL_AZURE_REMOTE_STORAGE=y - export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}" - export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}" - export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" - export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" - ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)' - - - name: Install postgres binaries - run: cp -a pg_install /tmp/neon/pg_install - - - name: Upload Neon artifact - uses: ./.github/actions/upload - with: - name: neon-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-artifact - path: /tmp/neon - - # XXX: keep this after the binaries.list is formed, so the coverage can properly work later - - name: Merge and upload coverage data - if: matrix.build_type == 'debug' - uses: 
./.github/actions/save-coverage-data - - regress-tests: - needs: [ check-permissions, build-neon, build-build-tools-image, tag ] - runs-on: [ self-hosted, gen3, large ] - container: - image: ${{ needs.build-build-tools-image.outputs.image }} - credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - # for changed limits, see comments on `options:` earlier in this file - options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 - strategy: - fail-fast: false - matrix: - build_type: [ debug, release ] - pg_version: [ v14, v15, v16 ] - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: true - fetch-depth: 1 - - - name: Pytest regression tests - uses: ./.github/actions/run-python-test-set - timeout-minutes: 60 - with: - build_type: ${{ matrix.build_type }} - test_selection: regress - needs_postgres_source: true - run_with_real_s3: true - real_s3_bucket: neon-github-ci-tests - real_s3_region: eu-central-1 - rerun_flaky: true - pg_version: ${{ matrix.pg_version }} - env: - TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} - CHECK_ONDISK_DATA_COMPATIBILITY: nonempty - BUILD_TAG: ${{ needs.tag.outputs.build-tag }} - PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring - PAGESERVER_GET_VECTORED_IMPL: vectored - PAGESERVER_GET_IMPL: vectored - PAGESERVER_VALIDATE_VEC_GET: true - - # Temporary disable this step until we figure out why it's so flaky - # Ref https://github.com/neondatabase/neon/issues/4540 - - name: Merge and upload coverage data - if: | - false && - matrix.build_type == 'debug' && matrix.pg_version == 'v14' - uses: ./.github/actions/save-coverage-data + arch: [ x64 ] + build-type: [ debug, release ] + include: + - build-type: release + arch: arm64 + uses: ./.github/workflows/_build-and-test-locally.yml + with: + arch: ${{ matrix.arch }} + build-tools-image: ${{ needs.build-build-tools-image.outputs.image }} + build-tag: ${{ 
needs.tag.outputs.build-tag }} + build-type: ${{ matrix.build-type }} + secrets: inherit + # Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking get-benchmarks-durations: + if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') outputs: json: ${{ steps.get-benchmark-durations.outputs.json }} needs: [ check-permissions, build-build-tools-image ] @@ -488,7 +228,6 @@ jobs: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init - if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') steps: - name: Checkout uses: actions/checkout@v4 @@ -513,7 +252,8 @@ jobs: echo "json=$(jq --compact-output '.' /tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT benchmarks: - needs: [ check-permissions, build-neon, build-build-tools-image, get-benchmarks-durations ] + if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') + needs: [ check-permissions, build-and-test-locally, build-build-tools-image, get-benchmarks-durations ] runs-on: [ self-hosted, gen3, small ] container: image: ${{ needs.build-build-tools-image.outputs.image }} @@ -522,7 +262,6 @@ jobs: password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} # for changed limits, see comments on `options:` earlier in this file options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 - if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') strategy: fail-fast: false matrix: @@ -547,9 +286,6 @@ jobs: PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}" PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring - PAGESERVER_GET_VECTORED_IMPL: vectored - PAGESERVER_GET_IMPL: vectored - PAGESERVER_VALIDATE_VEC_GET: false # XXX: no coverage data handling here, since 
benchmarks are run on release builds, # while coverage is currently collected for the debug ones @@ -570,7 +306,7 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} create-test-report: - needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-build-tools-image ] + needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image ] if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }} outputs: report-url: ${{ steps.create-allure-report.outputs.report-url }} @@ -621,7 +357,7 @@ jobs: }) coverage-report: - needs: [ check-permissions, regress-tests, build-build-tools-image ] + needs: [ check-permissions, build-build-tools-image, build-and-test-locally ] runs-on: [ self-hosted, gen3, small ] container: image: ${{ needs.build-build-tools-image.outputs.image }} @@ -760,6 +496,12 @@ jobs: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + - uses: docker/login-action@v3 + with: + registry: cache.neon.build + username: ${{ secrets.NEON_CI_DOCKERCACHE_USERNAME }} + password: ${{ secrets.NEON_CI_DOCKERCACHE_PASSWORD }} + - uses: docker/build-push-action@v6 with: context: . 
@@ -771,8 +513,8 @@ jobs: push: true pull: true file: Dockerfile - cache-from: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon:cache-{0},mode=max', matrix.arch) || '' }} + cache-from: type=registry,ref=cache.neon.build/neon:cache-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon:cache-{0},mode=max', matrix.arch) || '' }} tags: | neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} @@ -851,6 +593,12 @@ jobs: username: ${{ secrets.AWS_ACCESS_KEY_DEV }} password: ${{ secrets.AWS_SECRET_KEY_DEV }} + - uses: docker/login-action@v3 + with: + registry: cache.neon.build + username: ${{ secrets.NEON_CI_DOCKERCACHE_USERNAME }} + password: ${{ secrets.NEON_CI_DOCKERCACHE_PASSWORD }} + - name: Build compute-node image uses: docker/build-push-action@v6 with: @@ -864,8 +612,8 @@ jobs: push: true pull: true file: Dockerfile.compute-node - cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/compute-node-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }} + cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }} tags: | neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} @@ -884,8 +632,8 @@ jobs: pull: true file: Dockerfile.compute-node target: neon-pg-ext-test - cache-from: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon-test-extensions-{0}:cache-{1},mode=max', 
matrix.version, matrix.arch) || '' }} + cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }} tags: | neondatabase/neon-test-extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }} @@ -1223,7 +971,7 @@ jobs: exit 1 deploy: - needs: [ check-permissions, promote-images, tag, regress-tests, trigger-custom-extensions-build-and-wait ] + needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ] if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy' runs-on: [ self-hosted, gen3, small ] @@ -1324,7 +1072,7 @@ jobs: }) promote-compatibility-data: - needs: [ check-permissions, promote-images, tag, regress-tests ] + needs: [ check-permissions, promote-images, tag, build-and-test-locally ] if: github.ref_name == 'release' runs-on: [ self-hosted, gen3, small ] @@ -1363,7 +1111,7 @@ jobs: done pin-build-tools-image: - needs: [ build-build-tools-image, promote-images, regress-tests ] + needs: [ build-build-tools-image, promote-images, build-and-test-locally ] if: github.ref_name == 'main' uses: ./.github/workflows/pin-build-tools-image.yml with: @@ -1385,7 +1133,7 @@ jobs: needs: - check-codestyle-python - check-codestyle-rust - - regress-tests + - build-and-test-locally - test-images runs-on: ubuntu-22.04 steps: diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 11ff634b6c..d4870e16ad 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -133,221 +133,6 @@ jobs: - name: Check that no warnings are produced run: ./run_clippy.sh - check-linux-arm-build: - needs: [ check-permissions, build-build-tools-image ] - timeout-minutes: 
90 - runs-on: [ self-hosted, small-arm64 ] - - env: - # Use release build only, to have less debug info around - # Hence keeping target/ (and general cache size) smaller - BUILD_TYPE: release - CARGO_FEATURES: --features testing - CARGO_FLAGS: --release - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} - - container: - image: ${{ needs.build-build-tools-image.outputs.image }} - credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - options: --init - - steps: - - name: Fix git ownership - run: | - # Workaround for `fatal: detected dubious ownership in repository at ...` - # - # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers - # Ref https://github.com/actions/checkout/issues/785 - # - git config --global --add safe.directory ${{ github.workspace }} - git config --global --add safe.directory ${GITHUB_WORKSPACE} - for r in 14 15 16; do - git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" - git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" - done - - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: true - fetch-depth: 1 - - - name: Set pg 14 revision for caching - id: pg_v14_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT - - - name: Set pg 15 revision for caching - id: pg_v15_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT - - - name: Set pg 16 revision for caching - id: pg_v16_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT - - - name: Set env variables - run: | - echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV - - - name: Cache postgres v14 build - id: cache_pg_14 - uses: actions/cache@v4 - with: - path: pg_install/v14 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE 
}}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - - name: Cache postgres v15 build - id: cache_pg_15 - uses: actions/cache@v4 - with: - path: pg_install/v15 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - - name: Cache postgres v16 build - id: cache_pg_16 - uses: actions/cache@v4 - with: - path: pg_install/v16 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - - name: Build postgres v14 - if: steps.cache_pg_14.outputs.cache-hit != 'true' - run: mold -run make postgres-v14 -j$(nproc) - - - name: Build postgres v15 - if: steps.cache_pg_15.outputs.cache-hit != 'true' - run: mold -run make postgres-v15 -j$(nproc) - - - name: Build postgres v16 - if: steps.cache_pg_16.outputs.cache-hit != 'true' - run: mold -run make postgres-v16 -j$(nproc) - - - name: Build neon extensions - run: mold -run make neon-pg-ext -j$(nproc) - - - name: Build walproposer-lib - run: mold -run make walproposer-lib -j$(nproc) - - - name: Run cargo build - run: | - PQ_LIB_DIR=$(pwd)/pg_install/v16/lib - export PQ_LIB_DIR - mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests -j$(nproc) - - - name: Run cargo test - env: - NEXTEST_RETRIES: 3 - run: | - PQ_LIB_DIR=$(pwd)/pg_install/v16/lib - export PQ_LIB_DIR - LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib - export LD_LIBRARY_PATH - - cargo nextest run $CARGO_FEATURES -j$(nproc) - - # Run separate tests for real S3 - export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty - export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests - export REMOTE_STORAGE_S3_REGION=eu-central-1 - # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - cargo nextest run --package remote_storage --test test_real_s3 -j$(nproc) - - # Run separate tests for real Azure Blob Storage - # XXX: replace region with `eu-central-1`-like 
region - export ENABLE_REAL_AZURE_REMOTE_STORAGE=y - export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}" - export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}" - export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" - export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" - # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - cargo nextest run --package remote_storage --test test_real_azure -j$(nproc) - - check-codestyle-rust-arm: - needs: [ check-permissions, build-build-tools-image ] - timeout-minutes: 90 - runs-on: [ self-hosted, small-arm64 ] - - container: - image: ${{ needs.build-build-tools-image.outputs.image }} - credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - options: --init - - strategy: - fail-fast: false - matrix: - build_type: [ debug, release ] - - steps: - - name: Fix git ownership - run: | - # Workaround for `fatal: detected dubious ownership in repository at ...` - # - # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers - # Ref https://github.com/actions/checkout/issues/785 - # - git config --global --add safe.directory ${{ github.workspace }} - git config --global --add safe.directory ${GITHUB_WORKSPACE} - for r in 14 15 16; do - git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" - git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" - done - - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: true - fetch-depth: 1 - - # Some of our rust modules use FFI and need those to be checked - - name: Get postgres headers - run: make postgres-headers -j$(nproc) - - # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations. 
- # This will catch compiler & clippy warnings in all feature combinations. - # TODO: use cargo hack for build and test as well, but, that's quite expensive. - # NB: keep clippy args in sync with ./run_clippy.sh - - run: | - CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")" - if [ "$CLIPPY_COMMON_ARGS" = "" ]; then - echo "No clippy args found in .neon_clippy_args" - exit 1 - fi - echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV - - - name: Run cargo clippy (debug) - if: matrix.build_type == 'debug' - run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS - - name: Run cargo clippy (release) - if: matrix.build_type == 'release' - run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS - - - name: Check documentation generation - if: matrix.build_type == 'release' - run: cargo doc --workspace --no-deps --document-private-items -j$(nproc) - env: - RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links" - - # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run - - name: Check formatting - if: ${{ !cancelled() && matrix.build_type == 'release' }} - run: cargo fmt --all -- --check - - # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci - - name: Check rust dependencies - if: ${{ !cancelled() && matrix.build_type == 'release' }} - run: | - cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date - cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack - - # https://github.com/EmbarkStudios/cargo-deny - - name: Check rust licenses/bans/advisories/sources - if: ${{ !cancelled() && matrix.build_type == 'release' }} - run: cargo deny check - gather-rust-build-stats: needs: [ check-permissions, build-build-tools-image ] if: | diff --git a/CODEOWNERS b/CODEOWNERS index af2fa6088e..606dbb4e22 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,13 
+1,13 @@ /compute_tools/ @neondatabase/control-plane @neondatabase/compute /storage_controller @neondatabase/storage /libs/pageserver_api/ @neondatabase/storage -/libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers +/libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage /libs/remote_storage/ @neondatabase/storage -/libs/safekeeper_api/ @neondatabase/safekeepers +/libs/safekeeper_api/ @neondatabase/storage /libs/vm_monitor/ @neondatabase/autoscaling /pageserver/ @neondatabase/storage /pgxn/ @neondatabase/compute -/pgxn/neon/ @neondatabase/compute @neondatabase/safekeepers +/pgxn/neon/ @neondatabase/compute @neondatabase/storage /proxy/ @neondatabase/proxy -/safekeeper/ @neondatabase/safekeepers +/safekeeper/ @neondatabase/storage /vendor/ @neondatabase/compute diff --git a/Cargo.lock b/Cargo.lock index 2505d4d3ed..2b56095bc8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -261,15 +261,6 @@ version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c59bdb34bc650a32731b31bd8f0829cc15d24a708ee31559e0bb34f2bc320cba" -[[package]] -name = "atomic-polyfill" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c314e70d181aa6053b26e3f7fbf86d1dfff84f816a6175b967666b3506ef7289" -dependencies = [ - "critical-section", -] - [[package]] name = "atomic-take" version = "1.1.0" @@ -1451,12 +1442,6 @@ dependencies = [ "itertools", ] -[[package]] -name = "critical-section" -version = "1.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6548a0ad5d2549e111e1f6a11a6c2e2d00ce6a3dafe22948d67c2b443f775e52" - [[package]] name = "crossbeam-channel" version = "0.5.8" @@ -2282,15 +2267,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "hash32" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47d60b12902ba28e2730cd37e95b8c9223af2808df9e902d4df49588d1470606" -dependencies = [ - "byteorder", -] - [[package]] 
name = "hashbrown" version = "0.12.3" @@ -2339,18 +2315,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "heapless" -version = "0.8.0" -source = "git+https://github.com/japaric/heapless.git?rev=644653bf3b831c6bb4963be2de24804acf5e5001#644653bf3b831c6bb4963be2de24804acf5e5001" -dependencies = [ - "atomic-polyfill", - "hash32", - "rustc_version", - "spin 0.9.8", - "stable_deref_trait", -] - [[package]] name = "heck" version = "0.4.1" @@ -2384,16 +2348,6 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6fe2267d4ed49bc07b63801559be28c718ea06c4738b7a03c94df7386d2cde46" -[[package]] -name = "histogram" -version = "0.7.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e673d137229619d5c2c8903b6ed5852b43636c0017ff2e66b1aafb8ccf04b80b" -dependencies = [ - "serde", - "thiserror", -] - [[package]] name = "hmac" version = "0.12.1" @@ -4658,6 +4612,7 @@ name = "remote_storage" version = "0.1.0" dependencies = [ "anyhow", + "async-stream", "async-trait", "aws-config", "aws-credential-types", @@ -5700,9 +5655,6 @@ name = "spin" version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" -dependencies = [ - "lock_api", -] [[package]] name = "spki" @@ -5724,12 +5676,6 @@ dependencies = [ "der 0.7.8", ] -[[package]] -name = "stable_deref_trait" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" - [[package]] name = "static_assertions" version = "1.1.0" @@ -5847,7 +5793,6 @@ dependencies = [ "futures", "futures-util", "hex", - "histogram", "humantime", "itertools", "once_cell", @@ -6827,7 +6772,6 @@ dependencies = [ "criterion", "fail", "futures", - "heapless", "hex", "hex-literal", "humantime", diff --git a/Cargo.toml b/Cargo.toml index 615f5472ec..7749378114 100644 --- a/Cargo.toml 
+++ b/Cargo.toml @@ -204,9 +204,6 @@ postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } -## Other git libraries -heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending - ## Local libraries compute_api = { version = "0.1", path = "./libs/compute_api/" } consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" } diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index 4826b7914e..dfaab1cb2e 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -192,7 +192,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.79.0 +ENV RUSTC_VERSION=1.80.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ diff --git a/Makefile b/Makefile index 942867d81a..de298303e3 100644 --- a/Makefile +++ b/Makefile @@ -69,6 +69,8 @@ CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1 # Set PQ_LIB_DIR to make sure `storage_controller` get linked with bundled libpq (through diesel) CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib +CACHEDIR_TAG_CONTENTS := "Signature: 8a477f597d28d172789f06886806bc55" + # # Top level Makefile to build Neon and PostgreSQL # @@ -79,15 +81,24 @@ all: neon postgres neon-pg-ext # # The 'postgres_ffi' depends on the Postgres headers. 
.PHONY: neon -neon: postgres-headers walproposer-lib +neon: postgres-headers walproposer-lib cargo-target-dir +@echo "Compiling Neon" $(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS) +.PHONY: cargo-target-dir +cargo-target-dir: + # https://github.com/rust-lang/cargo/issues/14281 + mkdir -p target + test -e target/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > target/CACHEDIR.TAG ### PostgreSQL parts # Some rules are duplicated for Postgres v14 and 15. We may want to refactor # to avoid the duplication in the future, but it's tolerable for now. # $(POSTGRES_INSTALL_DIR)/build/%/config.status: + + mkdir -p $(POSTGRES_INSTALL_DIR) + test -e $(POSTGRES_INSTALL_DIR)/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > $(POSTGRES_INSTALL_DIR)/CACHEDIR.TAG + +@echo "Configuring Postgres $* build" @test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \ echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \ diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 4bf1b29785..51e9a51a57 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -21,7 +21,9 @@ use pageserver_api::config::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT, }; -use pageserver_api::controller_api::{PlacementPolicy, TenantCreateRequest}; +use pageserver_api::controller_api::{ + NodeAvailabilityWrapper, PlacementPolicy, TenantCreateRequest, +}; use pageserver_api::models::{ShardParameters, TimelineCreateRequest, TimelineInfo}; use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId}; use postgres_backend::AuthType; @@ -1250,9 +1252,70 @@ async fn handle_start_all( exit(1); } } + + neon_start_status_check(env, retry_timeout).await?; + Ok(()) } +async fn neon_start_status_check( + env: &local_env::LocalEnv, + retry_timeout: &Duration, +) -> anyhow::Result<()> { + const RETRY_INTERVAL: Duration = 
Duration::from_millis(100); + const NOTICE_AFTER_RETRIES: Duration = Duration::from_secs(5); + + if env.control_plane_api.is_none() { + return Ok(()); + } + + let storcon = StorageController::from_env(env); + + let retries = retry_timeout.as_millis() / RETRY_INTERVAL.as_millis(); + let notice_after_retries = retry_timeout.as_millis() / NOTICE_AFTER_RETRIES.as_millis(); + + println!("\nRunning neon status check"); + + for retry in 0..retries { + if retry == notice_after_retries { + println!("\nNeon status check has not passed yet, continuing to wait") + } + + let mut passed = true; + let mut nodes = storcon.node_list().await?; + let mut pageservers = env.pageservers.clone(); + + if nodes.len() != pageservers.len() { + continue; + } + + nodes.sort_by_key(|ps| ps.id); + pageservers.sort_by_key(|ps| ps.id); + + for (idx, pageserver) in pageservers.iter().enumerate() { + let node = &nodes[idx]; + if node.id != pageserver.id { + passed = false; + break; + } + + if !matches!(node.availability, NodeAvailabilityWrapper::Active) { + passed = false; + break; + } + } + + if passed { + println!("\nNeon started and passed status check"); + return Ok(()); + } + + tokio::time::sleep(RETRY_INTERVAL).await; + } + + anyhow::bail!("\nNeon passed status check") +} + async fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { let immediate = sub_match.get_one::("stop-mode").map(|s| s.as_str()) == Some("immediate"); diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 3ac3ce21df..505d157efd 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -151,7 +151,10 @@ pub struct NeonBroker { pub struct NeonStorageControllerConf { /// Heartbeat timeout before marking a node offline #[serde(with = "humantime_serde")] - pub max_unavailable: Duration, + pub max_offline: Duration, + + #[serde(with = "humantime_serde")] + pub max_warming_up: Duration, /// Threshold for auto-splitting a tenant into shards pub 
split_threshold: Option, @@ -159,14 +162,16 @@ pub struct NeonStorageControllerConf { impl NeonStorageControllerConf { // Use a shorter pageserver unavailability interval than the default to speed up tests. - const DEFAULT_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = - std::time::Duration::from_secs(10); + const DEFAULT_MAX_OFFLINE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10); + + const DEFAULT_MAX_WARMING_UP_INTERVAL: std::time::Duration = std::time::Duration::from_secs(30); } impl Default for NeonStorageControllerConf { fn default() -> Self { Self { - max_unavailable: Self::DEFAULT_MAX_UNAVAILABLE_INTERVAL, + max_offline: Self::DEFAULT_MAX_OFFLINE_INTERVAL, + max_warming_up: Self::DEFAULT_MAX_WARMING_UP_INTERVAL, split_threshold: None, } } @@ -509,7 +514,6 @@ impl LocalEnv { #[derive(serde::Serialize, serde::Deserialize)] // (allow unknown fields, unlike PageServerConf) struct PageserverConfigTomlSubset { - id: NodeId, listen_pg_addr: String, listen_http_addr: String, pg_auth_type: AuthType, @@ -521,18 +525,30 @@ impl LocalEnv { .with_context(|| format!("read {:?}", config_toml_path))?, ) .context("parse pageserver.toml")?; + let identity_toml_path = dentry.path().join("identity.toml"); + #[derive(serde::Serialize, serde::Deserialize)] + struct IdentityTomlSubset { + id: NodeId, + } + let identity_toml: IdentityTomlSubset = toml_edit::de::from_str( + &std::fs::read_to_string(&identity_toml_path) + .with_context(|| format!("read {:?}", identity_toml_path))?, + ) + .context("parse identity.toml")?; let PageserverConfigTomlSubset { - id: config_toml_id, listen_pg_addr, listen_http_addr, pg_auth_type, http_auth_type, } = config_toml; + let IdentityTomlSubset { + id: identity_toml_id, + } = identity_toml; let conf = PageServerConf { id: { anyhow::ensure!( - config_toml_id == id, - "id mismatch: config_toml.id={config_toml_id} id={id}", + identity_toml_id == id, + "id mismatch: identity.toml:id={identity_toml_id} pageserver_(.*) id={id}", ); id 
}, diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index ba4f98d945..399b1c2653 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -127,10 +127,13 @@ impl PageServerNode { } // Apply the user-provided overrides - overrides.push( - toml_edit::ser::to_string_pretty(&conf) - .expect("we deserialized this from toml earlier"), - ); + overrides.push({ + let mut doc = + toml_edit::ser::to_document(&conf).expect("we deserialized this from toml earlier"); + // `id` is written out to `identity.toml` instead of `pageserver.toml` + doc.remove("id").expect("it's part of the struct"); + doc.to_string() + }); // Turn `overrides` into a toml document. // TODO: above code is legacy code, it should be refactored to use toml_edit directly. diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index d7aedd711a..e054e9ee57 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -5,8 +5,9 @@ use crate::{ use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::{ controller_api::{ - NodeConfigureRequest, NodeRegisterRequest, TenantCreateRequest, TenantCreateResponse, - TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse, + NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest, + TenantCreateResponse, TenantLocateResponse, TenantShardMigrateRequest, + TenantShardMigrateResponse, }, models::{ TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo, @@ -353,8 +354,10 @@ impl StorageController { "--dev", "--database-url", &database_url, - "--max-unavailable-interval", - &humantime::Duration::from(self.config.max_unavailable).to_string(), + "--max-offline-interval", + &humantime::Duration::from(self.config.max_offline).to_string(), + "--max-warming-up-interval", + &humantime::Duration::from(self.config.max_warming_up).to_string(), ] .into_iter() .map(|s| 
s.to_string()) @@ -625,6 +628,16 @@ impl StorageController { .await } + /// Fetches the node list from the storage controller via `GET control/v1/node`. + pub async fn node_list(&self) -> anyhow::Result<Vec<NodeDescribeResponse>> { + self.dispatch::<(), Vec<NodeDescribeResponse>>( + Method::GET, + "control/v1/node".to_string(), + None, + ) + .await + } + #[instrument(skip(self))] pub async fn ready(&self) -> anyhow::Result<()> { self.dispatch::<(), ()>(Method::GET, "ready".to_string(), None) diff --git a/docs/rfcs/034-timeline-archive.md b/docs/rfcs/035-timeline-archive.md similarity index 100% rename from docs/rfcs/034-timeline-archive.md rename to docs/rfcs/035-timeline-archive.md diff --git a/docs/synthetic-size.md b/docs/synthetic-size.md index 3acb4e18cb..b6b90d90c2 100644 --- a/docs/synthetic-size.md +++ b/docs/synthetic-size.md @@ -21,9 +21,9 @@ implementation where we keep more data than we would need to, do not change the synthetic size or incur any costs to the user. The synthetic size is calculated for the whole project. It is not -straightforward to attribute size to individual branches. See "What is -the size of an individual branch?" for discussion on those -difficulties. +straightforward to attribute size to individual branches. See [What is +the size of an individual branch?](#what-is-the-size-of-an-individual-branch) +for a discussion of those difficulties. The synthetic size is designed to: @@ -40,8 +40,9 @@ The synthetic size is designed to: - logical size is the size of a branch *at a given point in time*. It's the total size of all tables in all databases, as you see with "\l+" in psql for example, plus the Postgres SLRUs and some - small amount of metadata. NOTE that currently, Neon does not include - the SLRUs and metadata in the logical size. See comment to `get_current_logical_size_non_incremental()`. + small amount of metadata. Note that currently, Neon does not include + the SLRUs and metadata in the logical size. Refer to the comment in + [`get_current_logical_size_non_incremental()`](/pageserver/src/pgdatadir_mapping.rs#L813-L814).
- a "point in time" is defined as an LSN value. You can convert a timestamp to an LSN, but the storage internally works with LSNs. diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index d0e1eb6b28..474f796040 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -1,4 +1,5 @@ use std::str::FromStr; +use std::time::Instant; /// Request/response types for the storage controller /// API (`/control/v1` prefix). Implemented by the server @@ -150,11 +151,16 @@ impl UtilizationScore { } } -#[derive(Serialize, Deserialize, Clone, Copy, Debug)] +#[derive(Serialize, Clone, Copy, Debug)] #[serde(into = "NodeAvailabilityWrapper")] pub enum NodeAvailability { // Normal, happy state Active(UtilizationScore), + // Node is warming up, but we expect it to become available soon. Covers + // the time span between the re-attach response being composed on the storage controller + // and the first successful heartbeat after the processing of the re-attach response + // finishes on the pageserver. + WarmingUp(Instant), // Offline: Tenants shouldn't try to attach here, but they may assume that their // secondary locations on this node still exist. Newly added nodes are in this // state until we successfully contact them. @@ -164,7 +170,10 @@ pub enum NodeAvailability { impl PartialEq for NodeAvailability { fn eq(&self, other: &Self) -> bool { use NodeAvailability::*; - matches!((self, other), (Active(_), Active(_)) | (Offline, Offline)) + matches!( + (self, other), + (Active(_), Active(_)) | (Offline, Offline) | (WarmingUp(_), WarmingUp(_)) + ) } } @@ -176,6 +185,7 @@ impl Eq for NodeAvailability {} #[derive(Serialize, Deserialize, Clone, Copy, Debug)] pub enum NodeAvailabilityWrapper { Active, + WarmingUp, Offline, } @@ -185,6 +195,7 @@ impl From for NodeAvailability { // Assume the worst utilisation score to begin with. It will later be updated by // the heartbeats. 
NodeAvailabilityWrapper::Active => NodeAvailability::Active(UtilizationScore::worst()), + NodeAvailabilityWrapper::WarmingUp => NodeAvailability::WarmingUp(Instant::now()), NodeAvailabilityWrapper::Offline => NodeAvailability::Offline, } } @@ -194,6 +205,7 @@ impl From for NodeAvailabilityWrapper { fn from(val: NodeAvailability) -> Self { match val { NodeAvailability::Active(_) => NodeAvailabilityWrapper::Active, + NodeAvailability::WarmingUp(_) => NodeAvailabilityWrapper::WarmingUp, NodeAvailability::Offline => NodeAvailabilityWrapper::Offline, } } diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 231a604b47..591c45d908 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -5,7 +5,6 @@ pub mod utilization; pub use utilization::PageserverUtilization; use std::{ - borrow::Cow, collections::HashMap, io::{BufRead, Read}, num::{NonZeroU64, NonZeroUsize}, @@ -20,7 +19,6 @@ use serde::{Deserialize, Serialize}; use serde_with::serde_as; use utils::{ completion, - history_buffer::HistoryBufferWithDropCounter, id::{NodeId, TenantId, TimelineId}, lsn::Lsn, serde_system_time, @@ -726,58 +724,7 @@ pub struct LayerMapInfo { pub historic_layers: Vec, } -#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy, Serialize, Deserialize, enum_map::Enum)] -#[repr(usize)] -pub enum LayerAccessKind { - GetValueReconstructData, - Iter, - KeyIter, - Dump, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct LayerAccessStatFullDetails { - pub when_millis_since_epoch: u64, - pub task_kind: Cow<'static, str>, - pub access_kind: LayerAccessKind, -} - -/// An event that impacts the layer's residence status. -#[serde_as] -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct LayerResidenceEvent { - /// The time when the event occurred. - /// NB: this timestamp is captured while the residence status changes. - /// So, it might be behind/ahead of the actual residence change by a short amount of time. 
- /// - #[serde(rename = "timestamp_millis_since_epoch")] - #[serde_as(as = "serde_with::TimestampMilliSeconds")] - pub timestamp: SystemTime, - /// The new residence status of the layer. - pub status: LayerResidenceStatus, - /// The reason why we had to record this event. - pub reason: LayerResidenceEventReason, -} - -/// The reason for recording a given [`LayerResidenceEvent`]. -#[derive(Debug, Clone, Copy, Serialize, Deserialize)] -pub enum LayerResidenceEventReason { - /// The layer map is being populated, e.g. during timeline load or attach. - /// This includes [`RemoteLayer`] objects created in [`reconcile_with_remote`]. - /// We need to record such events because there is no persistent storage for the events. - /// - // https://github.com/rust-lang/rust/issues/74481 - /// [`RemoteLayer`]: ../../tenant/storage_layer/struct.RemoteLayer.html - /// [`reconcile_with_remote`]: ../../tenant/struct.Timeline.html#method.reconcile_with_remote - LayerLoad, - /// We just created the layer (e.g., freeze_and_flush or compaction). - /// Such layers are always [`LayerResidenceStatus::Resident`]. - LayerCreate, - /// We on-demand downloaded or evicted the given layer. - ResidenceChange, -} - -/// The residence status of the layer, after the given [`LayerResidenceEvent`]. +/// The residence status of a layer #[derive(Debug, Clone, Copy, Serialize, Deserialize)] pub enum LayerResidenceStatus { /// Residence status for a layer file that exists locally. 
@@ -787,23 +734,16 @@ pub enum LayerResidenceStatus { Evicted, } -impl LayerResidenceEvent { - pub fn new(status: LayerResidenceStatus, reason: LayerResidenceEventReason) -> Self { - Self { - status, - reason, - timestamp: SystemTime::now(), - } - } -} - +#[serde_as] #[derive(Debug, Clone, Serialize, Deserialize)] pub struct LayerAccessStats { - pub access_count_by_access_kind: HashMap, - pub task_kind_access_flag: Vec>, - pub first: Option, - pub accesses_history: HistoryBufferWithDropCounter, - pub residence_events_history: HistoryBufferWithDropCounter, + #[serde_as(as = "serde_with::TimestampMilliSeconds")] + pub access_time: SystemTime, + + #[serde_as(as = "serde_with::TimestampMilliSeconds")] + pub residence_time: SystemTime, + + pub visible: bool, } #[derive(Debug, Clone, Serialize, Deserialize)] diff --git a/libs/postgres_ffi/src/controlfile_utils.rs b/libs/postgres_ffi/src/controlfile_utils.rs index 0918d15001..eaa9450294 100644 --- a/libs/postgres_ffi/src/controlfile_utils.rs +++ b/libs/postgres_ffi/src/controlfile_utils.rs @@ -29,7 +29,7 @@ use anyhow::{bail, Result}; use bytes::{Bytes, BytesMut}; /// Equivalent to sizeof(ControlFileData) in C -const SIZEOF_CONTROLDATA: usize = std::mem::size_of::(); +const SIZEOF_CONTROLDATA: usize = size_of::(); impl ControlFileData { /// Compute the offset of the `crc` field within the `ControlFileData` struct. 
diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index 54b032d138..6ce855c78e 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -31,7 +31,7 @@ pub const SMGR_TRUNCATE_FSM: u32 = 0x0004; // // Assumes 8 byte alignment -const SIZEOF_PAGE_HEADER_DATA: usize = std::mem::size_of::(); +const SIZEOF_PAGE_HEADER_DATA: usize = size_of::(); pub const MAXALIGN_SIZE_OF_PAGE_HEADER_DATA: usize = (SIZEOF_PAGE_HEADER_DATA + 7) & !7; // @@ -191,7 +191,7 @@ pub const XLR_RMGR_INFO_MASK: u8 = 0xF0; pub const XLOG_TBLSPC_CREATE: u8 = 0x00; pub const XLOG_TBLSPC_DROP: u8 = 0x10; -pub const SIZEOF_XLOGRECORD: u32 = std::mem::size_of::() as u32; +pub const SIZEOF_XLOGRECORD: u32 = size_of::() as u32; // // from xlogrecord.h diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index d25b23663b..9fe7e8198b 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -42,9 +42,9 @@ pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001; pub const XLP_REM_LEN_OFFS: usize = 2 + 2 + 4 + 8; pub const XLOG_RECORD_CRC_OFFS: usize = 4 + 4 + 8 + 1 + 1 + 2; -pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = std::mem::size_of::(); -pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = std::mem::size_of::(); -pub const XLOG_SIZE_OF_XLOG_RECORD: usize = std::mem::size_of::(); +pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = size_of::(); +pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = size_of::(); +pub const XLOG_SIZE_OF_XLOG_RECORD: usize = size_of::(); #[allow(clippy::identity_op)] pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2; @@ -311,7 +311,7 @@ impl XLogLongPageHeaderData { } } -pub const SIZEOF_CHECKPOINT: usize = std::mem::size_of::(); +pub const SIZEOF_CHECKPOINT: usize = size_of::(); impl CheckPoint { pub fn encode(&self) -> Result { diff --git a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs 
b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs index 750affc94e..79d45de67a 100644 --- a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs +++ b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs @@ -178,7 +178,7 @@ pub fn test_find_end_of_wal_last_crossing_segment() { /// currently 1024. #[test] pub fn test_update_next_xid() { - let checkpoint_buf = [0u8; std::mem::size_of::()]; + let checkpoint_buf = [0u8; size_of::()]; let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap(); checkpoint.nextXid = FullTransactionId { value: 10 }; @@ -204,7 +204,7 @@ pub fn test_update_next_xid() { #[test] pub fn test_update_next_multixid() { - let checkpoint_buf = [0u8; std::mem::size_of::()]; + let checkpoint_buf = [0u8; size_of::()]; let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap(); // simple case diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 23d82b90bd..414bce1b26 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -7,6 +7,7 @@ license.workspace = true [dependencies] anyhow.workspace = true async-trait.workspace = true +async-stream.workspace = true once_cell.workspace = true aws-smithy-async.workspace = true aws-smithy-types.workspace = true diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 8e590b17c4..6ca4ae43f2 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -15,7 +15,7 @@ use std::time::SystemTime; use super::REMOTE_STORAGE_PREFIX_SEPARATOR; use anyhow::Result; use azure_core::request_options::{MaxResults, Metadata, Range}; -use azure_core::RetryOptions; +use azure_core::{Continuable, RetryOptions}; use azure_identity::DefaultAzureCredential; use azure_storage::StorageCredentials; use azure_storage_blobs::blob::CopyStatus; @@ -33,6 +33,7 @@ use tracing::debug; use utils::backoff; use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind}; +use 
crate::ListingObject; use crate::{ config::AzureConfig, error::Cancelled, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel, @@ -40,6 +41,7 @@ use crate::{ pub struct AzureBlobStorage { client: ContainerClient, + container_name: String, prefix_in_container: Option, max_keys_per_list_response: Option, concurrency_limiter: ConcurrencyLimiter, @@ -85,6 +87,7 @@ impl AzureBlobStorage { Ok(AzureBlobStorage { client, + container_name: azure_config.container_name.to_owned(), prefix_in_container: azure_config.prefix_in_container.to_owned(), max_keys_per_list_response, concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()), @@ -238,6 +241,10 @@ impl AzureBlobStorage { _ = cancel.cancelled() => Err(Cancelled), } } + + pub fn container_name(&self) -> &str { + &self.container_name + } } fn to_azure_metadata(metadata: StorageMetadata) -> Metadata { @@ -261,30 +268,30 @@ fn to_download_error(error: azure_core::Error) -> DownloadError { } impl RemoteStorage for AzureBlobStorage { - async fn list( + fn list_streaming( &self, prefix: Option<&RemotePath>, mode: ListingMode, max_keys: Option, cancel: &CancellationToken, - ) -> anyhow::Result { - let _permit = self.permit(RequestKind::List, cancel).await?; + ) -> impl Stream> { + // get the passed prefix or if it is not set use prefix_in_bucket value + let list_prefix = prefix + .map(|p| self.relative_path_to_name(p)) + .or_else(|| self.prefix_in_container.clone()) + .map(|mut p| { + // required to end with a separator + // otherwise request will return only the entry of a prefix + if matches!(mode, ListingMode::WithDelimiter) + && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) + { + p.push(REMOTE_STORAGE_PREFIX_SEPARATOR); + } + p + }); - let op = async { - // get the passed prefix or if it is not set use prefix_in_bucket value - let list_prefix = prefix - .map(|p| self.relative_path_to_name(p)) - .or_else(|| 
self.prefix_in_container.clone()) - .map(|mut p| { - // required to end with a separator - // otherwise request will return only the entry of a prefix - if matches!(mode, ListingMode::WithDelimiter) - && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) - { - p.push(REMOTE_STORAGE_PREFIX_SEPARATOR); - } - p - }); + async_stream::stream! { + let _permit = self.permit(RequestKind::List, cancel).await?; let mut builder = self.client.list_blobs(); @@ -300,21 +307,43 @@ impl RemoteStorage for AzureBlobStorage { builder = builder.max_results(MaxResults::new(limit)); } - let response = builder.into_stream(); - let response = response.into_stream().map_err(to_download_error); - let response = tokio_stream::StreamExt::timeout(response, self.timeout); - let response = response.map(|res| match res { - Ok(res) => res, - Err(_elapsed) => Err(DownloadError::Timeout), - }); + let mut next_marker = None; - let mut response = std::pin::pin!(response); + 'outer: loop { + let mut builder = builder.clone(); + if let Some(marker) = next_marker.clone() { + builder = builder.marker(marker); + } + let response = builder.into_stream(); + let response = response.into_stream().map_err(to_download_error); + let response = tokio_stream::StreamExt::timeout(response, self.timeout); + let response = response.map(|res| match res { + Ok(res) => res, + Err(_elapsed) => Err(DownloadError::Timeout), + }); - let mut res = Listing::default(); + let mut response = std::pin::pin!(response); - let mut max_keys = max_keys.map(|mk| mk.get()); - while let Some(entry) = response.next().await { - let entry = entry?; + let mut max_keys = max_keys.map(|mk| mk.get()); + let next_item = tokio::select! { + op = response.next() => Ok(op), + _ = cancel.cancelled() => Err(DownloadError::Cancelled), + }?; + let Some(entry) = next_item else { + // The list is complete, so yield it. 
+ break; + }; + + let mut res = Listing::default(); + let entry = match entry { + Ok(entry) => entry, + Err(e) => { + // The error is potentially retryable, so we must rewind the loop after yielding. + yield Err(e); + continue; + } + }; + next_marker = entry.continuation(); let prefix_iter = entry .blobs .prefixes() @@ -324,7 +353,11 @@ impl RemoteStorage for AzureBlobStorage { let blob_iter = entry .blobs .blobs() - .map(|k| self.name_to_relative_path(&k.name)); + .map(|k| ListingObject{ + key: self.name_to_relative_path(&k.name), + last_modified: k.properties.last_modified.into() + } + ); for key in blob_iter { res.keys.push(key); @@ -333,19 +366,19 @@ impl RemoteStorage for AzureBlobStorage { assert!(mk > 0); mk -= 1; if mk == 0 { - return Ok(res); // limit reached + yield Ok(res); // limit reached + break 'outer; } max_keys = Some(mk); } } + yield Ok(res); + + // We are done here + if next_marker.is_none() { + break; + } } - - Ok(res) - }; - - tokio::select! { - res = op => res, - _ = cancel.cancelled() => Err(DownloadError::Cancelled), } } diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 3381c4296f..75aa28233b 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -26,7 +26,7 @@ use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use bytes::Bytes; -use futures::stream::Stream; +use futures::{stream::Stream, StreamExt}; use serde::{Deserialize, Serialize}; use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; @@ -149,10 +149,16 @@ pub enum ListingMode { NoDelimiter, } +#[derive(PartialEq, Eq, Debug)] +pub struct ListingObject { + pub key: RemotePath, + pub last_modified: SystemTime, +} + #[derive(Default)] pub struct Listing { pub prefixes: Vec, - pub keys: Vec, + pub keys: Vec, } /// Storage (potentially remote) API to manage its state. @@ -160,13 +166,18 @@ pub struct Listing { /// providing basic CRUD operations for storage files. 
#[allow(async_fn_in_trait)] pub trait RemoteStorage: Send + Sync + 'static { - /// List objects in remote storage, with semantics matching AWS S3's ListObjectsV2. - /// (see ``) + /// List objects in remote storage, with semantics matching AWS S3's [`ListObjectsV2`]. + /// + /// The stream is guaranteed to return at least one element, even in the case of errors + /// (in that case it's an `Err()`), or an empty `Listing`. + /// + /// The stream is not ending if it returns an error, as long as [`is_permanent`] returns false on the error. + /// The `next` function can be retried, and maybe in a future retry, there will be success. /// /// Note that the prefix is relative to any `prefix_in_bucket` configured for the client, not /// from the absolute root of the bucket. /// - /// `mode` configures whether to use a delimiter. Without a delimiter all keys + /// `mode` configures whether to use a delimiter. Without a delimiter, all keys /// within the prefix are listed in the `keys` of the result. With a delimiter, any "directories" at the top level of /// the prefix are returned in the `prefixes` of the result, and keys in the top level of the prefix are /// returned in `keys` (). @@ -175,13 +186,32 @@ pub trait RemoteStorage: Send + Sync + 'static { /// will iteratively call listobjects until it runs out of keys. Note that this is not safe to use on /// unlimted size buckets, as the full list of objects is allocated into a monolithic data structure. 
/// + /// [`ListObjectsV2`]: + /// [`is_permanent`]: DownloadError::is_permanent + fn list_streaming( + &self, + prefix: Option<&RemotePath>, + mode: ListingMode, + max_keys: Option, + cancel: &CancellationToken, + ) -> impl Stream>; + async fn list( &self, prefix: Option<&RemotePath>, - _mode: ListingMode, + mode: ListingMode, max_keys: Option, cancel: &CancellationToken, - ) -> Result; + ) -> Result { + let mut stream = std::pin::pin!(self.list_streaming(prefix, mode, max_keys, cancel)); + let mut combined = stream.next().await.expect("At least one item required")?; + while let Some(list) = stream.next().await { + let list = list?; + combined.keys.extend(list.keys.into_iter()); + combined.prefixes.extend_from_slice(&list.prefixes); + } + Ok(combined) + } /// Streams the local file contents into remote into the remote storage entry. /// @@ -288,8 +318,8 @@ impl Debug for Download { /// Every storage, currently supported. /// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics. -#[derive(Clone)] // Require Clone for `Other` due to https://github.com/rust-lang/rust/issues/26925 +#[derive(Clone)] pub enum GenericRemoteStorage> { LocalFs(LocalFs), AwsS3(Arc), @@ -298,13 +328,14 @@ pub enum GenericRemoteStorage> { } impl GenericRemoteStorage> { + // See [`RemoteStorage::list`]. pub async fn list( &self, prefix: Option<&RemotePath>, mode: ListingMode, max_keys: Option, cancel: &CancellationToken, - ) -> anyhow::Result { + ) -> Result { match self { Self::LocalFs(s) => s.list(prefix, mode, max_keys, cancel).await, Self::AwsS3(s) => s.list(prefix, mode, max_keys, cancel).await, @@ -313,6 +344,23 @@ impl GenericRemoteStorage> { } } + // See [`RemoteStorage::list_streaming`]. 
+ pub fn list_streaming<'a>( + &'a self, + prefix: Option<&'a RemotePath>, + mode: ListingMode, + max_keys: Option, + cancel: &'a CancellationToken, + ) -> impl Stream> + 'a { + match self { + Self::LocalFs(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)) + as Pin>>>, + Self::AwsS3(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)), + Self::AzureBlob(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)), + Self::Unreliable(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)), + } + } + /// See [`RemoteStorage::upload`] pub async fn upload( &self, @@ -504,6 +552,16 @@ impl GenericRemoteStorage { None => self.download(from, cancel).await, } } + + /// The name of the bucket/container/etc. + pub fn bucket_name(&self) -> Option<&str> { + match self { + Self::LocalFs(_s) => None, + Self::AwsS3(s) => Some(s.bucket_name()), + Self::AzureBlob(s) => Some(s.container_name()), + Self::Unreliable(_s) => None, + } + } } /// Extra set of key-value pairs that contain arbitrary metadata about the storage entry. 
diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 1f7bcfc982..bc6b10aa51 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -23,8 +23,8 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken}; use utils::crashsafe::path_with_suffix_extension; use crate::{ - Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel, - REMOTE_STORAGE_PREFIX_SEPARATOR, + Download, DownloadError, Listing, ListingMode, ListingObject, RemotePath, TimeTravelError, + TimeoutOrCancel, REMOTE_STORAGE_PREFIX_SEPARATOR, }; use super::{RemoteStorage, StorageMetadata}; @@ -331,6 +331,17 @@ impl LocalFs { } impl RemoteStorage for LocalFs { + fn list_streaming( + &self, + prefix: Option<&RemotePath>, + mode: ListingMode, + max_keys: Option, + cancel: &CancellationToken, + ) -> impl Stream> { + let listing = self.list(prefix, mode, max_keys, cancel); + futures::stream::once(listing) + } + async fn list( &self, prefix: Option<&RemotePath>, @@ -346,19 +357,28 @@ impl RemoteStorage for LocalFs { .list_recursive(prefix) .await .map_err(DownloadError::Other)?; - let keys = keys + let objects = keys .into_iter() - .filter(|k| { + .filter_map(|k| { let path = k.with_base(&self.storage_root); - !path.is_dir() + if path.is_dir() { + None + } else { + Some(ListingObject { + key: k.clone(), + // LocalFs is just for testing, so just specify a dummy time + last_modified: SystemTime::now(), + }) + } }) .collect(); if let ListingMode::NoDelimiter = mode { - result.keys = keys; + result.keys = objects; } else { let mut prefixes = HashSet::new(); - for key in keys { + for object in objects { + let key = object.key; // If the part after the prefix includes a "/", take only the first part and put it in `prefixes`. 
let relative_key = if let Some(prefix) = prefix { let mut prefix = prefix.clone(); @@ -387,9 +407,11 @@ impl RemoteStorage for LocalFs { .to_owned(); prefixes.insert(first_part); } else { - result - .keys - .push(RemotePath::from_string(&relative_key).unwrap()); + result.keys.push(ListingObject { + key: RemotePath::from_string(&relative_key).unwrap(), + // LocalFs is just for testing + last_modified: SystemTime::now(), + }); } } result.prefixes = prefixes @@ -939,7 +961,11 @@ mod fs_tests { .await?; assert!(listing.prefixes.is_empty()); assert_eq!( - listing.keys.into_iter().collect::>(), + listing + .keys + .into_iter() + .map(|o| o.key) + .collect::>(), HashSet::from([uncle.clone(), child.clone(), child_sibling.clone()]) ); @@ -964,7 +990,7 @@ mod fs_tests { ) .await?; assert_eq!( - listing.keys, + listing.keys.into_iter().map(|o| o.key).collect::>(), [RemotePath::from_string("uncle").unwrap()].to_vec() ); assert_eq!( @@ -981,7 +1007,7 @@ mod fs_tests { &cancel, ) .await?; - assert_eq!(listing.keys, [].to_vec()); + assert_eq!(listing.keys, vec![]); assert_eq!( listing.prefixes, [RemotePath::from_string("grandparent").unwrap()].to_vec() @@ -996,7 +1022,7 @@ mod fs_tests { &cancel, ) .await?; - assert_eq!(listing.keys, [].to_vec()); + assert_eq!(listing.keys, vec![]); assert_eq!( listing.prefixes, [RemotePath::from_string("grandparent").unwrap()].to_vec() @@ -1029,7 +1055,7 @@ mod fs_tests { &cancel, ) .await?; - assert_eq!(listing.keys, [].to_vec()); + assert_eq!(listing.keys, vec![]); let mut found_prefixes = listing.prefixes.clone(); found_prefixes.sort(); diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index b65d8b7e9e..412f307445 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -44,8 +44,9 @@ use crate::{ error::Cancelled, metrics::{start_counting_cancelled_wait, start_measuring_requests}, support::PermitCarrying, - ConcurrencyLimiter, Download, DownloadError, Listing, 
ListingMode, RemotePath, RemoteStorage, - TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR, + ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, ListingObject, RemotePath, + RemoteStorage, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, + REMOTE_STORAGE_PREFIX_SEPARATOR, }; use crate::metrics::AttemptOutcome; @@ -386,6 +387,10 @@ impl S3Bucket { } Ok(()) } + + pub fn bucket_name(&self) -> &str { + &self.bucket_name + } } pin_project_lite::pin_project! { @@ -463,17 +468,16 @@ impl>> Stream for TimedDownload { } impl RemoteStorage for S3Bucket { - async fn list( + fn list_streaming( &self, prefix: Option<&RemotePath>, mode: ListingMode, max_keys: Option, cancel: &CancellationToken, - ) -> Result { + ) -> impl Stream> { let kind = RequestKind::List; // s3 sdk wants i32 let mut max_keys = max_keys.map(|mk| mk.get() as i32); - let mut result = Listing::default(); // get the passed prefix or if it is not set use prefix_in_bucket value let list_prefix = prefix @@ -485,89 +489,116 @@ impl RemoteStorage for S3Bucket { }) }); - let _permit = self.permit(kind, cancel).await?; + async_stream::stream! { + let _permit = self.permit(kind, cancel).await?; - let mut continuation_token = None; + let mut continuation_token = None; + 'outer: loop { + let started_at = start_measuring_requests(kind); - loop { - let started_at = start_measuring_requests(kind); + // min of two Options, returning Some if one is value and another is + // None (None is smaller than anything, so plain min doesn't work). 
+ let request_max_keys = self + .max_keys_per_list_response + .into_iter() + .chain(max_keys.into_iter()) + .min(); + let mut request = self + .client + .list_objects_v2() + .bucket(self.bucket_name.clone()) + .set_prefix(list_prefix.clone()) + .set_continuation_token(continuation_token.clone()) + .set_max_keys(request_max_keys); - // min of two Options, returning Some if one is value and another is - // None (None is smaller than anything, so plain min doesn't work). - let request_max_keys = self - .max_keys_per_list_response - .into_iter() - .chain(max_keys.into_iter()) - .min(); - let mut request = self - .client - .list_objects_v2() - .bucket(self.bucket_name.clone()) - .set_prefix(list_prefix.clone()) - .set_continuation_token(continuation_token) - .set_max_keys(request_max_keys); - - if let ListingMode::WithDelimiter = mode { - request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()); - } - - let request = request.send(); - - let response = tokio::select! { - res = request => res, - _ = tokio::time::sleep(self.timeout) => return Err(DownloadError::Timeout), - _ = cancel.cancelled() => return Err(DownloadError::Cancelled), - }; - - let response = response - .context("Failed to list S3 prefixes") - .map_err(DownloadError::Other); - - let started_at = ScopeGuard::into_inner(started_at); - - crate::metrics::BUCKET_METRICS - .req_seconds - .observe_elapsed(kind, &response, started_at); - - let response = response?; - - let keys = response.contents(); - let empty = Vec::new(); - let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty); - - tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len()); - - for object in keys { - let object_path = object.key().expect("response does not contain a key"); - let remote_path = self.s3_object_to_relative_path(object_path); - result.keys.push(remote_path); - if let Some(mut mk) = max_keys { - assert!(mk > 0); - mk -= 1; - if mk == 0 { - return Ok(result); // limit reached - } - max_keys 
= Some(mk); + if let ListingMode::WithDelimiter = mode { + request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()); } + + let request = request.send(); + + let response = tokio::select! { + res = request => Ok(res), + _ = tokio::time::sleep(self.timeout) => Err(DownloadError::Timeout), + _ = cancel.cancelled() => Err(DownloadError::Cancelled), + }?; + + let response = response + .context("Failed to list S3 prefixes") + .map_err(DownloadError::Other); + + let started_at = ScopeGuard::into_inner(started_at); + + crate::metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, &response, started_at); + + let response = match response { + Ok(response) => response, + Err(e) => { + // The error is potentially retryable, so we must rewind the loop after yielding. + yield Err(e); + continue 'outer; + }, + }; + + let keys = response.contents(); + let prefixes = response.common_prefixes.as_deref().unwrap_or_default(); + + tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len()); + let mut result = Listing::default(); + + for object in keys { + let key = object.key().expect("response does not contain a key"); + let key = self.s3_object_to_relative_path(key); + + let last_modified = match object.last_modified.map(SystemTime::try_from) { + Some(Ok(t)) => t, + Some(Err(_)) => { + tracing::warn!("Remote storage last_modified {:?} for {} is out of bounds", + object.last_modified, key + ); + SystemTime::now() + }, + None => { + SystemTime::now() + } + }; + + result.keys.push(ListingObject{ + key, + last_modified + }); + if let Some(mut mk) = max_keys { + assert!(mk > 0); + mk -= 1; + if mk == 0 { + // limit reached + yield Ok(result); + break 'outer; + } + max_keys = Some(mk); + } + } + + // S3 gives us prefixes like "foo/", we return them like "foo" + result.prefixes.extend(prefixes.iter().filter_map(|o| { + Some( + self.s3_object_to_relative_path( + o.prefix()? 
+ .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR), + ), + ) + })); + + yield Ok(result); + + continuation_token = match response.next_continuation_token { + Some(new_token) => Some(new_token), + None => break, + }; } - - // S3 gives us prefixes like "foo/", we return them like "foo" - result.prefixes.extend(prefixes.iter().filter_map(|o| { - Some( - self.s3_object_to_relative_path( - o.prefix()? - .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR), - ), - ) - })); - - continuation_token = match response.next_continuation_token { - Some(new_token) => Some(new_token), - None => break, - }; } - - Ok(result) } async fn upload( diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index c467a2d196..67e5be2955 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -3,6 +3,7 @@ //! testing purposes. use bytes::Bytes; use futures::stream::Stream; +use futures::StreamExt; use std::collections::HashMap; use std::num::NonZeroU32; use std::sync::Mutex; @@ -107,6 +108,23 @@ impl UnreliableWrapper { type VoidStorage = crate::LocalFs; impl RemoteStorage for UnreliableWrapper { + fn list_streaming( + &self, + prefix: Option<&RemotePath>, + mode: ListingMode, + max_keys: Option, + cancel: &CancellationToken, + ) -> impl Stream> { + async_stream::stream! 
{ + self.attempt(RemoteOp::ListPrefixes(prefix.cloned())) + .map_err(DownloadError::Other)?; + let mut stream = self.inner + .list_streaming(prefix, mode, max_keys, cancel); + while let Some(item) = stream.next().await { + yield item; + } + } + } async fn list( &self, prefix: Option<&RemotePath>, diff --git a/libs/remote_storage/tests/common/mod.rs b/libs/remote_storage/tests/common/mod.rs index da9dc08d8d..daab05d91a 100644 --- a/libs/remote_storage/tests/common/mod.rs +++ b/libs/remote_storage/tests/common/mod.rs @@ -152,7 +152,7 @@ pub(crate) async fn upload_remote_data( let mut upload_tasks = JoinSet::new(); let cancel = CancellationToken::new(); - for i in 1..upload_tasks_count + 1 { + for i in 1..=upload_tasks_count { let task_client = Arc::clone(client); let cancel = cancel.clone(); diff --git a/libs/remote_storage/tests/common/tests.rs b/libs/remote_storage/tests/common/tests.rs index 673151c8ef..86c55872c1 100644 --- a/libs/remote_storage/tests/common/tests.rs +++ b/libs/remote_storage/tests/common/tests.rs @@ -1,5 +1,6 @@ use anyhow::Context; use camino::Utf8Path; +use futures::StreamExt; use remote_storage::ListingMode; use remote_storage::RemotePath; use std::sync::Arc; @@ -29,10 +30,10 @@ use super::{ /// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only /// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}` /// -/// With the real S3 enabled and `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys. -/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3, -/// since current default AWS S3 pagination limit is 1000. 
-/// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax) +/// In the `MaybeEnabledStorageWithTestBlobs::setup`, we set the `max_keys_in_list_response` param to limit the keys in a single response. +/// This way, we are able to test the pagination, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3, +/// as the current default AWS S3 pagination limit is 1000. +/// (see ). /// /// Lastly, the test attempts to clean up and remove all uploaded S3 files. /// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished. @@ -87,6 +88,41 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}", ); + // list_streaming + + let prefix_with_slash = base_prefix.add_trailing_slash(); + let mut nested_remote_prefixes_st = test_client.list_streaming( + Some(&prefix_with_slash), + ListingMode::WithDelimiter, + None, + &cancel, + ); + let mut nested_remote_prefixes_combined = HashSet::new(); + let mut segments = 0; + let mut segment_max_size = 0; + while let Some(st) = nested_remote_prefixes_st.next().await { + let st = st?; + segment_max_size = segment_max_size.max(st.prefixes.len()); + nested_remote_prefixes_combined.extend(st.prefixes.into_iter()); + segments += 1; + } + assert!(segments > 1, "less than 2 segments: {segments}"); + assert!( + segment_max_size * 2 <= nested_remote_prefixes_combined.len(), + "double of segment_max_size={segment_max_size} larger number of remote prefixes of {}", + nested_remote_prefixes_combined.len() + ); + let remote_only_prefixes = nested_remote_prefixes_combined + .difference(&expected_remote_prefixes) + .collect::>(); + let missing_uploaded_prefixes = expected_remote_prefixes + 
.difference(&nested_remote_prefixes_combined) + .collect::>(); + assert_eq!( + remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0, + "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}", + ); + Ok(()) } @@ -120,6 +156,7 @@ async fn list_no_delimiter_works( .context("client list root files failure")? .keys .into_iter() + .map(|o| o.key) .collect::>(); assert_eq!( root_files, @@ -146,6 +183,7 @@ async fn list_no_delimiter_works( .context("client list nested files failure")? .keys .into_iter() + .map(|o| o.key) .collect::>(); let trim_remote_blobs: HashSet<_> = ctx .remote_blobs diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index 342bc6da0b..b893beeebd 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -81,6 +81,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: .context("list root files failure")? .keys .into_iter() + .map(|o| o.key) .collect::>(), ) } diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 261ca2cc1a..ec05f849cf 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -20,7 +20,6 @@ bincode.workspace = true bytes.workspace = true camino.workspace = true chrono.workspace = true -heapless.workspace = true hex = { workspace = true, features = ["serde"] } humantime.workspace = true hyper = { workspace = true, features = ["full"] } diff --git a/libs/utils/src/history_buffer.rs b/libs/utils/src/history_buffer.rs deleted file mode 100644 index bd35e2bad6..0000000000 --- a/libs/utils/src/history_buffer.rs +++ /dev/null @@ -1,196 +0,0 @@ -//! A heapless buffer for events of sorts. 
- -use std::ops; - -use heapless::HistoryBuffer; - -#[derive(Debug, Clone)] -pub struct HistoryBufferWithDropCounter { - buffer: HistoryBuffer, - drop_count: u64, -} - -impl HistoryBufferWithDropCounter { - pub fn write(&mut self, data: T) { - let len_before = self.buffer.len(); - self.buffer.write(data); - let len_after = self.buffer.len(); - self.drop_count += u64::from(len_before == len_after); - } - pub fn drop_count(&self) -> u64 { - self.drop_count - } - pub fn map U>(&self, f: F) -> HistoryBufferWithDropCounter { - let mut buffer = HistoryBuffer::new(); - buffer.extend(self.buffer.oldest_ordered().map(f)); - HistoryBufferWithDropCounter:: { - buffer, - drop_count: self.drop_count, - } - } -} - -impl Default for HistoryBufferWithDropCounter { - fn default() -> Self { - Self { - buffer: HistoryBuffer::default(), - drop_count: 0, - } - } -} - -impl ops::Deref for HistoryBufferWithDropCounter { - type Target = HistoryBuffer; - - fn deref(&self) -> &Self::Target { - &self.buffer - } -} - -#[derive(serde::Serialize, serde::Deserialize)] -struct SerdeRepr { - buffer: Vec, - buffer_size: usize, - drop_count: u64, -} - -impl<'a, T, const L: usize> From<&'a HistoryBufferWithDropCounter> for SerdeRepr -where - T: Clone + serde::Serialize, -{ - fn from(value: &'a HistoryBufferWithDropCounter) -> Self { - let HistoryBufferWithDropCounter { buffer, drop_count } = value; - SerdeRepr { - buffer: buffer.iter().cloned().collect(), - buffer_size: L, - drop_count: *drop_count, - } - } -} - -impl serde::Serialize for HistoryBufferWithDropCounter -where - T: Clone + serde::Serialize, -{ - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - SerdeRepr::from(self).serialize(serializer) - } -} - -impl<'de, T, const L: usize> serde::de::Deserialize<'de> for HistoryBufferWithDropCounter -where - T: Clone + serde::Deserialize<'de>, -{ - fn deserialize(deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - let SerdeRepr { - buffer: 
des_buffer, - drop_count, - buffer_size, - } = SerdeRepr::::deserialize(deserializer)?; - if buffer_size != L { - use serde::de::Error; - return Err(D::Error::custom(format!( - "invalid buffer_size, expecting {L} got {buffer_size}" - ))); - } - let mut buffer = HistoryBuffer::new(); - buffer.extend(des_buffer); - Ok(HistoryBufferWithDropCounter { buffer, drop_count }) - } -} - -#[cfg(test)] -mod test { - use super::HistoryBufferWithDropCounter; - - #[test] - fn test_basics() { - let mut b = HistoryBufferWithDropCounter::::default(); - b.write(1); - b.write(2); - b.write(3); - assert!(b.iter().any(|e| *e == 2)); - assert!(b.iter().any(|e| *e == 3)); - assert!(!b.iter().any(|e| *e == 1)); - - // round-trip serde - let round_tripped: HistoryBufferWithDropCounter = - serde_json::from_str(&serde_json::to_string(&b).unwrap()).unwrap(); - assert_eq!( - round_tripped.iter().cloned().collect::>(), - b.iter().cloned().collect::>() - ); - } - - #[test] - fn test_drop_count_works() { - let mut b = HistoryBufferWithDropCounter::<_, 2>::default(); - b.write(1); - assert_eq!(b.drop_count(), 0); - b.write(2); - assert_eq!(b.drop_count(), 0); - b.write(3); - assert_eq!(b.drop_count(), 1); - b.write(4); - assert_eq!(b.drop_count(), 2); - } - - #[test] - fn test_clone_works() { - let mut b = HistoryBufferWithDropCounter::<_, 2>::default(); - b.write(1); - b.write(2); - b.write(3); - assert_eq!(b.drop_count(), 1); - let mut c = b.clone(); - assert_eq!(c.drop_count(), 1); - assert!(c.iter().any(|e| *e == 2)); - assert!(c.iter().any(|e| *e == 3)); - assert!(!c.iter().any(|e| *e == 1)); - - c.write(4); - assert!(c.iter().any(|e| *e == 4)); - assert!(!b.iter().any(|e| *e == 4)); - } - - #[test] - fn test_map() { - let mut b = HistoryBufferWithDropCounter::<_, 2>::default(); - - b.write(1); - assert_eq!(b.drop_count(), 0); - { - let c = b.map(|i| i + 10); - assert_eq!(c.oldest_ordered().cloned().collect::>(), vec![11]); - assert_eq!(c.drop_count(), 0); - } - - b.write(2); - 
assert_eq!(b.drop_count(), 0); - { - let c = b.map(|i| i + 10); - assert_eq!( - c.oldest_ordered().cloned().collect::>(), - vec![11, 12] - ); - assert_eq!(c.drop_count(), 0); - } - - b.write(3); - assert_eq!(b.drop_count(), 1); - { - let c = b.map(|i| i + 10); - assert_eq!( - c.oldest_ordered().cloned().collect::>(), - vec![12, 13] - ); - assert_eq!(c.drop_count(), 1); - } - } -} diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 9ad1752fb7..a46d68ef33 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -59,8 +59,6 @@ pub mod signals; pub mod fs_ext; -pub mod history_buffer; - pub mod measured_stream; pub mod serde_percent; diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs index 4f9ac6bdb4..f6b430657e 100644 --- a/libs/utils/src/shard.rs +++ b/libs/utils/src/shard.rs @@ -49,6 +49,7 @@ pub struct TenantShardId { impl ShardCount { pub const MAX: Self = Self(u8::MAX); + pub const MIN: Self = Self(0); /// The internal value of a ShardCount may be zero, which means "1 shard, but use /// legacy format for TenantShardId that excludes the shard suffix", also known diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 28846ffbae..7a96c86ded 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -19,9 +19,13 @@ use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_evicti use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; use pageserver::task_mgr::WALRECEIVER_RUNTIME; use pageserver::tenant::{secondary, TenantSharedResources}; +use pageserver::{ + CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, LibpqEndpointListener, +}; use remote_storage::GenericRemoteStorage; use tokio::signal::unix::SignalKind; use tokio::time::Instant; +use tokio_util::sync::CancellationToken; use tracing::*; use metrics::set_build_info_metric; @@ -286,6 +290,7 @@ fn start_pageserver( // Create and lock PID file. 
This ensures that there cannot be more than one // pageserver process running at the same time. let lock_file_path = conf.workdir.join(PID_FILE_NAME); + info!("Claiming pid file at {lock_file_path:?}..."); let lock_file = utils::pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?; info!("Claimed pid file at {lock_file_path:?}"); @@ -411,8 +416,10 @@ fn start_pageserver( // Scan the local 'tenants/' directory and start loading the tenants let deletion_queue_client = deletion_queue.new_client(); + let background_purges = mgr::BackgroundPurges::default(); let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr( conf, + background_purges.clone(), TenantSharedResources { broker_client: broker_client.clone(), remote_storage: remote_storage.clone(), @@ -504,7 +511,7 @@ fn start_pageserver( } }); - let secondary_controller = secondary::spawn_tasks( + let (secondary_controller, secondary_controller_tasks) = secondary::spawn_tasks( tenant_manager.clone(), remote_storage.clone(), background_jobs_barrier.clone(), @@ -517,18 +524,19 @@ fn start_pageserver( // been configured. let disk_usage_eviction_state: Arc = Arc::default(); - launch_disk_usage_global_eviction_task( + let disk_usage_eviction_task = launch_disk_usage_global_eviction_task( conf, remote_storage.clone(), disk_usage_eviction_state.clone(), tenant_manager.clone(), background_jobs_barrier.clone(), - )?; + ); // Start up the service to handle HTTP mgmt API request. We created the // listener earlier already. - { - let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); + let http_endpoint_listener = { + let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); // for hyper + let cancel = CancellationToken::new(); let router_state = Arc::new( http::routes::State::new( @@ -549,77 +557,44 @@ fn start_pageserver( let service = utils::http::RouterService::new(router).unwrap(); let server = hyper::Server::from_tcp(http_listener)? 
.serve(service) - .with_graceful_shutdown(task_mgr::shutdown_watcher()); + .with_graceful_shutdown({ + let cancel = cancel.clone(); + async move { cancel.clone().cancelled().await } + }); - task_mgr::spawn( - MGMT_REQUEST_RUNTIME.handle(), - TaskKind::HttpEndpointListener, - None, - None, + let task = MGMT_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( "http endpoint listener", - true, - async { - server.await?; - Ok(()) - }, - ); - } + server, + )); + HttpEndpointListener(CancellableTask { task, cancel }) + }; - if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint { - let metrics_ctx = RequestContext::todo_child( - TaskKind::MetricsCollection, - // This task itself shouldn't download anything. - // The actual size calculation does need downloads, and - // creates a child context with the right DownloadBehavior. - DownloadBehavior::Error, - ); + let consumption_metrics_tasks = { + let cancel = shutdown_pageserver.child_token(); + let task = crate::BACKGROUND_RUNTIME.spawn({ + let tenant_manager = tenant_manager.clone(); + let cancel = cancel.clone(); + async move { + // first wait until background jobs are cleared to launch. + // + // this is because we only process active tenants and timelines, and the + // Timeline::get_current_logical_size will spawn the logical size calculation, + // which will not be rate-limited. + tokio::select! { + _ = cancel.cancelled() => { return; }, + _ = background_jobs_barrier.wait() => {} + }; - let local_disk_storage = conf.workdir.join("last_consumption_metrics.json"); - - task_mgr::spawn( - crate::BACKGROUND_RUNTIME.handle(), - TaskKind::MetricsCollection, - None, - None, - "consumption metrics collection", - true, - { - let tenant_manager = tenant_manager.clone(); - async move { - // first wait until background jobs are cleared to launch. 
- // - // this is because we only process active tenants and timelines, and the - // Timeline::get_current_logical_size will spawn the logical size calculation, - // which will not be rate-limited. - let cancel = task_mgr::shutdown_token(); - - tokio::select! { - _ = cancel.cancelled() => { return Ok(()); }, - _ = background_jobs_barrier.wait() => {} - }; - - pageserver::consumption_metrics::collect_metrics( - tenant_manager, - metric_collection_endpoint, - &conf.metric_collection_bucket, - conf.metric_collection_interval, - conf.synthetic_size_calculation_interval, - conf.id, - local_disk_storage, - cancel, - metrics_ctx, - ) - .instrument(info_span!("metrics_collection")) - .await?; - Ok(()) - } - }, - ); - } + pageserver::consumption_metrics::run(conf, tenant_manager, cancel).await; + } + }); + ConsumptionMetricsTasks(CancellableTask { task, cancel }) + }; // Spawn a task to listen for libpq connections. It will spawn further tasks // for each connection. We created the listener earlier already. - { + let libpq_listener = { + let cancel = CancellationToken::new(); let libpq_ctx = RequestContext::todo_child( TaskKind::LibpqEndpointListener, // listener task shouldn't need to download anything. (We will @@ -628,29 +603,20 @@ fn start_pageserver( // accept connections.) 
DownloadBehavior::Error, ); - task_mgr::spawn( - COMPUTE_REQUEST_RUNTIME.handle(), - TaskKind::LibpqEndpointListener, - None, - None, - "libpq endpoint listener", - true, - { - let tenant_manager = tenant_manager.clone(); - async move { - page_service::libpq_listener_main( - tenant_manager, - pg_auth, - pageserver_listener, - conf.pg_auth_type, - libpq_ctx, - task_mgr::shutdown_token(), - ) - .await - } - }, - ); - } + + let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( + "libpq listener", + page_service::libpq_listener_main( + tenant_manager.clone(), + pg_auth, + pageserver_listener, + conf.pg_auth_type, + libpq_ctx, + cancel.clone(), + ), + )); + LibpqEndpointListener(CancellableTask { task, cancel }) + }; let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard()); @@ -676,7 +642,18 @@ fn start_pageserver( // Right now that tree doesn't reach very far, and `task_mgr` is used instead. // The plan is to change that over time. shutdown_pageserver.take(); - pageserver::shutdown_pageserver(&tenant_manager, deletion_queue.clone(), 0).await; + pageserver::shutdown_pageserver( + http_endpoint_listener, + libpq_listener, + consumption_metrics_tasks, + disk_usage_eviction_task, + &tenant_manager, + background_purges, + deletion_queue.clone(), + secondary_controller_tasks, + 0, + ) + .await; unreachable!() }) } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index b4359b926d..f71881683d 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -52,7 +52,7 @@ pub mod defaults { use pageserver_api::models::ImageCompressionAlgorithm; pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT; - pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s"; + pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s"; pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; pub const DEFAULT_SUPERUSER: &str = "cloud_admin"; @@ -83,16 +83,16 @@ pub mod defaults { #[cfg(not(target_os = "linux"))] pub const 
DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs"; - pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential"; + pub const DEFAULT_GET_VECTORED_IMPL: &str = "vectored"; - pub const DEFAULT_GET_IMPL: &str = "legacy"; + pub const DEFAULT_GET_IMPL: &str = "vectored"; pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm = ImageCompressionAlgorithm::Disabled; - pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true; + pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false; pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; @@ -356,8 +356,6 @@ struct PageServerConfigBuilder { auth_validation_public_key_path: BuilderValue>, remote_storage_config: BuilderValue>, - id: BuilderValue, - broker_endpoint: BuilderValue, broker_keepalive_interval: BuilderValue, @@ -406,11 +404,8 @@ struct PageServerConfigBuilder { } impl PageServerConfigBuilder { - fn new(node_id: NodeId) -> Self { - let mut this = Self::default(); - this.id(node_id); - - this + fn new() -> Self { + Self::default() } #[inline(always)] @@ -438,7 +433,6 @@ impl PageServerConfigBuilder { pg_auth_type: Set(AuthType::Trust), auth_validation_public_key_path: Set(None), remote_storage_config: Set(None), - id: NotSet, broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT .parse() .expect("failed to parse default broker endpoint")), @@ -568,10 +562,6 @@ impl PageServerConfigBuilder { self.broker_keepalive_interval = BuilderValue::Set(broker_keepalive_interval) } - pub fn id(&mut self, node_id: NodeId) { - self.id = BuilderValue::Set(node_id) - } - pub fn log_format(&mut self, log_format: LogFormat) { self.log_format = BuilderValue::Set(log_format) } @@ -683,7 +673,7 @@ impl PageServerConfigBuilder { self.l0_flush = BuilderValue::Set(value); } - pub fn build(self) -> anyhow::Result { + pub fn build(self, id: NodeId) -> anyhow::Result { let default = Self::default_values(); macro_rules! 
conf { @@ -716,7 +706,6 @@ impl PageServerConfigBuilder { pg_auth_type, auth_validation_public_key_path, remote_storage_config, - id, broker_endpoint, broker_keepalive_interval, log_format, @@ -744,6 +733,7 @@ impl PageServerConfigBuilder { } CUSTOM LOGIC { + id: id, // TenantConf is handled separately default_tenant_conf: TenantConf::default(), concurrent_tenant_warmup: ConfigurableSemaphore::new({ @@ -893,7 +883,7 @@ impl PageServerConf { toml: &Document, workdir: &Utf8Path, ) -> anyhow::Result { - let mut builder = PageServerConfigBuilder::new(node_id); + let mut builder = PageServerConfigBuilder::new(); builder.workdir(workdir.to_owned()); let mut t_conf = TenantConfOpt::default(); @@ -924,8 +914,6 @@ impl PageServerConf { "tenant_config" => { t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?; } - "id" => {}, // Ignoring `id` field in pageserver.toml - using identity.toml as the source of truth - // Logging is not set up yet, so we can't do it. 
"broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?), "broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?), "log_format" => builder.log_format( @@ -1018,7 +1006,7 @@ impl PageServerConf { } } - let mut conf = builder.build().context("invalid config")?; + let mut conf = builder.build(node_id).context("invalid config")?; if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT { let auth_validation_public_key_path = conf @@ -1255,7 +1243,6 @@ max_file_descriptors = 333 # initial superuser role name to use when creating a new tenant initial_superuser_name = 'zzzz' -id = 10 metric_collection_interval = '222 s' metric_collection_endpoint = 'http://localhost:80/metrics' @@ -1272,9 +1259,8 @@ background_task_maximum_delay = '334 s' let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; let broker_endpoint = storage_broker::DEFAULT_ENDPOINT; // we have to create dummy values to overcome the validation errors - let config_string = format!( - "pg_distrib_dir='{pg_distrib_dir}'\nid=10\nbroker_endpoint = '{broker_endpoint}'", - ); + let config_string = + format!("pg_distrib_dir='{pg_distrib_dir}'\nbroker_endpoint = '{broker_endpoint}'",); let toml = config_string.parse()?; let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) @@ -1579,7 +1565,6 @@ broker_endpoint = '{broker_endpoint}' r#"pg_distrib_dir = "{pg_distrib_dir}" metric_collection_endpoint = "http://sample.url" metric_collection_interval = "10min" -id = 222 [disk_usage_based_eviction] max_usage_pct = 80 @@ -1625,7 +1610,7 @@ threshold = "20m" period: Duration::from_secs(10), #[cfg(feature = "testing")] mock_statvfs: None, - eviction_order: crate::disk_usage_eviction_task::EvictionOrder::AbsoluteAccessed, + eviction_order: Default::default(), }) ); @@ -1649,7 +1634,6 @@ threshold = "20m" r#"pg_distrib_dir = "{pg_distrib_dir}" 
metric_collection_endpoint = "http://sample.url" metric_collection_interval = "10min" -id = 222 [tenant_config] evictions_low_residence_duration_metric_threshold = "20m" diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 9104da6072..f94d945d46 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -1,5 +1,6 @@ //! Periodically collect consumption metrics for all active tenants //! and push them to a HTTP endpoint. +use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::size::CalculateSyntheticSizeError; @@ -39,49 +40,74 @@ type RawMetric = (MetricsKey, (EventType, u64)); /// for deduplication, but that is no longer needed. type Cache = HashMap; +pub async fn run( + conf: &'static PageServerConf, + tenant_manager: Arc, + cancel: CancellationToken, +) { + let Some(metric_collection_endpoint) = conf.metric_collection_endpoint.as_ref() else { + return; + }; + + let local_disk_storage = conf.workdir.join("last_consumption_metrics.json"); + + let metrics_ctx = RequestContext::todo_child( + TaskKind::MetricsCollection, + // This task itself shouldn't download anything. + // The actual size calculation does need downloads, and + // creates a child context with the right DownloadBehavior. 
+ DownloadBehavior::Error, + ); + let collect_metrics = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( + "consumption metrics collection", + collect_metrics( + tenant_manager.clone(), + metric_collection_endpoint, + &conf.metric_collection_bucket, + conf.metric_collection_interval, + conf.id, + local_disk_storage, + cancel.clone(), + metrics_ctx, + ) + .instrument(info_span!("metrics_collection")), + )); + + let worker_ctx = + RequestContext::todo_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download); + let synthetic_size_worker = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( + "synthetic size calculation", + calculate_synthetic_size_worker( + tenant_manager.clone(), + conf.synthetic_size_calculation_interval, + cancel.clone(), + worker_ctx, + ) + .instrument(info_span!("synthetic_size_worker")), + )); + + let (collect_metrics, synthetic_size_worker) = + futures::future::join(collect_metrics, synthetic_size_worker).await; + collect_metrics + .expect("unreachable: exit_on_panic_or_error would catch the panic and exit the process"); + synthetic_size_worker + .expect("unreachable: exit_on_panic_or_error would catch the panic and exit the process"); +} + /// Main thread that serves metrics collection #[allow(clippy::too_many_arguments)] -pub async fn collect_metrics( +async fn collect_metrics( tenant_manager: Arc, metric_collection_endpoint: &Url, metric_collection_bucket: &Option, metric_collection_interval: Duration, - synthetic_size_calculation_interval: Duration, node_id: NodeId, local_disk_storage: Utf8PathBuf, cancel: CancellationToken, ctx: RequestContext, ) -> anyhow::Result<()> { - // spin up background worker that caclulates tenant sizes - let worker_ctx = - ctx.detached_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download); - task_mgr::spawn( - BACKGROUND_RUNTIME.handle(), - TaskKind::CalculateSyntheticSize, - None, - None, - "synthetic size calculation", - false, - { - let tenant_manager = 
tenant_manager.clone(); - async move { - calculate_synthetic_size_worker( - tenant_manager, - synthetic_size_calculation_interval, - &cancel, - &worker_ctx, - ) - .instrument(info_span!("synthetic_size_worker")) - .await?; - Ok(()) - } - }, - ); - let path: Arc = Arc::new(local_disk_storage); - let cancel = task_mgr::shutdown_token(); - let restore_and_reschedule = restore_and_reschedule(&path, metric_collection_interval); let mut cached_metrics = tokio::select! { @@ -168,11 +194,9 @@ pub async fn collect_metrics( BackgroundLoopKind::ConsumptionMetricsCollectMetrics, ); - let res = tokio::time::timeout_at( - started_at + metric_collection_interval, - task_mgr::shutdown_token().cancelled(), - ) - .await; + let res = + tokio::time::timeout_at(started_at + metric_collection_interval, cancel.cancelled()) + .await; if res.is_ok() { return Ok(()); } @@ -272,8 +296,8 @@ async fn reschedule( async fn calculate_synthetic_size_worker( tenant_manager: Arc, synthetic_size_calculation_interval: Duration, - cancel: &CancellationToken, - ctx: &RequestContext, + cancel: CancellationToken, + ctx: RequestContext, ) -> anyhow::Result<()> { info!("starting calculate_synthetic_size_worker"); scopeguard::defer! { @@ -313,7 +337,7 @@ async fn calculate_synthetic_size_worker( // there is never any reason to exit calculate_synthetic_size_worker following any // return value -- we don't need to care about shutdown because no tenant is found when // pageserver is shut down. 
- calculate_and_log(&tenant, cancel, ctx).await; + calculate_and_log(&tenant, &cancel, &ctx).await; } crate::tenant::tasks::warn_when_period_overrun( diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index 26e7cc7ef8..b5d9267d79 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -171,14 +171,14 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { register, }; - fail::fail_point!("control-plane-client-re-attach"); - let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?; tracing::info!( "Received re-attach response with {} tenants", response.tenants.len() ); + failpoint_support::sleep_millis_async!("control-plane-client-re-attach"); + Ok(response .tenants .into_iter() diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 90bd4294bb..92dcf6ee61 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -59,13 +59,14 @@ use utils::{completion, id::TimelineId}; use crate::{ config::PageServerConf, metrics::disk_usage_based_eviction::METRICS, - task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, + task_mgr::{self, BACKGROUND_RUNTIME}, tenant::{ mgr::TenantManager, remote_timeline_client::LayerFileMetadata, secondary::SecondaryTenant, storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName}, }, + CancellableTask, DiskUsageEvictionTask, }; #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -83,17 +84,9 @@ pub struct DiskUsageEvictionTaskConfig { /// Selects the sort order for eviction candidates *after* per tenant `min_resident_size` /// partitioning. 
-#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] #[serde(tag = "type", content = "args")] pub enum EvictionOrder { - /// Order the layers to be evicted by how recently they have been accessed in absolute - /// time. - /// - /// This strategy is unfair when some tenants grow faster than others towards the slower - /// growing. - #[default] - AbsoluteAccessed, - /// Order the layers to be evicted by how recently they have been accessed relatively within /// the set of resident layers of a tenant. RelativeAccessed { @@ -108,6 +101,14 @@ pub enum EvictionOrder { }, } +impl Default for EvictionOrder { + fn default() -> Self { + Self::RelativeAccessed { + highest_layer_count_loses_first: true, + } + } +} + fn default_highest_layer_count_loses_first() -> bool { true } @@ -117,11 +118,6 @@ impl EvictionOrder { use EvictionOrder::*; match self { - AbsoluteAccessed => { - candidates.sort_unstable_by_key(|(partition, candidate)| { - (*partition, candidate.last_activity_ts) - }); - } RelativeAccessed { .. 
} => candidates.sort_unstable_by_key(|(partition, candidate)| { (*partition, candidate.relative_last_activity) }), @@ -134,7 +130,6 @@ impl EvictionOrder { use EvictionOrder::*; match self { - AbsoluteAccessed => finite_f32::FiniteF32::ZERO, RelativeAccessed { highest_layer_count_loses_first, } => { @@ -192,36 +187,34 @@ pub fn launch_disk_usage_global_eviction_task( state: Arc, tenant_manager: Arc, background_jobs_barrier: completion::Barrier, -) -> anyhow::Result<()> { +) -> Option { let Some(task_config) = &conf.disk_usage_based_eviction else { info!("disk usage based eviction task not configured"); - return Ok(()); + return None; }; info!("launching disk usage based eviction task"); - task_mgr::spawn( - BACKGROUND_RUNTIME.handle(), - TaskKind::DiskUsageEviction, - None, - None, + let cancel = CancellationToken::new(); + let task = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( "disk usage based eviction", - false, - async move { - let cancel = task_mgr::shutdown_token(); + { + let cancel = cancel.clone(); + async move { + // wait until initial load is complete, because we cannot evict from loading tenants. + tokio::select! { + _ = cancel.cancelled() => { return anyhow::Ok(()); }, + _ = background_jobs_barrier.wait() => { } + }; - // wait until initial load is complete, because we cannot evict from loading tenants. - tokio::select! 
{ - _ = cancel.cancelled() => { return Ok(()); }, - _ = background_jobs_barrier.wait() => { } - }; - - disk_usage_eviction_task(&state, task_config, &storage, tenant_manager, cancel).await; - Ok(()) + disk_usage_eviction_task(&state, task_config, &storage, tenant_manager, cancel) + .await; + anyhow::Ok(()) + } }, - ); + )); - Ok(()) + Some(DiskUsageEvictionTask(CancellableTask { cancel, task })) } #[instrument(skip_all)] diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 087d281a0c..59e646d0ca 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -414,7 +414,7 @@ paths: Either archives or unarchives the given timeline. An archived timeline may not have any non-archived children. requestBody: - required: false + required: true content: application/json: schema: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index b8063eb5a2..7935aeb5e9 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1650,7 +1650,9 @@ async fn timeline_compact_handler( .await .map_err(|e| ApiError::InternalServerError(e.into()))?; if wait_until_uploaded { - timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?; + timeline.remote_client.wait_completion().await + // XXX map to correct ApiError for the cases where it's due to shutdown + .context("wait completion").map_err(ApiError::InternalServerError)?; } json_response(StatusCode::OK, ()) } @@ -1676,6 +1678,10 @@ async fn timeline_checkpoint_handler( if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? { flags |= CompactFlags::ForceImageLayerCreation; } + + // By default, checkpoints come with a compaction, but this may be optionally disabled by tests that just want to flush + upload. 
+ let compact = parse_query_param::<_, bool>(&request, "compact")?.unwrap_or(true); + let wait_until_uploaded = parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false); @@ -1692,18 +1698,22 @@ async fn timeline_checkpoint_handler( } })?; - timeline - .compact(&cancel, flags, &ctx) - .await - .map_err(|e| - match e { - CompactionError::ShuttingDown => ApiError::ShuttingDown, - CompactionError::Other(e) => ApiError::InternalServerError(e) - } - )?; + if compact { + timeline + .compact(&cancel, flags, &ctx) + .await + .map_err(|e| + match e { + CompactionError::ShuttingDown => ApiError::ShuttingDown, + CompactionError::Other(e) => ApiError::InternalServerError(e) + } + )?; + } if wait_until_uploaded { - timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?; + timeline.remote_client.wait_completion().await + // XXX map to correct ApiError for the cases where it's due to shutdown + .context("wait completion").map_err(ApiError::InternalServerError)?; } json_response(StatusCode::OK, ()) diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 63c677574f..d944019641 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -13,6 +13,7 @@ pub mod http; pub mod import_datadir; pub mod l0_flush; pub use pageserver_api::keyspace; +use tokio_util::sync::CancellationToken; pub mod aux_file; pub mod metrics; pub mod page_cache; @@ -32,7 +33,10 @@ pub mod walredo; use crate::task_mgr::TaskKind; use camino::Utf8Path; use deletion_queue::DeletionQueue; -use tenant::mgr::TenantManager; +use tenant::{ + mgr::{BackgroundPurges, TenantManager}, + secondary, +}; use tracing::info; /// Current storage format version @@ -54,17 +58,39 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]); pub use crate::metrics::preinitialize_metrics; +pub struct CancellableTask { + pub task: tokio::task::JoinHandle<()>, + pub cancel: CancellationToken, +} +pub struct HttpEndpointListener(pub CancellableTask); 
+pub struct LibpqEndpointListener(pub CancellableTask); +pub struct ConsumptionMetricsTasks(pub CancellableTask); +pub struct DiskUsageEvictionTask(pub CancellableTask); +impl CancellableTask { + pub async fn shutdown(self) { + self.cancel.cancel(); + self.task.await.unwrap(); + } +} + #[tracing::instrument(skip_all, fields(%exit_code))] +#[allow(clippy::too_many_arguments)] pub async fn shutdown_pageserver( + http_listener: HttpEndpointListener, + libpq_listener: LibpqEndpointListener, + consumption_metrics_worker: ConsumptionMetricsTasks, + disk_usage_eviction_task: Option, tenant_manager: &TenantManager, + background_purges: BackgroundPurges, mut deletion_queue: DeletionQueue, + secondary_controller_tasks: secondary::GlobalTasks, exit_code: i32, ) { use std::time::Duration; // Shut down the libpq endpoint task. This prevents new connections from // being accepted. timed( - task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None), + libpq_listener.0.shutdown(), "shutdown LibpqEndpointListener", Duration::from_secs(1), ) @@ -91,16 +117,44 @@ pub async fn shutdown_pageserver( // Best effort to persist any outstanding deletions, to avoid leaking objects deletion_queue.shutdown(Duration::from_secs(5)).await; + timed( + consumption_metrics_worker.0.shutdown(), + "shutdown consumption metrics", + Duration::from_secs(1), + ) + .await; + + timed( + futures::future::OptionFuture::from(disk_usage_eviction_task.map(|t| t.0.shutdown())), + "shutdown disk usage eviction", + Duration::from_secs(1), + ) + .await; + + timed( + background_purges.shutdown(), + "shutdown background purges", + Duration::from_secs(1), + ) + .await; + // Shut down the HTTP endpoint last, so that you can still check the server's // status while it's shutting down. // FIXME: We should probably stop accepting commands like attach/detach earlier. 
timed( - task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None), + http_listener.0.shutdown(), "shutdown http", Duration::from_secs(1), ) .await; + timed( + secondary_controller_tasks.wait(), // cancellation happened in caller + "secondary controller wait", + Duration::from_secs(1), + ) + .await; + // There should be nothing left, but let's be sure timed( task_mgr::shutdown_tasks(None, None, None), diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index c03567f6ef..9aff5220f5 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -3104,6 +3104,8 @@ pub fn preinitialize_metrics() { &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_SUCCESSES, &REMOTE_ONDEMAND_DOWNLOADED_LAYERS, &REMOTE_ONDEMAND_DOWNLOADED_BYTES, + &CIRCUIT_BREAKERS_BROKEN, + &CIRCUIT_BREAKERS_UNBROKEN, ] .into_iter() .for_each(|c| { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 00147a8ca6..6353f713e0 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -124,7 +124,6 @@ pub async fn libpq_listener_main( None, None, "serving compute connection task", - false, page_service_conn_main( tenant_manager.clone(), local_auth, diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 3bbd084ab4..85f3a6e0fb 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -284,6 +284,16 @@ impl Timeline { if let Some(_nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) { return Ok(true); } + // then check if the database was already initialized. + // get_rel_exists can be called before dbdir is created. 
+ let buf = version.get(self, DBDIR_KEY, ctx).await?; + let dbdirs = match DbDirectory::des(&buf).context("deserialization failure") { + Ok(dir) => Ok(dir.dbdirs), + Err(e) => Err(PageReconstructError::from(e)), + }?; + if !dbdirs.contains_key(&(tag.spcnode, tag.dbnode)) { + return Ok(false); + } // fetch directory listing let key = rel_dir_to_key(tag.spcnode, tag.dbnode); let buf = version.get(self, key, ctx).await?; diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 5f46ce3d69..5cd78874c1 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -408,7 +408,6 @@ pub fn spawn( tenant_shard_id: Option, timeline_id: Option, name: &str, - shutdown_process_on_error: bool, future: F, ) -> PageserverTaskId where @@ -437,7 +436,6 @@ where task_id, task_cloned, cancel, - shutdown_process_on_error, future, )); task_mut.join_handle = Some(join_handle); @@ -454,82 +452,78 @@ async fn task_wrapper( task_id: u64, task: Arc, shutdown_token: CancellationToken, - shutdown_process_on_error: bool, future: F, ) where F: Future> + Send + 'static, { debug!("Starting task '{}'", task_name); - let result = SHUTDOWN_TOKEN - .scope( - shutdown_token, - CURRENT_TASK.scope(task, { - // We use AssertUnwindSafe here so that the payload function - // doesn't need to be UnwindSafe. We don't do anything after the - // unwinding that would expose us to unwind-unsafe behavior. - AssertUnwindSafe(future).catch_unwind() - }), - ) - .await; - task_finish(result, task_name, task_id, shutdown_process_on_error).await; -} - -async fn task_finish( - result: std::result::Result< - anyhow::Result<()>, - std::boxed::Box, - >, - task_name: String, - task_id: u64, - shutdown_process_on_error: bool, -) { - // Remove our entry from the global hashmap. 
- let task = TASKS - .lock() - .unwrap() - .remove(&task_id) - .expect("no task in registry"); - - let mut shutdown_process = false; - { + // wrap the future so we log panics and errors + let tenant_shard_id = task.tenant_shard_id; + let timeline_id = task.timeline_id; + let fut = async move { + // We use AssertUnwindSafe here so that the payload function + // doesn't need to be UnwindSafe. We don't do anything after the + // unwinding that would expose us to unwind-unsafe behavior. + let result = AssertUnwindSafe(future).catch_unwind().await; match result { Ok(Ok(())) => { debug!("Task '{}' exited normally", task_name); } Ok(Err(err)) => { - if shutdown_process_on_error { - error!( - "Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}", - task_name, task.tenant_shard_id, task.timeline_id, err - ); - shutdown_process = true; - } else { - error!( - "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}", - task_name, task.tenant_shard_id, task.timeline_id, err - ); - } + error!( + "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}", + task_name, tenant_shard_id, timeline_id, err + ); } Err(err) => { - if shutdown_process_on_error { - error!( - "Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}", - task_name, task.tenant_shard_id, task.timeline_id, err - ); - shutdown_process = true; - } else { - error!( - "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}", - task_name, task.tenant_shard_id, task.timeline_id, err - ); - } + error!( + "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}", + task_name, tenant_shard_id, timeline_id, err + ); } } - } + }; - if shutdown_process { - std::process::exit(1); + // add the task-locals + let fut = CURRENT_TASK.scope(task, fut); + let fut = SHUTDOWN_TOKEN.scope(shutdown_token, fut); + + // poll future to completion + fut.await; + + // Remove our entry from the global hashmap. 
+ TASKS + .lock() + .unwrap() + .remove(&task_id) + .expect("no task in registry"); +} + +pub async fn exit_on_panic_or_error( + task_name: &'static str, + future: impl Future>, +) -> T +where + E: std::fmt::Debug, +{ + // We use AssertUnwindSafe here so that the payload function + // doesn't need to be UnwindSafe. We don't do anything after the + // unwinding that would expose us to unwind-unsafe behavior. + let result = AssertUnwindSafe(future).catch_unwind().await; + match result { + Ok(Ok(val)) => val, + Ok(Err(err)) => { + error!( + task_name, + "Task exited with error, exiting process: {err:?}" + ); + std::process::exit(1); + } + Err(panic_obj) => { + error!(task_name, "Task panicked, exiting process: {panic_obj:?}"); + std::process::exit(1); + } } } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 6d59752606..e5ac6725ad 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -31,6 +31,7 @@ use pageserver_api::shard::TenantShardId; use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; use remote_storage::TimeoutOrCancel; +use std::collections::BTreeMap; use std::fmt; use std::time::SystemTime; use storage_broker::BrokerClientChannel; @@ -95,16 +96,13 @@ use crate::tenant::storage_layer::ImageLayer; use crate::walredo; use crate::InitializationOrder; use std::collections::hash_map::Entry; -use std::collections::BTreeSet; use std::collections::HashMap; use std::collections::HashSet; use std::fmt::Debug; use std::fmt::Display; use std::fs; use std::fs::File; -use std::ops::Bound::Included; -use std::sync::atomic::AtomicU64; -use std::sync::atomic::Ordering; +use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::Arc; use std::sync::Mutex; use std::time::{Duration, Instant}; @@ -721,7 +719,6 @@ impl Tenant { Some(tenant_shard_id), None, "attach tenant", - false, async move { info!( @@ -1229,11 +1226,29 @@ impl Tenant { Ok(timeline_preloads) } - pub async fn apply_timeline_archival_config( + 
pub(crate) async fn apply_timeline_archival_config( &self, - _timeline_id: TimelineId, - _config: TimelineArchivalState, + timeline_id: TimelineId, + state: TimelineArchivalState, ) -> anyhow::Result<()> { + let timeline = self + .get_timeline(timeline_id, false) + .context("Cannot apply timeline archival config to inexistent timeline")?; + + let upload_needed = timeline + .remote_client + .schedule_index_upload_for_timeline_archival_state(state)?; + + if upload_needed { + const MAX_WAIT: Duration = Duration::from_secs(10); + let Ok(v) = + tokio::time::timeout(MAX_WAIT, timeline.remote_client.wait_completion()).await + else { + tracing::warn!("reached timeout for waiting on upload queue"); + bail!("reached timeout for upload queue flush"); + }; + v?; + } Ok(()) } @@ -1618,21 +1633,23 @@ impl Tenant { /// This function is periodically called by compactor task. /// Also it can be explicitly requested per timeline through page server /// api's 'compact' command. + /// + /// Returns whether we have pending compaction task. 
async fn compaction_iteration( &self, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result<(), timeline::CompactionError> { + ) -> Result { // Don't start doing work during shutdown, or when broken, we do not need those in the logs if !self.is_active() { - return Ok(()); + return Ok(false); } { let conf = self.tenant_conf.load(); if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() { info!("Skipping compaction in location state {:?}", conf.location); - return Ok(()); + return Ok(false); } } @@ -1659,20 +1676,24 @@ impl Tenant { // Before doing any I/O work, check our circuit breaker if self.compaction_circuit_breaker.lock().unwrap().is_broken() { info!("Skipping compaction due to previous failures"); - return Ok(()); + return Ok(false); } + let mut has_pending_task = false; + for (timeline_id, timeline) in &timelines_to_compact { - timeline + has_pending_task |= timeline .compact(cancel, EnumSet::empty(), ctx) .instrument(info_span!("compact_timeline", %timeline_id)) .await - .map_err(|e| { - self.compaction_circuit_breaker - .lock() - .unwrap() - .fail(&CIRCUIT_BREAKERS_BROKEN, &e); - e + .inspect_err(|e| match e { + timeline::CompactionError::ShuttingDown => (), + timeline::CompactionError::Other(e) => { + self.compaction_circuit_breaker + .lock() + .unwrap() + .fail(&CIRCUIT_BREAKERS_BROKEN, e); + } })?; } @@ -1681,7 +1702,7 @@ impl Tenant { .unwrap() .success(&CIRCUIT_BREAKERS_UNBROKEN); - Ok(()) + Ok(has_pending_task) } // Call through to all timelines to freeze ephemeral layers if needed. Usually @@ -1766,6 +1787,9 @@ impl Tenant { .values() .filter(|timeline| !(timeline.is_broken() || timeline.is_stopping())); + // Before activation, populate each Timeline's GcInfo with information about its children + self.initialize_gc_info(&timelines_accessor); + // Spawn gc and compaction loops. The loops will shut themselves // down when they notice that the tenant is inactive. 
tasks::start_background_loops(self, background_jobs_can_start); @@ -2799,6 +2823,55 @@ impl Tenant { .await } + /// Populate all Timelines' `GcInfo` with information about their children. We do not set the + /// PITR cutoffs here, because that requires I/O: this is done later, before GC, by [`Self::refresh_gc_info_internal`] + /// + /// Subsequently, parent-child relationships are updated incrementally during timeline creation/deletion. + fn initialize_gc_info( + &self, + timelines: &std::sync::MutexGuard>>, + ) { + // This function must be called before activation: after activation timeline create/delete operations + // might happen, and this function is not safe to run concurrently with those. + assert!(!self.is_active()); + + // Scan all timelines. For each timeline, remember the timeline ID and + // the branch point where it was created. + let mut all_branchpoints: BTreeMap> = BTreeMap::new(); + timelines.iter().for_each(|(timeline_id, timeline_entry)| { + if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() { + let ancestor_children = all_branchpoints.entry(*ancestor_timeline_id).or_default(); + ancestor_children.push((timeline_entry.get_ancestor_lsn(), *timeline_id)); + } + }); + + // The number of bytes we always keep, irrespective of PITR: this is a constant across timelines + let horizon = self.get_gc_horizon(); + + // Populate each timeline's GcInfo with information about its child branches + for timeline in timelines.values() { + let mut branchpoints: Vec<(Lsn, TimelineId)> = all_branchpoints + .remove(&timeline.timeline_id) + .unwrap_or_default(); + + branchpoints.sort_by_key(|b| b.0); + + let mut target = timeline.gc_info.write().unwrap(); + + target.retain_lsns = branchpoints; + + let space_cutoff = timeline + .get_last_record_lsn() + .checked_sub(horizon) + .unwrap_or(Lsn(0)); + + target.cutoffs = GcCutoffs { + space: space_cutoff, + time: Lsn::INVALID, + }; + } + } + async fn refresh_gc_info_internal( &self, 
target_timeline_id: Option, @@ -2821,6 +2894,11 @@ impl Tenant { .cloned() .collect::>(); + if target_timeline_id.is_some() && timelines.is_empty() { + // We were to act on a particular timeline and it wasn't found + return Err(GcError::TimelineNotFound); + } + let mut gc_cutoffs: HashMap = HashMap::with_capacity(timelines.len()); @@ -2843,68 +2921,63 @@ impl Tenant { // because that will stall branch creation. let gc_cs = self.gc_cs.lock().await; - // Scan all timelines. For each timeline, remember the timeline ID and - // the branch point where it was created. - let (all_branchpoints, timelines): (BTreeSet<(TimelineId, Lsn)>, _) = { - let timelines = self.timelines.lock().unwrap(); - let mut all_branchpoints = BTreeSet::new(); - let timelines = { - if let Some(target_timeline_id) = target_timeline_id.as_ref() { - if timelines.get(target_timeline_id).is_none() { - return Err(GcError::TimelineNotFound); + // Paranoia check: it is critical that GcInfo's list of child timelines is correct, to avoid incorrectly GC'ing data they + // depend on. So although GcInfo is updated continuously by Timeline::new and Timeline::drop, we also calculate it here + // and fail out if it's inaccurate. 
+ // (this can be removed later, it's a risk mitigation for https://github.com/neondatabase/neon/pull/8427) + { + let mut all_branchpoints: BTreeMap> = + BTreeMap::new(); + timelines.iter().for_each(|timeline| { + if let Some(ancestor_timeline_id) = &timeline.get_ancestor_timeline_id() { + let ancestor_children = + all_branchpoints.entry(*ancestor_timeline_id).or_default(); + ancestor_children.push((timeline.get_ancestor_lsn(), timeline.timeline_id)); + } + }); + + for timeline in &timelines { + let mut branchpoints: Vec<(Lsn, TimelineId)> = all_branchpoints + .remove(&timeline.timeline_id) + .unwrap_or_default(); + + branchpoints.sort_by_key(|b| b.0); + + let target = timeline.gc_info.read().unwrap(); + + // We require that retain_lsns contains everything in `branchpoints`, but not that + // they are exactly equal: timeline deletions can race with us, so retain_lsns + // may contain some extra stuff. It is safe to have extra timelines in there, because it + // just means that we retain slightly more data than we otherwise might. + let have_branchpoints = target.retain_lsns.iter().copied().collect::>(); + for b in &branchpoints { + if !have_branchpoints.contains(b) { + tracing::error!( + "Bug: `retain_lsns` is set incorrectly. Expected be {:?}, but found {:?}", + branchpoints, + target.retain_lsns + ); + debug_assert!(false); + // Do not GC based on bad information! + // (ab-use an existing GcError type rather than adding a new one, since this is a + // "should never happen" check that will be removed soon). + return Err(GcError::Remote(anyhow::anyhow!( + "retain_lsns failed validation!" 
+ ))); } - }; - - timelines - .iter() - .map(|(_timeline_id, timeline_entry)| { - if let Some(ancestor_timeline_id) = - &timeline_entry.get_ancestor_timeline_id() - { - // If target_timeline is specified, we only need to know branchpoints of its children - if let Some(timeline_id) = target_timeline_id { - if ancestor_timeline_id == &timeline_id { - all_branchpoints.insert(( - *ancestor_timeline_id, - timeline_entry.get_ancestor_lsn(), - )); - } - } - // Collect branchpoints for all timelines - else { - all_branchpoints.insert(( - *ancestor_timeline_id, - timeline_entry.get_ancestor_lsn(), - )); - } - } - - timeline_entry.clone() - }) - .collect::>() - }; - (all_branchpoints, timelines) - }; + } + } + } // Ok, we now know all the branch points. // Update the GC information for each timeline. let mut gc_timelines = Vec::with_capacity(timelines.len()); for timeline in timelines { - // If target_timeline is specified, ignore all other timelines + // We filtered the timeline list above if let Some(target_timeline_id) = target_timeline_id { - if timeline.timeline_id != target_timeline_id { - continue; - } + assert_eq!(target_timeline_id, timeline.timeline_id); } - let branchpoints: Vec = all_branchpoints - .range(( - Included((timeline.timeline_id, Lsn(0))), - Included((timeline.timeline_id, Lsn(u64::MAX))), - )) - .map(|&x| x.1) - .collect(); - { let mut target = timeline.gc_info.write().unwrap(); @@ -2942,20 +3015,12 @@ impl Tenant { .0, ); - match gc_cutoffs.remove(&timeline.timeline_id) { - Some(cutoffs) => { - target.retain_lsns = branchpoints; - target.cutoffs = cutoffs; - } - None => { - // reasons for this being unavailable: - // - this timeline was created while we were finding cutoffs - // - lsn for timestamp search fails for this timeline repeatedly - // - // in both cases, refreshing the branchpoints is correct. - target.retain_lsns = branchpoints; - } - }; + // Apply the cutoffs we found to the Timeline's GcInfo. 
Why might we _not_ have cutoffs for a timeline? + // - this timeline was created while we were finding cutoffs + // - lsn for timestamp search fails for this timeline repeatedly + if let Some(cutoffs) = gc_cutoffs.get(&timeline.timeline_id) { + target.cutoffs = cutoffs.clone(); + } } gc_timelines.push(timeline); @@ -3993,6 +4058,7 @@ mod tests { use storage_layer::PersistentLayerKey; use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; + use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn}; use timeline::{DeltaLayerTestDesc, GcInfo}; use utils::bin_ser::BeSer; use utils::id::TenantId; @@ -4343,7 +4409,7 @@ mod tests { { let branchpoints = &tline.gc_info.read().unwrap().retain_lsns; assert_eq!(branchpoints.len(), 1); - assert_eq!(branchpoints[0], Lsn(0x40)); + assert_eq!(branchpoints[0], (Lsn(0x40), NEW_TIMELINE_ID)); } // You can read the key from the child branch even though the parent is @@ -4525,7 +4591,7 @@ mod tests { let layer_map = tline.layers.read().await; let level0_deltas = layer_map .layer_map() - .get_level0_deltas()? + .get_level0_deltas() .into_iter() .map(|desc| layer_map.get_from_desc(&desc)) .collect::>(); @@ -5744,7 +5810,7 @@ mod tests { .read() .await .layer_map() - .get_level0_deltas()? + .get_level0_deltas() .len(); tline.compact(&cancel, EnumSet::empty(), &ctx).await?; @@ -5754,7 +5820,7 @@ mod tests { .read() .await .layer_map() - .get_level0_deltas()? 
+ .get_level0_deltas() .len(); assert!(after_num_l0_delta_files < before_num_l0_delta_files, "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}"); @@ -7215,4 +7281,438 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn test_generate_key_retention() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_generate_key_retention").await?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + tline.force_advance_lsn(Lsn(0x70)); + let key = Key::from_hex("010000000033333333444444445500000000").unwrap(); + let history = vec![ + ( + key, + Lsn(0x10), + Value::Image(Bytes::copy_from_slice(b"0x10")), + ), + ( + key, + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append(";0x20")), + ), + ( + key, + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append(";0x30")), + ), + ( + key, + Lsn(0x40), + Value::WalRecord(NeonWalRecord::wal_append(";0x40")), + ), + ( + key, + Lsn(0x50), + Value::WalRecord(NeonWalRecord::wal_append(";0x50")), + ), + ( + key, + Lsn(0x60), + Value::WalRecord(NeonWalRecord::wal_append(";0x60")), + ), + ( + key, + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + ), + ( + key, + Lsn(0x80), + Value::Image(Bytes::copy_from_slice( + b"0x10;0x20;0x30;0x40;0x50;0x60;0x70;0x80", + )), + ), + ( + key, + Lsn(0x90), + Value::WalRecord(NeonWalRecord::wal_append(";0x90")), + ), + ]; + let res = tline + .generate_key_retention( + key, + &history, + Lsn(0x60), + &[Lsn(0x20), Lsn(0x40), Lsn(0x50)], + 3, + ) + .await + .unwrap(); + let expected_res = KeyHistoryRetention { + below_horizon: vec![ + ( + Lsn(0x20), + KeyLogAtLsn(vec![( + Lsn(0x20), + Value::Image(Bytes::copy_from_slice(b"0x10;0x20")), + )]), + ), + ( + Lsn(0x40), + KeyLogAtLsn(vec![ + ( + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append(";0x30")), + ), + ( + Lsn(0x40), + 
Value::WalRecord(NeonWalRecord::wal_append(";0x40")), + ), + ]), + ), + ( + Lsn(0x50), + KeyLogAtLsn(vec![( + Lsn(0x50), + Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40;0x50")), + )]), + ), + ( + Lsn(0x60), + KeyLogAtLsn(vec![( + Lsn(0x60), + Value::WalRecord(NeonWalRecord::wal_append(";0x60")), + )]), + ), + ], + above_horizon: KeyLogAtLsn(vec![ + ( + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + ), + ( + Lsn(0x80), + Value::Image(Bytes::copy_from_slice( + b"0x10;0x20;0x30;0x40;0x50;0x60;0x70;0x80", + )), + ), + ( + Lsn(0x90), + Value::WalRecord(NeonWalRecord::wal_append(";0x90")), + ), + ]), + }; + assert_eq!(res, expected_res); + + // We expect GC-compaction to run with the original GC. This would create a situation that + // the original GC algorithm removes some delta layers b/c there are full image coverage, + // therefore causing some keys to have an incomplete history below the lowest retain LSN. + // For example, we have + // ```plain + // init delta @ 0x10, image @ 0x20, delta @ 0x30 (gc_horizon), image @ 0x40. + // ``` + // Now the GC horizon moves up, and we have + // ```plain + // init delta @ 0x10, image @ 0x20, delta @ 0x30, image @ 0x40 (gc_horizon) + // ``` + // The original GC algorithm kicks in, and removes delta @ 0x10, image @ 0x20. + // We will end up with + // ```plain + // delta @ 0x30, image @ 0x40 (gc_horizon) + // ``` + // Now we run the GC-compaction, and this key does not have a full history. + // We should be able to handle this partial history and drop everything before the + // gc_horizon image. 
+ + let history = vec![ + ( + key, + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append(";0x20")), + ), + ( + key, + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append(";0x30")), + ), + ( + key, + Lsn(0x40), + Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40")), + ), + ( + key, + Lsn(0x50), + Value::WalRecord(NeonWalRecord::wal_append(";0x50")), + ), + ( + key, + Lsn(0x60), + Value::WalRecord(NeonWalRecord::wal_append(";0x60")), + ), + ( + key, + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + ), + ( + key, + Lsn(0x80), + Value::Image(Bytes::copy_from_slice( + b"0x10;0x20;0x30;0x40;0x50;0x60;0x70;0x80", + )), + ), + ( + key, + Lsn(0x90), + Value::WalRecord(NeonWalRecord::wal_append(";0x90")), + ), + ]; + let res = tline + .generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3) + .await + .unwrap(); + let expected_res = KeyHistoryRetention { + below_horizon: vec![ + ( + Lsn(0x40), + KeyLogAtLsn(vec![( + Lsn(0x40), + Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40")), + )]), + ), + ( + Lsn(0x50), + KeyLogAtLsn(vec![( + Lsn(0x50), + Value::WalRecord(NeonWalRecord::wal_append(";0x50")), + )]), + ), + ( + Lsn(0x60), + KeyLogAtLsn(vec![( + Lsn(0x60), + Value::WalRecord(NeonWalRecord::wal_append(";0x60")), + )]), + ), + ], + above_horizon: KeyLogAtLsn(vec![ + ( + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + ), + ( + Lsn(0x80), + Value::Image(Bytes::copy_from_slice( + b"0x10;0x20;0x30;0x40;0x50;0x60;0x70;0x80", + )), + ), + ( + Lsn(0x90), + Value::WalRecord(NeonWalRecord::wal_append(";0x90")), + ), + ]), + }; + assert_eq!(res, expected_res); + + Ok(()) + } + + #[tokio::test] + async fn test_simple_bottom_most_compaction_with_retain_lsns() -> anyhow::Result<()> { + let harness = + TenantHarness::create("test_simple_bottom_most_compaction_with_retain_lsns").await?; + let (tenant, ctx) = harness.load().await; + + fn get_key(id: u32) -> Key { + // using aux key here b/c they are 
guaranteed to be inside `collect_keyspace`. + let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + let img_layer = (0..10) + .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) + .collect_vec(); + + let delta1 = vec![ + ( + get_key(1), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ( + get_key(2), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append("@0x30")), + ), + ( + get_key(3), + Lsn(0x28), + Value::WalRecord(NeonWalRecord::wal_append("@0x28")), + ), + ( + get_key(3), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append("@0x30")), + ), + ( + get_key(3), + Lsn(0x40), + Value::WalRecord(NeonWalRecord::wal_append("@0x40")), + ), + ]; + let delta2 = vec![ + ( + get_key(5), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ( + get_key(6), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ]; + let delta3 = vec![ + ( + get_key(8), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ( + get_key(9), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ]; + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![ + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3), + ], // delta layers + vec![(Lsn(0x10), img_layer)], // image layers + Lsn(0x50), + ) + .await?; + { + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + *guard = GcInfo { + retain_lsns: vec![ + (Lsn(0x10), tline.timeline_id), + (Lsn(0x20), tline.timeline_id), + ], + cutoffs: GcCutoffs { + time: Lsn(0x30), + space: Lsn(0x30), + }, + leases: Default::default(), + within_ancestor_pitr: false, + }; + } + + let expected_result = [ + 
Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20"), + Bytes::from_static(b"value 2@0x10@0x30"), + Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10@0x20"), + Bytes::from_static(b"value 6@0x10@0x20"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10@0x48"), + Bytes::from_static(b"value 9@0x10@0x48"), + ]; + + let expected_result_at_gc_horizon = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20"), + Bytes::from_static(b"value 2@0x10@0x30"), + Bytes::from_static(b"value 3@0x10@0x28@0x30"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10@0x20"), + Bytes::from_static(b"value 6@0x10@0x20"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let expected_result_at_lsn_20 = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10@0x20"), + Bytes::from_static(b"value 6@0x10@0x20"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let expected_result_at_lsn_10 = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10"), + Bytes::from_static(b"value 6@0x10"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let verify_result = || async { + for idx in 0..10 { + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x50), &ctx) + .await + .unwrap(), + &expected_result[idx] + ); + assert_eq!( + tline + 
.get(get_key(idx as u32), Lsn(0x30), &ctx) + .await + .unwrap(), + &expected_result_at_gc_horizon[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x20), &ctx) + .await + .unwrap(), + &expected_result_at_lsn_20[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x10), &ctx) + .await + .unwrap(), + &expected_result_at_lsn_10[idx] + ); + } + }; + + verify_result().await; + + let cancel = CancellationToken::new(); + tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + + verify_result().await; + + Ok(()) + } } diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 2724a5cc07..6f150a2d5c 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -463,7 +463,7 @@ impl LayerMap { pub(self) fn insert_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) { // TODO: See #3869, resulting #4088, attempted fix and repro #4094 - if Self::is_l0(&layer_desc) { + if Self::is_l0(&layer_desc.key_range) { self.l0_delta_layers.push(layer_desc.clone().into()); } @@ -482,7 +482,7 @@ impl LayerMap { self.historic .remove(historic_layer_coverage::LayerKey::from(layer_desc)); let layer_key = layer_desc.key(); - if Self::is_l0(layer_desc) { + if Self::is_l0(&layer_desc.key_range) { let len_before = self.l0_delta_layers.len(); let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers); l0_delta_layers.retain(|other| other.key() != layer_key); @@ -598,8 +598,9 @@ impl LayerMap { coverage } - pub fn is_l0(layer: &PersistentLayerDesc) -> bool { - layer.get_key_range() == (Key::MIN..Key::MAX) + /// Check if the key range resembles that of an L0 layer. + pub fn is_l0(key_range: &Range) -> bool { + key_range == &(Key::MIN..Key::MAX) } /// This function determines which layers are counted in `count_deltas`: @@ -626,7 +627,7 @@ impl LayerMap { /// than just the current partition_range. 
pub fn is_reimage_worthy(layer: &PersistentLayerDesc, partition_range: &Range) -> bool { // Case 1 - if !Self::is_l0(layer) { + if !Self::is_l0(&layer.key_range) { return true; } @@ -844,8 +845,8 @@ impl LayerMap { } /// Return all L0 delta layers - pub fn get_level0_deltas(&self) -> Result>> { - Ok(self.l0_delta_layers.to_vec()) + pub fn get_level0_deltas(&self) -> Vec> { + self.l0_delta_layers.to_vec() } /// debugging function to print out the contents of the layer map diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 6ba1bdef9b..bbc070a81b 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -111,7 +111,7 @@ impl TryFrom<&TimelineMetadataBodyV2> for TimelineMetadataHeader { #[error("re-serializing for crc32 failed")] struct Crc32CalculationFailed(#[source] utils::bin_ser::SerializeError); -const METADATA_HDR_SIZE: usize = std::mem::size_of::(); +const METADATA_HDR_SIZE: usize = size_of::(); #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] struct TimelineMetadataBodyV2 { diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 4912608677..75c8682c97 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -36,7 +36,7 @@ use crate::control_plane_client::{ use crate::deletion_queue::DeletionQueueClient; use crate::http::routes::ACTIVE_TENANT_TIMEOUT; use crate::metrics::{TENANT, TENANT_MANAGER as METRICS}; -use crate::task_mgr::{self, TaskKind}; +use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::config::{ AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig, }; @@ -225,26 +225,98 @@ async fn safe_rename_tenant_dir(path: impl AsRef) -> std::io::Result>); +enum BackgroundPurgesInner { + Open(tokio::task::JoinSet<()>), + // we use the async mutex for coalescing + ShuttingDown(Arc>>), +} - task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), - TaskKind::MgmtRequest, - 
task_tenant_id, - None, - "tenant_files_delete", - false, - async move { - fs::remove_dir_all(tmp_path.as_path()) - .await - .with_context(|| format!("tenant directory {:?} deletion", tmp_path)) - }, - ); +impl Default for BackgroundPurges { + fn default() -> Self { + Self(Arc::new(std::sync::Mutex::new( + BackgroundPurgesInner::Open(JoinSet::new()), + ))) + } +} + +impl BackgroundPurges { + /// When we have moved a tenant's content to a temporary directory, we may delete it lazily in + /// the background, and thereby avoid blocking any API requests on this deletion completing. + /// + /// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory. + /// Thus the [`BackgroundPurges`] type to keep track of these tasks. + pub fn spawn(&self, tmp_path: Utf8PathBuf) { + let mut guard = self.0.lock().unwrap(); + let jset = match &mut *guard { + BackgroundPurgesInner::Open(ref mut jset) => jset, + BackgroundPurgesInner::ShuttingDown(_) => { + warn!("trying to spawn background purge during shutdown, ignoring"); + return; + } + }; + jset.spawn_on( + async move { + if let Err(error) = fs::remove_dir_all(tmp_path.as_path()).await { + // should we fatal_io_error here? + warn!(%error, path=%tmp_path, "failed to purge tenant directory"); + } + } + .instrument(info_span!(parent: None, "background_purge")), + BACKGROUND_RUNTIME.handle(), + ); + } + + /// When this future completes, all background purges have completed. + /// The first poll of the future will already lock out new background purges spawned via [`Self::spawn`]. + /// + /// Concurrent calls will coalesce. + /// + /// # Cancellation-Safety + /// + /// If this future is dropped before polled to completion, concurrent and subsequent + /// instances of this future will continue to be correct. 
+ #[instrument(skip_all)] + pub async fn shutdown(&self) { + let jset = { + let mut guard = self.0.lock().unwrap(); + match &mut *guard { + BackgroundPurgesInner::Open(jset) => { + *guard = BackgroundPurgesInner::ShuttingDown(Arc::new(tokio::sync::Mutex::new( + std::mem::take(jset), + ))) + } + BackgroundPurgesInner::ShuttingDown(_) => { + // calling shutdown multiple times is most likely a bug in pageserver shutdown code + warn!("already shutting down"); + } + }; + match &mut *guard { + BackgroundPurgesInner::ShuttingDown(ref mut jset) => jset.clone(), + BackgroundPurgesInner::Open(_) => { + unreachable!("above code transitions into shut down state"); + } + } + }; + let mut jset = jset.lock().await; // concurrent callers coalesce here + while let Some(res) = jset.join_next().await { + match res { + Ok(()) => {} + Err(e) if e.is_panic() => { + // If it panicked, the error is already logged by the panic hook. + } + Err(e) if e.is_cancelled() => { + unreachable!("we don't cancel the joinset or runtime") + } + Err(e) => { + // No idea when this can happen, but let's log it. + warn!(%e, "background purge task failed or panicked"); + } + } + } + } } static TENANTS: Lazy> = @@ -270,6 +342,8 @@ pub struct TenantManager { // tenants have their own cancellation tokens, which we fire individually in [`Self::shutdown`], or // when the tenant detaches. 
cancel: CancellationToken, + + background_purges: BackgroundPurges, } fn emergency_generations( @@ -447,6 +521,7 @@ pub(crate) enum DeleteTenantError { #[instrument(skip_all)] pub async fn init_tenant_mgr( conf: &'static PageServerConf, + background_purges: BackgroundPurges, resources: TenantSharedResources, init_order: InitializationOrder, cancel: CancellationToken, @@ -512,7 +587,7 @@ pub async fn init_tenant_mgr( match safe_rename_tenant_dir(&tenant_dir_path).await { Ok(tmp_path) => { - spawn_background_purge(tmp_path); + background_purges.spawn(tmp_path); } Err(e) => { error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), @@ -634,6 +709,7 @@ pub async fn init_tenant_mgr( tenants: &TENANTS, resources, cancel: CancellationToken::new(), + background_purges, }) } @@ -1331,6 +1407,7 @@ impl TenantManager { tracing::info!("Remote storage already deleted"); } else { tracing::info!("Deleting {} keys from remote storage", keys.len()); + let keys = keys.into_iter().map(|o| o.key).collect::>(); self.resources .remote_storage .delete_objects(&keys, &self.cancel) @@ -1353,6 +1430,7 @@ impl TenantManager { async fn delete_local( conf: &PageServerConf, + background_purges: &BackgroundPurges, tenant_shard_id: &TenantShardId, ) -> anyhow::Result<()> { let local_tenant_directory = conf.tenant_path(tenant_shard_id); @@ -1361,7 +1439,7 @@ impl TenantManager { .with_context(|| { format!("local tenant directory {local_tenant_directory:?} rename") })?; - spawn_background_purge(tmp_dir); + background_purges.spawn(tmp_dir); Ok(()) } @@ -1379,12 +1457,12 @@ impl TenantManager { barrier.wait().await; } } - delete_local(self.conf, &tenant_shard_id).await?; + delete_local(self.conf, &self.background_purges, &tenant_shard_id).await?; } Some(TenantSlot::Secondary(secondary_tenant)) => { secondary_tenant.shutdown().await; - delete_local(self.conf, &tenant_shard_id).await?; + delete_local(self.conf, &self.background_purges, &tenant_shard_id).await?; } 
Some(TenantSlot::InProgress(_)) => unreachable!(), None => {} @@ -1655,7 +1733,7 @@ impl TenantManager { let tmp_path = safe_rename_tenant_dir(&local_tenant_directory) .await .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?; - spawn_background_purge(tmp_path); + self.background_purges.spawn(tmp_path); fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!( "failpoint" @@ -1831,7 +1909,7 @@ impl TenantManager { let tmp_path = self .detach_tenant0(conf, tenant_shard_id, deletion_queue_client) .await?; - spawn_background_purge(tmp_path); + self.background_purges.spawn(tmp_path); Ok(()) } diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index bb42fbeebf..fed666ca45 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -187,7 +187,7 @@ use camino::Utf8Path; use chrono::{NaiveDateTime, Utc}; pub(crate) use download::download_initdb_tar_zst; -use pageserver_api::models::AuxFilePolicy; +use pageserver_api::models::{AuxFilePolicy, TimelineArchivalState}; use pageserver_api::shard::{ShardIndex, TenantShardId}; use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; @@ -287,6 +287,14 @@ pub enum PersistIndexPartWithDeletedFlagError { Other(#[from] anyhow::Error), } +#[derive(Debug, thiserror::Error)] +pub enum WaitCompletionError { + #[error(transparent)] + NotInitialized(NotInitialized), + #[error("wait_completion aborted because upload queue was stopped")] + UploadQueueShutDownOrStopped, +} + /// A client for accessing a timeline's data in remote storage. /// /// This takes care of managing the number of connections, and balancing them @@ -449,6 +457,17 @@ impl RemoteTimelineClient { .unwrap_or(false) } + /// Returns whether the timeline is archived. + /// Return None if the remote index_part hasn't been downloaded yet. 
+ pub(crate) fn is_archived(&self) -> Option { + self.upload_queue + .lock() + .unwrap() + .initialized_mut() + .map(|q| q.clean.0.archived_at.is_some()) + .ok() + } + fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) { let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part { current_remote_index_part @@ -609,7 +628,7 @@ impl RemoteTimelineClient { Ok(()) } - /// Launch an index-file upload operation in the background, with only aux_file_policy flag updated. + /// Launch an index-file upload operation in the background, with only the `aux_file_policy` flag updated. pub(crate) fn schedule_index_upload_for_aux_file_policy_update( self: &Arc, last_aux_file_policy: Option, @@ -620,6 +639,48 @@ impl RemoteTimelineClient { self.schedule_index_upload(upload_queue)?; Ok(()) } + + /// Launch an index-file upload operation in the background, with only the `archived_at` field updated. + /// + /// Returns whether it is required to wait for the queue to be empty to ensure that the change is uploaded, + /// so either if the change is already sitting in the queue, but not commited yet, or the change has not + /// been in the queue yet. + pub(crate) fn schedule_index_upload_for_timeline_archival_state( + self: &Arc, + state: TimelineArchivalState, + ) -> anyhow::Result { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + + /// Returns Some(_) if a change is needed, and Some(true) if it's a + /// change needed to set archived_at. 
+ fn need_change( + archived_at: &Option, + state: TimelineArchivalState, + ) -> Option { + match (archived_at, state) { + (Some(_), TimelineArchivalState::Archived) + | (None, TimelineArchivalState::Unarchived) => { + // Nothing to do + tracing::info!("intended state matches present state"); + None + } + (None, TimelineArchivalState::Archived) => Some(true), + (Some(_), TimelineArchivalState::Unarchived) => Some(false), + } + } + let need_upload_scheduled = need_change(&upload_queue.dirty.archived_at, state); + + if let Some(archived_at_set) = need_upload_scheduled { + let intended_archived_at = archived_at_set.then(|| Utc::now().naive_utc()); + upload_queue.dirty.archived_at = intended_archived_at; + self.schedule_index_upload(upload_queue)?; + } + + let need_wait = need_change(&upload_queue.clean.0.archived_at, state).is_some(); + Ok(need_wait) + } + /// /// Launch an index-file upload operation in the background, if necessary. /// @@ -630,7 +691,7 @@ impl RemoteTimelineClient { /// /// Like schedule_index_upload_for_metadata_update(), this merely adds /// the upload to the upload queue and returns quickly. 
- pub fn schedule_index_upload_for_file_changes(self: &Arc) -> anyhow::Result<()> { + pub fn schedule_index_upload_for_file_changes(self: &Arc) -> Result<(), NotInitialized> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -645,7 +706,7 @@ impl RemoteTimelineClient { fn schedule_index_upload( self: &Arc, upload_queue: &mut UploadQueueInitialized, - ) -> anyhow::Result<()> { + ) -> Result<(), NotInitialized> { let disk_consistent_lsn = upload_queue.dirty.metadata.disk_consistent_lsn(); // fix up the duplicated field upload_queue.dirty.disk_consistent_lsn = disk_consistent_lsn; @@ -653,7 +714,7 @@ impl RemoteTimelineClient { // make sure it serializes before doing it in perform_upload_task so that it doesn't // look like a retryable error let void = std::io::sink(); - serde_json::to_writer(void, &upload_queue.dirty).context("serialize index_part.json")?; + serde_json::to_writer(void, &upload_queue.dirty).expect("serialize index_part.json"); let index_part = &upload_queue.dirty; @@ -699,7 +760,9 @@ impl RemoteTimelineClient { self.schedule_barrier0(upload_queue) }; - Self::wait_completion0(receiver).await + Self::wait_completion0(receiver) + .await + .context("wait completion") } /// Schedules uploading a new version of `index_part.json` with the given layers added, @@ -732,7 +795,9 @@ impl RemoteTimelineClient { barrier }; - Self::wait_completion0(barrier).await + Self::wait_completion0(barrier) + .await + .context("wait completion") } /// Launch an upload operation in the background; the file is added to be included in next @@ -740,7 +805,7 @@ impl RemoteTimelineClient { pub(crate) fn schedule_layer_file_upload( self: &Arc, layer: ResidentLayer, - ) -> anyhow::Result<()> { + ) -> Result<(), NotInitialized> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -826,7 +891,7 @@ impl RemoteTimelineClient { self: &Arc, upload_queue: &mut UploadQueueInitialized, names: I, - 
) -> anyhow::Result> + ) -> Result, NotInitialized> where I: IntoIterator, { @@ -952,7 +1017,7 @@ impl RemoteTimelineClient { self: &Arc, compacted_from: &[Layer], compacted_to: &[ResidentLayer], - ) -> anyhow::Result<()> { + ) -> Result<(), NotInitialized> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -969,10 +1034,12 @@ impl RemoteTimelineClient { } /// Wait for all previously scheduled uploads/deletions to complete - pub(crate) async fn wait_completion(self: &Arc) -> anyhow::Result<()> { + pub(crate) async fn wait_completion(self: &Arc) -> Result<(), WaitCompletionError> { let receiver = { let mut guard = self.upload_queue.lock().unwrap(); - let upload_queue = guard.initialized_mut()?; + let upload_queue = guard + .initialized_mut() + .map_err(WaitCompletionError::NotInitialized)?; self.schedule_barrier0(upload_queue) }; @@ -981,9 +1048,9 @@ impl RemoteTimelineClient { async fn wait_completion0( mut receiver: tokio::sync::watch::Receiver<()>, - ) -> anyhow::Result<()> { + ) -> Result<(), WaitCompletionError> { if receiver.changed().await.is_err() { - anyhow::bail!("wait_completion aborted because upload queue was stopped"); + return Err(WaitCompletionError::UploadQueueShutDownOrStopped); } Ok(()) @@ -1366,12 +1433,13 @@ impl RemoteTimelineClient { // marker via its deleted_at attribute let latest_index = remaining .iter() - .filter(|p| { - p.object_name() + .filter(|o| { + o.key + .object_name() .map(|n| n.starts_with(IndexPart::FILE_NAME)) .unwrap_or(false) }) - .filter_map(|path| parse_remote_index_path(path.clone()).map(|gen| (path, gen))) + .filter_map(|o| parse_remote_index_path(o.key.clone()).map(|gen| (o.key.clone(), gen))) .max_by_key(|i| i.1) .map(|i| i.0.clone()) .unwrap_or( @@ -1382,14 +1450,12 @@ impl RemoteTimelineClient { let remaining_layers: Vec = remaining .into_iter() - .filter(|p| { - if p == &latest_index { - return false; + .filter_map(|o| { + if o.key == latest_index || 
o.key.object_name() == Some(INITDB_PRESERVED_PATH) { + None + } else { + Some(o.key) } - if p.object_name() == Some(INITDB_PRESERVED_PATH) { - return false; - } - true }) .inspect(|path| { if let Some(name) = path.object_name() { @@ -1525,7 +1591,6 @@ impl RemoteTimelineClient { Some(self.tenant_shard_id), Some(self.timeline_id), "remote upload", - false, async move { self_rc.perform_upload_task(task).await; Ok(()) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index d0385e4aee..a17b32c983 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -295,10 +295,11 @@ where }; } - for key in listing.keys { - let object_name = key + for object in listing.keys { + let object_name = object + .key .object_name() - .ok_or_else(|| anyhow::anyhow!("object name for key {key}"))?; + .ok_or_else(|| anyhow::anyhow!("object name for key {}", object.key))?; other_prefixes.insert(object_name.to_string()); } @@ -459,7 +460,7 @@ pub(crate) async fn download_index_part( // is <= our own. See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md let max_previous_generation = indices .into_iter() - .filter_map(parse_remote_index_path) + .filter_map(|o| parse_remote_index_path(o.key)) .filter(|g| g <= &my_generation) .max(); diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index b439df8edb..3075df022e 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -32,6 +32,10 @@ pub struct IndexPart { #[serde(skip_serializing_if = "Option::is_none")] pub deleted_at: Option, + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub archived_at: Option, + /// Per layer file name metadata, which can be present for a present or missing layer file. 
/// /// Older versions of `IndexPart` will not have this property or have only a part of metadata @@ -80,10 +84,11 @@ impl IndexPart { /// - 5: lineage was added /// - 6: last_aux_file_policy is added. /// - 7: metadata_bytes is no longer written, but still read - const LATEST_VERSION: usize = 7; + /// - 8: added `archived_at` + const LATEST_VERSION: usize = 8; // Versions we may see when reading from a bucket. - pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7]; + pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8]; pub const FILE_NAME: &'static str = "index_part.json"; @@ -94,6 +99,7 @@ impl IndexPart { disk_consistent_lsn: metadata.disk_consistent_lsn(), metadata, deleted_at: None, + archived_at: None, lineage: Default::default(), last_aux_file_policy: None, } @@ -284,6 +290,7 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap()
, deleted_at: None, + archived_at: None, lineage: Lineage::default(), last_aux_file_policy: None, }; @@ -326,6 +333,7 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: None, + archived_at: None, lineage: Lineage::default(), last_aux_file_policy: None, }; @@ -369,6 +377,7 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: 
TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), + archived_at: None, lineage: Lineage::default(), last_aux_file_policy: None, }; @@ -415,6 +424,7 @@ mod tests { ]) .unwrap(), deleted_at: None, + archived_at: None, lineage: Lineage::default(), last_aux_file_policy: None, }; @@ -456,6 +466,7 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: 
TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), + archived_at: None, lineage: Lineage::default(), last_aux_file_policy: None, }; @@ -496,6 +507,7 @@ mod tests { disk_consistent_lsn: Lsn::from_str("0/15A7618").unwrap(), metadata: 
TimelineMetadata::from_bytes(&[226,88,25,241,0,46,0,4,0,0,0,0,1,90,118,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,78,244,32,0,0,0,0,1,78,244,32,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: None, + archived_at: None, lineage: Lineage { reparenting_history_truncated: false, reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()], @@ -545,6 +557,7 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: 
TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), + archived_at: None, lineage: Lineage { reparenting_history_truncated: false, reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()], @@ -603,6 +616,63 @@ mod tests { 14, ).with_recalculated_checksum().unwrap(), deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), + archived_at: None, + lineage: Default::default(), + last_aux_file_policy: Default::default(), + }; + + let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + + #[test] + fn v8_indexpart_is_parsed() { + let example = r#"{ + "version": 8, + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + 
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } + }, + "disk_consistent_lsn":"0/16960E8", + "metadata": { + "disk_consistent_lsn": "0/16960E8", + "prev_record_lsn": "0/1696070", + "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e", + "ancestor_lsn": "0/0", + "latest_gc_cutoff_lsn": "0/1696070", + "initdb_lsn": "0/1696070", + "pg_version": 14 + }, + "deleted_at": "2023-07-31T09:00:00.123", + "archived_at": "2023-04-29T09:00:00.123" + }"#; + + let expected = IndexPart { + version: 8, + layer_metadata: HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { + file_size: 25600000, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { + file_size: 9007199254741001, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata: TimelineMetadata::new( + Lsn::from_str("0/16960E8").unwrap(), + Some(Lsn::from_str("0/1696070").unwrap()), + Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()), + Lsn::INVALID, + Lsn::from_str("0/1696070").unwrap(), + Lsn::from_str("0/1696070").unwrap(), + 14, + ).with_recalculated_checksum().unwrap(), + deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), + archived_at: Some(parse_naive_datetime("2023-04-29T09:00:00.123000000")), lineage: Default::default(), last_aux_file_policy: Default::default(), }; diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index a233d11c4a..3132a28b12 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -31,6 +31,7 @@ use pageserver_api::{ }; use 
remote_storage::GenericRemoteStorage; +use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::instrument; use utils::{completion::Barrier, id::TimelineId, sync::gate::Gate}; @@ -293,15 +294,50 @@ impl SecondaryController { } } +pub struct GlobalTasks { + cancel: CancellationToken, + uploader: JoinHandle<()>, + downloader: JoinHandle<()>, +} + +impl GlobalTasks { + /// Caller is responsible for requesting shutdown via the cancellation token that was + /// passed to [`spawn_tasks`]. + /// + /// # Panics + /// + /// This method panics if that token is not cancelled. + /// This is low-risk because we're calling this during process shutdown, so, a panic + /// will be informative but not cause undue downtime. + pub async fn wait(self) { + let Self { + cancel, + uploader, + downloader, + } = self; + assert!( + cancel.is_cancelled(), + "must cancel cancellation token, otherwise the tasks will not shut down" + ); + + let (uploader, downloader) = futures::future::join(uploader, downloader).await; + uploader.expect( + "unreachable: exit_on_panic_or_error would catch the panic and exit the process", + ); + downloader.expect( + "unreachable: exit_on_panic_or_error would catch the panic and exit the process", + ); + } +} + pub fn spawn_tasks( tenant_manager: Arc, remote_storage: GenericRemoteStorage, background_jobs_can_start: Barrier, cancel: CancellationToken, -) -> SecondaryController { +) -> (SecondaryController, GlobalTasks) { let mgr_clone = tenant_manager.clone(); let storage_clone = remote_storage.clone(); - let cancel_clone = cancel.clone(); let bg_jobs_clone = background_jobs_can_start.clone(); let (download_req_tx, download_req_rx) = @@ -309,17 +345,9 @@ pub fn spawn_tasks( let (upload_req_tx, upload_req_rx) = tokio::sync::mpsc::channel::>(16); - let downloader_task_ctx = RequestContext::new( - TaskKind::SecondaryDownloads, - crate::context::DownloadBehavior::Download, - ); - task_mgr::spawn( - BACKGROUND_RUNTIME.handle(), - 
downloader_task_ctx.task_kind(), - None, - None, + let cancel_clone = cancel.clone(); + let downloader = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( "secondary tenant downloads", - false, async move { downloader_task( mgr_clone, @@ -327,49 +355,41 @@ pub fn spawn_tasks( download_req_rx, bg_jobs_clone, cancel_clone, - downloader_task_ctx, + RequestContext::new( + TaskKind::SecondaryDownloads, + crate::context::DownloadBehavior::Download, + ), ) .await; - - Ok(()) + anyhow::Ok(()) }, - ); + )); - task_mgr::spawn( - BACKGROUND_RUNTIME.handle(), - TaskKind::SecondaryUploads, - None, - None, + let cancel_clone = cancel.clone(); + let uploader = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( "heatmap uploads", - false, async move { heatmap_uploader_task( tenant_manager, remote_storage, upload_req_rx, background_jobs_can_start, - cancel, + cancel_clone, ) .await; - - Ok(()) + anyhow::Ok(()) }, - ); + )); - SecondaryController { - download_req_tx, - upload_req_tx, - } -} - -/// For running with remote storage disabled: a SecondaryController that is connected to nothing. 
-pub fn null_controller() -> SecondaryController { - let (download_req_tx, _download_req_rx) = - tokio::sync::mpsc::channel::>(16); - let (upload_req_tx, _upload_req_rx) = - tokio::sync::mpsc::channel::>(16); - SecondaryController { - upload_req_tx, - download_req_tx, - } + ( + SecondaryController { + upload_req_tx, + download_req_tx, + }, + GlobalTasks { + cancel, + uploader, + downloader, + }, + ) } diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index e4728ca8a8..41d558d3f6 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -264,10 +264,10 @@ pub(super) async fn gather_inputs( let mut lsns: Vec<(Lsn, LsnKind)> = gc_info .retain_lsns .iter() - .filter(|&&lsn| lsn > ancestor_lsn) + .filter(|(lsn, _child_id)| lsn > &ancestor_lsn) .copied() // this assumes there are no other retain_lsns than the branchpoints - .map(|lsn| (lsn, LsnKind::BranchPoint)) + .map(|(lsn, _child_id)| (lsn, LsnKind::BranchPoint)) .collect::>(); lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint))); diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index a389358f0d..f931341aca 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -10,29 +10,18 @@ pub mod merge_iterator; use crate::context::{AccessStatsBehavior, RequestContext}; use crate::repository::Value; -use crate::task_mgr::TaskKind; use crate::walrecord::NeonWalRecord; use bytes::Bytes; -use enum_map::EnumMap; -use enumset::EnumSet; -use once_cell::sync::Lazy; use pageserver_api::key::Key; use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; -use pageserver_api::models::{ - LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus, -}; -use std::borrow::Cow; use std::cmp::{Ordering, Reverse}; use std::collections::hash_map::Entry; use std::collections::{BinaryHeap, HashMap}; use std::ops::Range; -use std::sync::{Arc, Mutex}; +use 
std::sync::Arc; use std::time::{Duration, SystemTime, UNIX_EPOCH}; -use tracing::warn; -use utils::history_buffer::HistoryBufferWithDropCounter; -use utils::rate_limit::RateLimit; -use utils::{id::TimelineId, lsn::Lsn}; +use utils::lsn::Lsn; pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef}; pub use image_layer::{ImageLayer, ImageLayerWriter}; @@ -75,9 +64,9 @@ where /// call, to collect more records. /// #[derive(Debug, Default)] -pub struct ValueReconstructState { - pub records: Vec<(Lsn, NeonWalRecord)>, - pub img: Option<(Lsn, Bytes)>, +pub(crate) struct ValueReconstructState { + pub(crate) records: Vec<(Lsn, NeonWalRecord)>, + pub(crate) img: Option<(Lsn, Bytes)>, } #[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] @@ -458,94 +447,92 @@ pub enum ValueReconstructResult { Missing, } -#[derive(Debug)] -pub struct LayerAccessStats(Mutex); - -/// This struct holds two instances of [`LayerAccessStatsInner`]. -/// Accesses are recorded to both instances. -/// The `for_scraping_api`instance can be reset from the management API via [`LayerAccessStatsReset`]. -/// The `for_eviction_policy` is never reset. -#[derive(Debug, Default, Clone)] -struct LayerAccessStatsLocked { - for_scraping_api: LayerAccessStatsInner, - for_eviction_policy: LayerAccessStatsInner, +/// Layers contain a hint indicating whether they are likely to be used for reads. This is a hint rather +/// than an authoritative value, so that we do not have to update it synchronously when changing the visibility +/// of layers (for example when creating a branch that makes some previously covered layers visible). It should +/// be used for cache management but not for correctness-critical checks. 
+#[derive(Default, Debug, Clone, PartialEq, Eq)] +pub(crate) enum LayerVisibilityHint { + /// A Visible layer might be read while serving a read, because there is not an image layer between it + /// and a readable LSN (the tip of the branch or a child's branch point) + Visible, + /// A Covered layer probably won't be read right now, but _can_ be read in future if someone creates + /// a branch or ephemeral endpoint at an LSN below the layer that covers this. + #[allow(unused)] + Covered, + /// Calculating layer visibility requires I/O, so until this has happened layers are loaded + /// in this state. Note that newly written layers may be called Visible immediately; this uninitialized + /// state is for when existing layers are constructed while loading a timeline. + #[default] + Uninitialized, } -impl LayerAccessStatsLocked { - fn iter_mut(&mut self) -> impl Iterator { - [&mut self.for_scraping_api, &mut self.for_eviction_policy].into_iter() - } -} - -#[derive(Debug, Default, Clone)] -struct LayerAccessStatsInner { - first_access: Option, - count_by_access_kind: EnumMap, - task_kind_flag: EnumSet, - last_accesses: HistoryBufferWithDropCounter, - last_residence_changes: HistoryBufferWithDropCounter, -} - -#[derive(Debug, Clone, Copy)] -pub(crate) struct LayerAccessStatFullDetails { - pub(crate) when: SystemTime, - pub(crate) task_kind: TaskKind, - pub(crate) access_kind: LayerAccessKind, -} +pub(crate) struct LayerAccessStats(std::sync::atomic::AtomicU64); #[derive(Clone, Copy, strum_macros::EnumString)] -pub enum LayerAccessStatsReset { +pub(crate) enum LayerAccessStatsReset { NoReset, - JustTaskKindFlags, AllStats, } -fn system_time_to_millis_since_epoch(ts: &SystemTime) -> u64 { - ts.duration_since(UNIX_EPOCH) - .expect("better to die in this unlikely case than report false stats") - .as_millis() - .try_into() - .expect("64 bits is enough for few more years") -} +impl Default for LayerAccessStats { + fn default() -> Self { + // Default value is to assume resident
since creation time, and visible. + let (_mask, mut value) = Self::to_low_res_timestamp(Self::RTIME_SHIFT, SystemTime::now()); + value |= 0x1 << Self::VISIBILITY_SHIFT; -impl LayerAccessStatFullDetails { - fn as_api_model(&self) -> pageserver_api::models::LayerAccessStatFullDetails { - let Self { - when, - task_kind, - access_kind, - } = self; - pageserver_api::models::LayerAccessStatFullDetails { - when_millis_since_epoch: system_time_to_millis_since_epoch(when), - task_kind: Cow::Borrowed(task_kind.into()), // into static str, powered by strum_macros - access_kind: *access_kind, - } + Self(std::sync::atomic::AtomicU64::new(value)) } } +// Efficient store of two very-low-resolution timestamps and some bits. Used for storing last access time and +// last residence change time. impl LayerAccessStats { - /// Create an empty stats object. - /// - /// The caller is responsible for recording a residence event - /// using [`record_residence_event`] before calling `latest_activity`. - /// If they don't, [`latest_activity`] will return `None`. - /// - /// [`record_residence_event`]: Self::record_residence_event - /// [`latest_activity`]: Self::latest_activity - pub(crate) fn empty_will_record_residence_event_later() -> Self { - LayerAccessStats(Mutex::default()) + // How many high bits to drop from a u32 timestamp? + // - Only storing up to a u32 timestamp will work fine until 2038 (if this code is still in use + // after that, this software has been very successful!) + // - Dropping the top bit is implicitly safe because unix timestamps are meant to be + // stored in an i32, so they never used it. + // - Dropping the next two bits is safe because this code is only running on systems in + // years >= 2024, and these bits have been 1 since 2021 + // + // Therefore we may store only 29 bits for a timestamp with one second resolution. We do + // this truncation to make space for some flags in the high bits of our u64.
+ const TS_DROP_HIGH_BITS: u32 = u32::count_ones(Self::TS_ONES) + 1; + const TS_MASK: u32 = 0x1f_ff_ff_ff; + const TS_ONES: u32 = 0x60_00_00_00; + + const ATIME_SHIFT: u32 = 0; + const RTIME_SHIFT: u32 = 32 - Self::TS_DROP_HIGH_BITS; + const VISIBILITY_SHIFT: u32 = 64 - 2 * Self::TS_DROP_HIGH_BITS; + + fn write_bits(&self, mask: u64, value: u64) -> u64 { + self.0 + .fetch_update( + // TODO: decide what orderings are correct + std::sync::atomic::Ordering::Relaxed, + std::sync::atomic::Ordering::Relaxed, + |v| Some((v & !mask) | (value & mask)), + ) + .expect("Inner function is infallible") } - /// Create an empty stats object and record a [`LayerLoad`] event with the given residence status. - /// - /// See [`record_residence_event`] for why you need to do this while holding the layer map lock. - /// - /// [`LayerLoad`]: LayerResidenceEventReason::LayerLoad - /// [`record_residence_event`]: Self::record_residence_event - pub(crate) fn for_loading_layer(status: LayerResidenceStatus) -> Self { - let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default())); - new.record_residence_event(status, LayerResidenceEventReason::LayerLoad); - new + fn to_low_res_timestamp(shift: u32, time: SystemTime) -> (u64, u64) { + // Drop the high three bits of the u32 timestamp (see TS_MASK), keeping one-second resolution + let timestamp = time.duration_since(UNIX_EPOCH).unwrap().as_secs() & (Self::TS_MASK as u64); + + ((Self::TS_MASK as u64) << shift, timestamp << shift) + } + + fn read_low_res_timestamp(&self, shift: u32) -> Option { + let read = self.0.load(std::sync::atomic::Ordering::Relaxed); + + let ts_bits = (read & ((Self::TS_MASK as u64) << shift)) >> shift; + if ts_bits == 0 { + None + } else { + Some(UNIX_EPOCH + Duration::from_secs(ts_bits | (Self::TS_ONES as u64))) + } } /// Record a change in layer residency. @@ -561,117 +548,64 @@ impl LayerAccessStats { /// - Eviction: imitate access logical size calculation. This accesses the L0 layers because the L1 layer is not yet in the layer map.
/// - Compact: Grab layer map lock, add the new L1 to layer map and remove the L0s, release layer map lock. /// - Eviction: observes the new L1 layer whose only activity timestamp is the LayerCreate event. - /// - pub(crate) fn record_residence_event( - &self, - status: LayerResidenceStatus, - reason: LayerResidenceEventReason, - ) { - let mut locked = self.0.lock().unwrap(); - locked.iter_mut().for_each(|inner| { - inner - .last_residence_changes - .write(LayerResidenceEvent::new(status, reason)) - }); + pub(crate) fn record_residence_event_at(&self, now: SystemTime) { + let (mask, value) = Self::to_low_res_timestamp(Self::RTIME_SHIFT, now); + self.write_bits(mask, value); } - fn record_access(&self, access_kind: LayerAccessKind, ctx: &RequestContext) { + pub(crate) fn record_residence_event(&self) { + self.record_residence_event_at(SystemTime::now()) + } + + pub(crate) fn record_access_at(&self, now: SystemTime) { + let (mut mask, mut value) = Self::to_low_res_timestamp(Self::ATIME_SHIFT, now); + + // A layer which is accessed must be visible. 
+ mask |= 0x1 << Self::VISIBILITY_SHIFT; + value |= 0x1 << Self::VISIBILITY_SHIFT; + + self.write_bits(mask, value); + } + + pub(crate) fn record_access(&self, ctx: &RequestContext) { if ctx.access_stats_behavior() == AccessStatsBehavior::Skip { return; } - let this_access = LayerAccessStatFullDetails { - when: SystemTime::now(), - task_kind: ctx.task_kind(), - access_kind, - }; - - let mut locked = self.0.lock().unwrap(); - locked.iter_mut().for_each(|inner| { - inner.first_access.get_or_insert(this_access); - inner.count_by_access_kind[access_kind] += 1; - inner.task_kind_flag |= ctx.task_kind(); - inner.last_accesses.write(this_access); - }) + self.record_access_at(SystemTime::now()) } fn as_api_model( &self, reset: LayerAccessStatsReset, ) -> pageserver_api::models::LayerAccessStats { - let mut locked = self.0.lock().unwrap(); - let inner = &mut locked.for_scraping_api; - let LayerAccessStatsInner { - first_access, - count_by_access_kind, - task_kind_flag, - last_accesses, - last_residence_changes, - } = inner; let ret = pageserver_api::models::LayerAccessStats { - access_count_by_access_kind: count_by_access_kind - .iter() - .map(|(kind, count)| (kind, *count)) - .collect(), - task_kind_access_flag: task_kind_flag - .iter() - .map(|task_kind| Cow::Borrowed(task_kind.into())) // into static str, powered by strum_macros - .collect(), - first: first_access.as_ref().map(|a| a.as_api_model()), - accesses_history: last_accesses.map(|m| m.as_api_model()), - residence_events_history: last_residence_changes.clone(), + access_time: self + .read_low_res_timestamp(Self::ATIME_SHIFT) + .unwrap_or(UNIX_EPOCH), + residence_time: self + .read_low_res_timestamp(Self::RTIME_SHIFT) + .unwrap_or(UNIX_EPOCH), + visible: matches!(self.visibility(), LayerVisibilityHint::Visible), }; match reset { - LayerAccessStatsReset::NoReset => (), - LayerAccessStatsReset::JustTaskKindFlags => { - inner.task_kind_flag.clear(); - } + LayerAccessStatsReset::NoReset => {} 
LayerAccessStatsReset::AllStats => { - *inner = LayerAccessStatsInner::default(); + self.write_bits((Self::TS_MASK as u64) << Self::ATIME_SHIFT, 0x0); + self.write_bits((Self::TS_MASK as u64) << Self::RTIME_SHIFT, 0x0); } } ret } - /// Get the latest access timestamp, falling back to latest residence event, further falling - /// back to `SystemTime::now` for a usable timestamp for eviction. - pub(crate) fn latest_activity_or_now(&self) -> SystemTime { - self.latest_activity().unwrap_or_else(SystemTime::now) - } - - /// Get the latest access timestamp, falling back to latest residence event. - /// - /// This function can only return `None` if there has not yet been a call to the - /// [`record_residence_event`] method. That would generally be considered an - /// implementation error. This function logs a rate-limited warning in that case. - /// - /// TODO: use type system to avoid the need for `fallback`. - /// The approach in - /// could be used to enforce that a residence event is recorded - /// before a layer is added to the layer map. We could also have - /// a layer wrapper type that holds the LayerAccessStats, and ensure - /// that that type can only be produced by inserting into the layer map. 
- /// - /// [`record_residence_event`]: Self::record_residence_event - fn latest_activity(&self) -> Option { - let locked = self.0.lock().unwrap(); - let inner = &locked.for_eviction_policy; - match inner.last_accesses.recent() { - Some(a) => Some(a.when), - None => match inner.last_residence_changes.recent() { - Some(e) => Some(e.timestamp), - None => { - static WARN_RATE_LIMIT: Lazy> = - Lazy::new(|| Mutex::new((0, RateLimit::new(Duration::from_secs(10))))); - let mut guard = WARN_RATE_LIMIT.lock().unwrap(); - guard.0 += 1; - let occurences = guard.0; - guard.1.call(move || { - warn!(parent: None, occurences, "latest_activity not available, this is an implementation bug, using fallback value"); - }); - None - } - }, + /// Get the latest access timestamp, falling back to latest residence event. The latest residence event + /// will be this Layer's construction time, if its residence hasn't changed since then. + pub(crate) fn latest_activity(&self) -> SystemTime { + if let Some(t) = self.read_low_res_timestamp(Self::ATIME_SHIFT) { + t + } else { + self.read_low_res_timestamp(Self::RTIME_SHIFT) + .expect("Residence time is set on construction") } } @@ -680,30 +614,46 @@ impl LayerAccessStats { /// This indicates whether the layer has been used for some purpose that would motivate /// us to keep it on disk, such as for serving a getpage request. fn accessed(&self) -> bool { - let locked = self.0.lock().unwrap(); - let inner = &locked.for_eviction_policy; - // Consider it accessed if the most recent access is more recent than // the most recent change in residence status. 
match ( - inner.last_accesses.recent(), - inner.last_residence_changes.recent(), + self.read_low_res_timestamp(Self::ATIME_SHIFT), + self.read_low_res_timestamp(Self::RTIME_SHIFT), ) { (None, _) => false, (Some(_), None) => true, - (Some(a), Some(r)) => a.when >= r.timestamp, + (Some(a), Some(r)) => a >= r, + } + } + + pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) { + let value = match visibility { + LayerVisibilityHint::Visible => 0x1 << Self::VISIBILITY_SHIFT, + LayerVisibilityHint::Covered | LayerVisibilityHint::Uninitialized => 0x0, + }; + + self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value); + } + + pub(crate) fn visibility(&self) -> LayerVisibilityHint { + let read = self.0.load(std::sync::atomic::Ordering::Relaxed); + match (read >> Self::VISIBILITY_SHIFT) & 0x1 { + 1 => LayerVisibilityHint::Visible, + 0 => LayerVisibilityHint::Covered, + _ => unreachable!(), } } } /// Get a layer descriptor from a layer. -pub trait AsLayerDesc { +pub(crate) trait AsLayerDesc { /// Get the layer descriptor. 
fn layer_desc(&self) -> &PersistentLayerDesc; } pub mod tests { use pageserver_api::shard::TenantShardId; + use utils::id::TimelineId; use super::*; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 512e9e86fa..229d1e3608 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -52,7 +52,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use futures::StreamExt; use itertools::Itertools; use pageserver_api::keyspace::KeySpace; -use pageserver_api::models::{ImageCompressionAlgorithm, LayerAccessKind}; +use pageserver_api::models::ImageCompressionAlgorithm; use pageserver_api::shard::TenantShardId; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; @@ -265,7 +265,7 @@ impl DeltaLayer { return Ok(()); } - let inner = self.load(LayerAccessKind::Dump, ctx).await?; + let inner = self.load(ctx).await?; inner.dump(ctx).await } @@ -298,12 +298,8 @@ impl DeltaLayer { /// Open the underlying file and read the metadata into memory, if it's /// not loaded already. 
/// - async fn load( - &self, - access_kind: LayerAccessKind, - ctx: &RequestContext, - ) -> Result<&Arc> { - self.access_stats.record_access(access_kind, ctx); + async fn load(&self, ctx: &RequestContext) -> Result<&Arc> { + self.access_stats.record_access(ctx); // Quick exit if already loaded self.inner .get_or_try_init(|| self.load_inner(ctx)) @@ -311,12 +307,10 @@ impl DeltaLayer { .with_context(|| format!("Failed to load delta layer {}", self.path())) } - async fn load_inner(&self, ctx: &RequestContext) -> Result> { + async fn load_inner(&self, ctx: &RequestContext) -> anyhow::Result> { let path = self.path(); - let loaded = DeltaLayerInner::load(&path, None, None, ctx) - .await - .and_then(|res| res)?; + let loaded = DeltaLayerInner::load(&path, None, None, ctx).await?; // not production code let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap(); @@ -356,7 +350,7 @@ impl DeltaLayer { summary.lsn_range, metadata.len(), ), - access_stats: LayerAccessStats::empty_will_record_residence_event_later(), + access_stats: Default::default(), inner: OnceCell::new(), }) } @@ -460,7 +454,12 @@ impl DeltaLayerWriterInner { will_init: bool, ctx: &RequestContext, ) -> (Vec, anyhow::Result<()>) { - assert!(self.lsn_range.start <= lsn); + assert!( + self.lsn_range.start <= lsn, + "lsn_start={}, lsn={}", + self.lsn_range.start, + lsn + ); // We don't want to use compression in delta layer creation let compression = ImageCompressionAlgorithm::Disabled; let (val, res) = self @@ -759,27 +758,24 @@ impl DeltaLayerInner { &self.layer_lsn_range } - /// Returns nested result following Result, Critical>: - /// - inner has the success or transient failure - /// - outer has the permanent failure pub(super) async fn load( path: &Utf8Path, summary: Option, max_vectored_read_bytes: Option, ctx: &RequestContext, - ) -> Result, anyhow::Error> { - let file = match VirtualFile::open(path, ctx).await { - Ok(file) => file, - Err(e) => return 
Ok(Err(anyhow::Error::new(e).context("open layer file"))), - }; + ) -> anyhow::Result { + let file = VirtualFile::open(path, ctx) + .await + .context("open layer file")?; + let file_id = page_cache::next_file_id(); let block_reader = FileBlockReader::new(&file, file_id); - let summary_blk = match block_reader.read_blk(0, ctx).await { - Ok(blk) => blk, - Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))), - }; + let summary_blk = block_reader + .read_blk(0, ctx) + .await + .context("read first block")?; // TODO: this should be an assertion instead; see ImageLayerInner::load let actual_summary = @@ -801,7 +797,7 @@ impl DeltaLayerInner { } } - Ok(Ok(DeltaLayerInner { + Ok(DeltaLayerInner { file, file_id, index_start_blk: actual_summary.index_start_blk, @@ -809,7 +805,7 @@ impl DeltaLayerInner { max_vectored_read_bytes, layer_key_range: actual_summary.key_range, layer_lsn_range: actual_summary.lsn_range, - })) + }) } pub(super) async fn get_value_reconstruct_data( diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 19e4e9e2e9..44ba685490 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -49,7 +49,6 @@ use camino::{Utf8Path, Utf8PathBuf}; use hex; use itertools::Itertools; use pageserver_api::keyspace::KeySpace; -use pageserver_api::models::LayerAccessKind; use pageserver_api::shard::{ShardIdentity, TenantShardId}; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; @@ -228,7 +227,7 @@ impl ImageLayer { return Ok(()); } - let inner = self.load(LayerAccessKind::Dump, ctx).await?; + let inner = self.load(ctx).await?; inner.dump(ctx).await?; @@ -255,12 +254,8 @@ impl ImageLayer { /// Open the underlying file and read the metadata into memory, if it's /// not loaded already. 
/// - async fn load( - &self, - access_kind: LayerAccessKind, - ctx: &RequestContext, - ) -> Result<&ImageLayerInner> { - self.access_stats.record_access(access_kind, ctx); + async fn load(&self, ctx: &RequestContext) -> Result<&ImageLayerInner> { + self.access_stats.record_access(ctx); self.inner .get_or_try_init(|| self.load_inner(ctx)) .await @@ -270,9 +265,8 @@ impl ImageLayer { async fn load_inner(&self, ctx: &RequestContext) -> Result { let path = self.path(); - let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx) - .await - .and_then(|res| res)?; + let loaded = + ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx).await?; // not production code let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap(); @@ -312,7 +306,7 @@ impl ImageLayer { metadata.len(), ), // Now we assume image layer ALWAYS covers the full range. This may change in the future. lsn: summary.lsn, - access_stats: LayerAccessStats::empty_will_record_residence_event_later(), + access_stats: Default::default(), inner: OnceCell::new(), }) } @@ -390,17 +384,16 @@ impl ImageLayerInner { summary: Option, max_vectored_read_bytes: Option, ctx: &RequestContext, - ) -> Result, anyhow::Error> { - let file = match VirtualFile::open(path, ctx).await { - Ok(file) => file, - Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))), - }; + ) -> anyhow::Result { + let file = VirtualFile::open(path, ctx) + .await + .context("open layer file")?; let file_id = page_cache::next_file_id(); let block_reader = FileBlockReader::new(&file, file_id); - let summary_blk = match block_reader.read_blk(0, ctx).await { - Ok(blk) => blk, - Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))), - }; + let summary_blk = block_reader + .read_blk(0, ctx) + .await + .context("read first block")?; // length is the only way how this could fail, so it's not actually likely at all unless // read_blk returns wrong 
sized block. @@ -425,7 +418,7 @@ impl ImageLayerInner { } } - Ok(Ok(ImageLayerInner { + Ok(ImageLayerInner { index_start_blk: actual_summary.index_start_blk, index_root_blk: actual_summary.index_root_blk, lsn, @@ -433,7 +426,7 @@ impl ImageLayerInner { file_id, max_vectored_read_bytes, key_range: actual_summary.key_range, - })) + }) } pub(super) async fn get_value_reconstruct_data( diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 5941a52e98..f9010ae8a6 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -18,7 +18,7 @@ use anyhow::{anyhow, ensure, Result}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; -use std::collections::{BTreeMap, BinaryHeap, HashSet}; +use std::collections::BTreeMap; use std::sync::{Arc, OnceLock}; use std::time::Instant; use tracing::*; @@ -375,15 +375,6 @@ impl InMemoryLayer { let inner = self.inner.read().await; let reader = inner.file.block_cursor(); - #[derive(Eq, PartialEq, Ord, PartialOrd)] - struct BlockRead { - key: Key, - lsn: Lsn, - block_offset: u64, - } - - let mut planned_block_reads = BinaryHeap::new(); - for range in keyspace.ranges.iter() { for (key, vec_map) in inner.index.range(range.start..range.end) { let lsn_range = match reconstruct_state.get_cached_lsn(key) { @@ -392,49 +383,32 @@ impl InMemoryLayer { }; let slice = vec_map.slice_range(lsn_range); + for (entry_lsn, pos) in slice.iter().rev() { - planned_block_reads.push(BlockRead { - key: *key, - lsn: *entry_lsn, - block_offset: *pos, - }); + // TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183 + let buf = reader.read_blob(*pos, &ctx).await; + if let Err(e) = buf { + reconstruct_state + .on_key_error(*key, PageReconstructError::from(anyhow!(e))); + break; + } + + let value = Value::des(&buf.unwrap()); 
+ if let Err(e) = value { + reconstruct_state + .on_key_error(*key, PageReconstructError::from(anyhow!(e))); + break; + } + + let key_situation = + reconstruct_state.update_key(key, *entry_lsn, value.unwrap()); + if key_situation == ValueReconstructSituation::Complete { + break; + } } } } - let keyspace_size = keyspace.total_raw_size(); - - let mut completed_keys = HashSet::new(); - while completed_keys.len() < keyspace_size && !planned_block_reads.is_empty() { - let block_read = planned_block_reads.pop().unwrap(); - if completed_keys.contains(&block_read.key) { - continue; - } - - // TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183 - let buf = reader.read_blob(block_read.block_offset, &ctx).await; - if let Err(e) = buf { - reconstruct_state - .on_key_error(block_read.key, PageReconstructError::from(anyhow!(e))); - completed_keys.insert(block_read.key); - continue; - } - - let value = Value::des(&buf.unwrap()); - if let Err(e) = value { - reconstruct_state - .on_key_error(block_read.key, PageReconstructError::from(anyhow!(e))); - completed_keys.insert(block_read.key); - continue; - } - - let key_situation = - reconstruct_state.update_key(&block_read.key, block_read.lsn, value.unwrap()); - if key_situation == ValueReconstructSituation::Complete { - completed_keys.insert(block_read.key); - } - } - reconstruct_state.on_lsn_advanced(&keyspace, self.start_lsn); Ok(()) diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index d9cbaba529..1075feb1d1 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1,9 +1,7 @@ use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::keyspace::KeySpace; -use pageserver_api::models::{ - HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus, -}; +use pageserver_api::models::HistoricLayerInfo; use pageserver_api::shard::{ShardIdentity, 
ShardIndex, TenantShardId}; use std::ops::Range; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; @@ -19,7 +17,7 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::repository::Key; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::task_mgr::TaskKind; -use crate::tenant::timeline::GetVectoredError; +use crate::tenant::timeline::{CompactionError, GetVectoredError}; use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline}; use super::delta_layer::{self, DeltaEntry}; @@ -160,13 +158,10 @@ impl Layer { metadata.file_size, ); - let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted); - let owner = Layer(Arc::new(LayerInner::new( conf, timeline, local_path, - access_stats, desc, None, metadata.generation, @@ -193,8 +188,6 @@ impl Layer { metadata.file_size, ); - let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident); - let mut resident = None; let owner = Layer(Arc::new_cyclic(|owner| { @@ -209,7 +202,6 @@ impl Layer { conf, timeline, local_path, - access_stats, desc, Some(inner), metadata.generation, @@ -245,11 +237,6 @@ impl Layer { version: 0, }); resident = Some(inner.clone()); - let access_stats = LayerAccessStats::empty_will_record_residence_event_later(); - access_stats.record_residence_event( - LayerResidenceStatus::Resident, - LayerResidenceEventReason::LayerCreate, - ); let local_path = local_layer_path( conf, @@ -259,16 +246,22 @@ impl Layer { &timeline.generation, ); - LayerInner::new( + let layer = LayerInner::new( conf, timeline, local_path, - access_stats, desc, Some(inner), timeline.generation, timeline.get_shard_index(), - ) + ); + + // Newly created layers are marked visible by default: the usual case is that they were created to be read. 
+ layer + .access_stats + .set_visibility(super::LayerVisibilityHint::Visible); + + layer })); let downloaded = resident.expect("just initialized"); @@ -332,9 +325,7 @@ impl Layer { use anyhow::ensure; let layer = self.0.get_or_maybe_download(true, Some(ctx)).await?; - self.0 - .access_stats - .record_access(LayerAccessKind::GetValueReconstructData, ctx); + self.0.access_stats.record_access(ctx); if self.layer_desc().is_delta { ensure!(lsn_range.start >= self.layer_desc().lsn_range.start); @@ -368,9 +359,7 @@ impl Layer { other => GetVectoredError::Other(anyhow::anyhow!(other)), })?; - self.0 - .access_stats - .record_access(LayerAccessKind::GetValueReconstructData, ctx); + self.0.access_stats.record_access(ctx); layer .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx) @@ -437,7 +426,7 @@ impl Layer { } /// Downloads if necessary and creates a guard, which will keep this layer from being evicted. - pub(crate) async fn download_and_keep_resident(&self) -> anyhow::Result { + pub(crate) async fn download_and_keep_resident(&self) -> Result { let downloaded = self.0.get_or_maybe_download(true, None).await?; Ok(ResidentLayer { @@ -786,7 +775,6 @@ impl LayerInner { conf: &'static PageServerConf, timeline: &Arc, local_path: Utf8PathBuf, - access_stats: LayerAccessStats, desc: PersistentLayerDesc, downloaded: Option>, generation: Generation, @@ -821,7 +809,7 @@ impl LayerInner { path: local_path, desc, timeline: Arc::downgrade(timeline), - access_stats, + access_stats: Default::default(), wanted_deleted: AtomicBool::new(false), inner, version: AtomicUsize::new(version), @@ -1176,10 +1164,7 @@ impl LayerInner { LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction); } - self.access_stats.record_residence_event( - LayerResidenceStatus::Resident, - LayerResidenceEventReason::ResidenceChange, - ); + self.access_stats.record_residence_event(); Ok(self.initialize_after_layer_is_on_disk(permit)) } @@ -1298,7 +1283,7 @@ impl LayerInner { 
lsn_end: lsn_range.end, remote: !resident, access_stats, - l0: crate::tenant::layer_map::LayerMap::is_l0(self.layer_desc()), + l0: crate::tenant::layer_map::LayerMap::is_l0(&self.layer_desc().key_range), } } else { let lsn = self.desc.image_layer_lsn(); @@ -1533,10 +1518,7 @@ impl LayerInner { } } - self.access_stats.record_residence_event( - LayerResidenceStatus::Evicted, - LayerResidenceEventReason::ResidenceChange, - ); + self.access_stats.record_residence_event(); self.status.as_ref().unwrap().send_replace(Status::Evicted); @@ -1669,8 +1651,9 @@ impl Drop for DownloadedLayer { } impl DownloadedLayer { - /// Initializes the `DeltaLayerInner` or `ImageLayerInner` within [`LayerKind`], or fails to - /// initialize it permanently. + /// Initializes the `DeltaLayerInner` or `ImageLayerInner` within [`LayerKind`]. + /// Failure to load the layer is sticky, i.e., future `get()` calls will return + /// the initial load failure immediately. /// /// `owner` parameter is a strong reference at the same `LayerInner` as the /// `DownloadedLayer::owner` would be when upgraded. 
Given how this method ends up called, @@ -1701,7 +1684,7 @@ impl DownloadedLayer { ctx, ) .await - .map(|res| res.map(LayerKind::Delta)) + .map(LayerKind::Delta) } else { let lsn = owner.desc.image_layer_lsn(); let summary = Some(image_layer::Summary::expected( @@ -1718,32 +1701,29 @@ impl DownloadedLayer { ctx, ) .await - .map(|res| res.map(LayerKind::Image)) + .map(LayerKind::Image) }; match res { - Ok(Ok(layer)) => Ok(Ok(layer)), - Ok(Err(transient)) => Err(transient), - Err(permanent) => { + Ok(layer) => Ok(layer), + Err(err) => { LAYER_IMPL_METRICS.inc_permanent_loading_failures(); - // TODO(#5815): we are not logging all errors, so temporarily log them **once** - // here as well - let permanent = permanent.context("load layer"); - tracing::error!("layer loading failed permanently: {permanent:#}"); - Ok(Err(permanent)) + // We log this message once over the lifetime of `Self` + // => Ok and good to log backtrace and path here. + tracing::error!( + "layer load failed, assuming permanent failure: {}: {err:?}", + owner.path + ); + Err(err) } } }; self.kind - .get_or_try_init(init) - // return transient errors using `?` - .await? + .get_or_init(init) + .await .as_ref() - .map_err(|e| { - // errors are not clonabled, cannot but stringify - // test_broken_timeline matches this string - anyhow::anyhow!("layer loading failed: {e:#}") - }) + // We already logged the full backtrace above, once. Don't repeat that here. + .map_err(|e| anyhow::anyhow!("layer load failed earlier: {e}")) } async fn get_value_reconstruct_data( @@ -1778,7 +1758,11 @@ impl DownloadedLayer { ) -> Result<(), GetVectoredError> { use LayerKind::*; - match self.get(owner, ctx).await.map_err(GetVectoredError::from)? { + match self + .get(owner, ctx) + .await + .map_err(GetVectoredError::Other)? 
+ { Delta(d) => { d.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, ctx) .await @@ -1862,9 +1846,7 @@ impl ResidentLayer { // this is valid because the DownloadedLayer::kind is a OnceCell, not a // Mutex, so we cannot go and deinitialize the value with OnceCell::take // while it's being held. - owner - .access_stats - .record_access(LayerAccessKind::KeyIter, ctx); + owner.access_stats.record_access(ctx); delta_layer::DeltaLayerInner::load_keys(d, ctx) .await @@ -1882,12 +1864,24 @@ impl ResidentLayer { shard_identity: &ShardIdentity, writer: &mut ImageLayerWriter, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { use LayerKind::*; - match self.downloaded.get(&self.owner.0, ctx).await? { - Delta(_) => anyhow::bail!(format!("cannot filter() on a delta layer {self}")), - Image(i) => i.filter(shard_identity, writer, ctx).await, + match self + .downloaded + .get(&self.owner.0, ctx) + .await + .map_err(CompactionError::Other)? + { + Delta(_) => { + return Err(CompactionError::Other(anyhow::anyhow!(format!( + "cannot filter() on a delta layer {self}" + )))); + } + Image(i) => i + .filter(shard_identity, writer, ctx) + .await + .map_err(CompactionError::Other), } } diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 8a3737f8a7..423cde001c 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -1,3 +1,5 @@ +use std::time::UNIX_EPOCH; + use pageserver_api::key::CONTROLFILE_KEY; use tokio::task::JoinSet; use utils::{ @@ -7,7 +9,7 @@ use utils::{ use super::failpoints::{Failpoint, FailpointKind}; use super::*; -use crate::context::DownloadBehavior; +use crate::{context::DownloadBehavior, tenant::storage_layer::LayerVisibilityHint}; use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness}; /// Used in tests to advance a future to wanted await point, and not futher. 
@@ -826,9 +828,9 @@ async fn eviction_cancellation_on_drop() { #[test] #[cfg(target_arch = "x86_64")] fn layer_size() { - assert_eq!(std::mem::size_of::(), 2040); - assert_eq!(std::mem::size_of::(), 104); - assert_eq!(std::mem::size_of::(), 2344); + assert_eq!(size_of::(), 8); + assert_eq!(size_of::(), 104); + assert_eq!(size_of::(), 312); // it also has the utf8 path } @@ -968,3 +970,46 @@ fn spawn_blocking_pool_helper_actually_works() { println!("joined"); }); } + +/// Drop the low bits from a time, to emulate the precision loss in LayerAccessStats +fn lowres_time(hires: SystemTime) -> SystemTime { + let ts = hires.duration_since(UNIX_EPOCH).unwrap().as_secs(); + UNIX_EPOCH + Duration::from_secs(ts) +} + +#[test] +fn access_stats() { + let access_stats = LayerAccessStats::default(); + // Default is visible + assert_eq!(access_stats.visibility(), LayerVisibilityHint::Visible); + + access_stats.set_visibility(LayerVisibilityHint::Covered); + assert_eq!(access_stats.visibility(), LayerVisibilityHint::Covered); + access_stats.set_visibility(LayerVisibilityHint::Visible); + assert_eq!(access_stats.visibility(), LayerVisibilityHint::Visible); + + let rtime = UNIX_EPOCH + Duration::from_secs(2000000000); + access_stats.record_residence_event_at(rtime); + assert_eq!(access_stats.latest_activity(), lowres_time(rtime)); + + let atime = UNIX_EPOCH + Duration::from_secs(2100000000); + access_stats.record_access_at(atime); + assert_eq!(access_stats.latest_activity(), lowres_time(atime)); + + // Setting visibility doesn't clobber access time + access_stats.set_visibility(LayerVisibilityHint::Covered); + assert_eq!(access_stats.latest_activity(), lowres_time(atime)); + access_stats.set_visibility(LayerVisibilityHint::Visible); + assert_eq!(access_stats.latest_activity(), lowres_time(atime)); +} + +#[test] +fn access_stats_2038() { + // The access stats structure uses a timestamp representation that will run out + // of bits in 2038. 
One year before that, this unit test will start failing. + + let one_year_from_now = SystemTime::now().duration_since(UNIX_EPOCH).unwrap() + + Duration::from_secs(3600 * 24 * 365); + + assert!(one_year_from_now.as_secs() < (2 << 31)); +} diff --git a/pageserver/src/tenant/storage_layer/layer_name.rs b/pageserver/src/tenant/storage_layer/layer_name.rs index da26e1eeb7..f33ca076ab 100644 --- a/pageserver/src/tenant/storage_layer/layer_name.rs +++ b/pageserver/src/tenant/storage_layer/layer_name.rs @@ -248,6 +248,14 @@ impl LayerName { Image(_) => "image", } } + + /// Gets the key range encoded in the layer name. + pub fn key_range(&self) -> &Range { + match &self { + LayerName::Image(layer) => &layer.key_range, + LayerName::Delta(layer) => &layer.key_range, + } + } } impl fmt::Display for LayerName { diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index eb4a1f28a1..b4bd976033 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -204,9 +204,11 @@ impl<'a> IteratorWrapper<'a> { /// A merge iterator over delta/image layer iterators. When duplicated records are /// found, the iterator will not perform any deduplication, and the caller should handle /// these situation. By saying duplicated records, there are many possibilities: +/// /// * Two same delta at the same LSN. /// * Two same image at the same LSN. /// * Delta/image at the same LSN where the image has already applied the delta. +/// /// The iterator will always put the image before the delta. 
pub struct MergeIterator<'a> { heap: BinaryHeap>, diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index d679b78f32..230362d81a 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -101,7 +101,6 @@ pub fn start_background_loops( Some(tenant_shard_id), None, &format!("compactor for tenant {tenant_shard_id}"), - false, { let tenant = Arc::clone(tenant); let background_jobs_can_start = background_jobs_can_start.cloned(); @@ -125,7 +124,6 @@ pub fn start_background_loops( Some(tenant_shard_id), None, &format!("garbage collector for tenant {tenant_shard_id}"), - false, { let tenant = Arc::clone(tenant); let background_jobs_can_start = background_jobs_can_start.cloned(); @@ -149,7 +147,6 @@ pub fn start_background_loops( Some(tenant_shard_id), None, &format!("ingest housekeeping for tenant {tenant_shard_id}"), - false, { let tenant = Arc::clone(tenant); let background_jobs_can_start = background_jobs_can_start.cloned(); @@ -213,24 +210,28 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { Duration::from_secs(10) } else { // Run compaction - if let Err(e) = tenant.compaction_iteration(&cancel, &ctx).await { - let wait_duration = backoff::exponential_backoff_duration_seconds( - error_run_count + 1, - 1.0, - MAX_BACKOFF_SECS, - ); - error_run_count += 1; - let wait_duration = Duration::from_secs_f64(wait_duration); - log_compaction_error( - &e, - error_run_count, - &wait_duration, - cancel.is_cancelled(), - ); - wait_duration - } else { - error_run_count = 0; - period + match tenant.compaction_iteration(&cancel, &ctx).await { + Err(e) => { + let wait_duration = backoff::exponential_backoff_duration_seconds( + error_run_count + 1, + 1.0, + MAX_BACKOFF_SECS, + ); + error_run_count += 1; + let wait_duration = Duration::from_secs_f64(wait_duration); + log_compaction_error( + &e, + error_run_count, + &wait_duration, + cancel.is_cancelled(), + ); + wait_duration + } + Ok(has_pending_task) => { + 
error_run_count = 0; + // schedule the next compaction immediately in case there is a pending compaction task + if has_pending_task { Duration::from_secs(0) } else { period } + } } }; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 19b1396981..862ca42188 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1,5 +1,5 @@ pub(crate) mod analysis; -mod compaction; +pub(crate) mod compaction; pub mod delete; pub(crate) mod detach_ancestor; mod eviction_task; @@ -137,7 +137,7 @@ use self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; -use super::config::TenantConf; +use super::{config::TenantConf, upload_queue::NotInitialized}; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer}; @@ -460,7 +460,7 @@ pub(crate) struct GcInfo { /// Currently, this includes all points where child branches have /// been forked off from. In the future, could also include /// explicit user-defined snapshot points. - pub(crate) retain_lsns: Vec, + pub(crate) retain_lsns: Vec<(Lsn, TimelineId)>, /// The cutoff coordinates, which are combined by selecting the minimum. pub(crate) cutoffs: GcCutoffs, @@ -476,12 +476,21 @@ impl GcInfo { pub(crate) fn min_cutoff(&self) -> Lsn { self.cutoffs.select_min() } + + pub(super) fn insert_child(&mut self, child_id: TimelineId, child_lsn: Lsn) { + self.retain_lsns.push((child_lsn, child_id)); + self.retain_lsns.sort_by_key(|i| i.0); + } + + pub(super) fn remove_child(&mut self, child_id: TimelineId) { + self.retain_lsns.retain(|i| i.1 != child_id); + } } /// The `GcInfo` component describing which Lsns need to be retained. 
Functionally, this /// is a single number (the oldest LSN which we must retain), but it internally distinguishes /// between time-based and space-based retention for observability and consumption metrics purposes. -#[derive(Debug)] +#[derive(Debug, Clone)] pub(crate) struct GcCutoffs { /// Calculated from the [`TenantConf::gc_horizon`], this LSN indicates how much /// history we must keep to retain a specified number of bytes of WAL. @@ -633,7 +642,13 @@ impl FlushLayerError { // When crossing from generic anyhow errors to this error type, we explicitly check // for timeline cancellation to avoid logging inoffensive shutdown errors as warn/err. fn from_anyhow(timeline: &Timeline, err: anyhow::Error) -> Self { - if timeline.cancel.is_cancelled() { + let cancelled = timeline.cancel.is_cancelled() + // The upload queue might have been shut down before the official cancellation of the timeline. + || err + .downcast_ref::() + .map(NotInitialized::is_stopping) + .unwrap_or_default(); + if cancelled { Self::Cancelled } else { Self::Other(Arc::new(err)) @@ -1754,13 +1769,14 @@ impl Timeline { } } - /// Outermost timeline compaction operation; downloads needed layers. + /// Outermost timeline compaction operation; downloads needed layers. Returns whether we have pending + /// compaction tasks. pub(crate) async fn compact( self: &Arc, cancel: &CancellationToken, flags: EnumSet, ctx: &RequestContext, - ) -> Result<(), CompactionError> { + ) -> Result { // most likely the cancellation token is from background task, but in tests it could be the // request task as well. @@ -1780,8 +1796,8 @@ impl Timeline { // compaction task goes over it's period (20s) which is quite often in production. let (_guard, _permit) = tokio::select! 
{ tuple = prepare => { tuple }, - _ = self.cancel.cancelled() => return Ok(()), - _ = cancel.cancelled() => return Ok(()), + _ = self.cancel.cancelled() => return Ok(false), + _ = cancel.cancelled() => return Ok(false), }; let last_record_lsn = self.get_last_record_lsn(); @@ -1789,11 +1805,14 @@ impl Timeline { // Last record Lsn could be zero in case the timeline was just created if !last_record_lsn.is_valid() { warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}"); - return Ok(()); + return Ok(false); } match self.get_compaction_algorithm_settings().kind { - CompactionAlgorithm::Tiered => self.compact_tiered(cancel, ctx).await, + CompactionAlgorithm::Tiered => { + self.compact_tiered(cancel, ctx).await?; + Ok(false) + } CompactionAlgorithm::Legacy => self.compact_legacy(cancel, flags, ctx).await, } } @@ -1982,6 +2001,11 @@ impl Timeline { self.current_state() == TimelineState::Active } + #[allow(unused)] + pub(crate) fn is_archived(&self) -> Option { + self.remote_client.is_archived() + } + pub(crate) fn is_stopping(&self) -> bool { self.current_state() == TimelineState::Stopping } @@ -2307,6 +2331,11 @@ impl Timeline { ) }; + if let Some(ancestor) = &ancestor { + let mut ancestor_gc_info = ancestor.gc_info.write().unwrap(); + ancestor_gc_info.insert_child(timeline_id, metadata.ancestor_lsn()); + } + Arc::new_cyclic(|myself| { let metrics = TimelineMetrics::new( &tenant_shard_id, @@ -2477,7 +2506,6 @@ impl Timeline { Some(self.tenant_shard_id), Some(self.timeline_id), "layer flush task", - false, async move { let _guard = guard; let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error); @@ -2822,7 +2850,6 @@ impl Timeline { Some(self.tenant_shard_id), Some(self.timeline_id), "initial size calculation", - false, // NB: don't log errors here, task_mgr will do that. 
async move { let cancel = task_mgr::shutdown_token(); @@ -2991,7 +3018,6 @@ impl Timeline { Some(self.tenant_shard_id), Some(self.timeline_id), "ondemand logical size calculation", - false, async move { let res = self_clone .logical_size_calculation_task(lsn, cause, &ctx) @@ -3158,7 +3184,7 @@ impl Timeline { let guard = self.layers.read().await; let resident = guard.likely_resident_layers().map(|layer| { - let last_activity_ts = layer.access_stats().latest_activity_or_now(); + let last_activity_ts = layer.access_stats().latest_activity(); HeatMapLayer::new( layer.layer_desc().layer_name(), @@ -3404,7 +3430,6 @@ impl Timeline { } } - #[allow(unknown_lints)] // doc_lazy_continuation is still a new lint #[allow(clippy::doc_lazy_continuation)] /// Get the data needed to reconstruct all keys in the provided keyspace /// @@ -4756,6 +4781,18 @@ impl Timeline { } } +impl Drop for Timeline { + fn drop(&mut self) { + if let Some(ancestor) = &self.ancestor_timeline { + // This lock should never be poisoned, but in case it is we do a .map() instead of + // an unwrap(), to avoid panicking in a destructor and thereby aborting the process. + if let Ok(mut gc_info) = ancestor.gc_info.write() { + gc_info.remove_child(self.timeline_id) + } + } + } +} + /// Top-level failure to compact. #[derive(Debug, thiserror::Error)] pub(crate) enum CompactionError { @@ -4763,7 +4800,7 @@ pub(crate) enum CompactionError { ShuttingDown, /// Compaction cannot be done right now; page reconstruction and so on. 
#[error(transparent)] - Other(#[from] anyhow::Error), + Other(anyhow::Error), } impl From for CompactionError { @@ -4778,6 +4815,38 @@ impl From for CompactionError { } } +impl From for CompactionError { + fn from(value: super::upload_queue::NotInitialized) -> Self { + match value { + super::upload_queue::NotInitialized::Uninitialized + | super::upload_queue::NotInitialized::Stopped => { + CompactionError::Other(anyhow::anyhow!(value)) + } + super::upload_queue::NotInitialized::ShuttingDown => CompactionError::ShuttingDown, + } + } +} + +impl CompactionError { + /// We cannot do compaction because we could not download a layer that is input to the compaction. + pub(crate) fn input_layer_download_failed( + e: super::storage_layer::layer::DownloadError, + ) -> Self { + match e { + super::storage_layer::layer::DownloadError::TimelineShutdown | + /* TODO DownloadCancelled correct here? */ + super::storage_layer::layer::DownloadError::DownloadCancelled => CompactionError::ShuttingDown, + super::storage_layer::layer::DownloadError::ContextAndConfigReallyDeniesDownloads | + super::storage_layer::layer::DownloadError::DownloadRequired | + super::storage_layer::layer::DownloadError::NotFile(_) | + super::storage_layer::layer::DownloadError::DownloadFailed | + super::storage_layer::layer::DownloadError::PreStatFailed(_)=>CompactionError::Other(anyhow::anyhow!(e)), + #[cfg(test)] + super::storage_layer::layer::DownloadError::Failpoint(_) => CompactionError::Other(anyhow::anyhow!(e)), + } + } +} + #[serde_as] #[derive(serde::Serialize)] struct RecordedDuration(#[serde_as(as = "serde_with::DurationMicroSeconds")] Duration); @@ -4851,7 +4920,7 @@ impl Timeline { new_deltas: &[ResidentLayer], new_images: &[ResidentLayer], layers_to_remove: &[Layer], - ) -> anyhow::Result<()> { + ) -> Result<(), CompactionError> { let mut guard = self.layers.write().await; let mut duplicated_layers = HashSet::new(); @@ -4868,8 +4937,8 @@ impl Timeline { // for compact_level0_phase1 creating an L0, 
which does not happen in practice // because we have not implemented L0 => L0 compaction. duplicated_layers.insert(l.layer_desc().key()); - } else if LayerMap::is_l0(l.layer_desc()) { - bail!("compaction generates a L0 layer file as output, which will cause infinite compaction."); + } else if LayerMap::is_l0(&l.layer_desc().key_range) { + return Err(CompactionError::Other(anyhow::anyhow!("compaction generates a L0 layer file as output, which will cause infinite compaction."))); } else { insert_layers.push(l.clone()); } @@ -4901,7 +4970,7 @@ impl Timeline { self: &Arc, mut replace_layers: Vec<(Layer, ResidentLayer)>, mut drop_layers: Vec, - ) -> anyhow::Result<()> { + ) -> Result<(), super::upload_queue::NotInitialized> { let mut guard = self.layers.write().await; // Trim our lists in case our caller (compaction) raced with someone else (GC) removing layers: we want @@ -4923,7 +4992,7 @@ impl Timeline { fn upload_new_image_layers( self: &Arc, new_images: impl IntoIterator, - ) -> anyhow::Result<()> { + ) -> Result<(), super::upload_queue::NotInitialized> { for layer in new_images { self.remote_client.schedule_layer_file_upload(layer)?; } @@ -5073,7 +5142,11 @@ impl Timeline { let space_cutoff = min(gc_info.cutoffs.space, self.get_disk_consistent_lsn()); let time_cutoff = gc_info.cutoffs.time; - let retain_lsns = gc_info.retain_lsns.clone(); + let retain_lsns = gc_info + .retain_lsns + .iter() + .map(|(lsn, _child_id)| *lsn) + .collect(); // Gets the maximum LSN that holds the valid lease. 
// @@ -5435,7 +5508,6 @@ impl Timeline { Some(self.tenant_shard_id), Some(self.timeline_id), "download all remote layers task", - false, async move { self_clone.download_all_remote_layers(request).await; let mut status_guard = self_clone.download_all_remote_layers_task_info.write().unwrap(); @@ -5586,7 +5658,7 @@ impl Timeline { let file_size = layer.layer_desc().file_size; max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size))); - let last_activity_ts = layer.access_stats().latest_activity_or_now(); + let last_activity_ts = layer.access_stats().latest_activity(); EvictionCandidate { layer: layer.into(), diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index a648432b4d..497d631f4f 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -27,8 +27,9 @@ use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD}; +use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::storage_layer::merge_iterator::MergeIterator; -use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc}; +use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc, ValueReconstructState}; use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter}; use crate::tenant::timeline::{Hole, ImageLayerCreationOutcome}; use crate::tenant::timeline::{Layer, ResidentLayer}; @@ -36,7 +37,7 @@ use crate::tenant::DeltaLayer; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; use crate::keyspace::KeySpace; -use crate::repository::Key; +use crate::repository::{Key, Value}; use utils::lsn::Lsn; @@ -45,16 +46,75 @@ use pageserver_compaction::interface::*; use super::CompactionError; +/// Maximum number of deltas before generating an image layer in 
bottom-most compaction. +const COMPACTION_DELTA_THRESHOLD: usize = 5; + +/// The result of bottom-most compaction for a single key at each LSN. +#[derive(Debug)] +#[cfg_attr(test, derive(PartialEq))] +pub struct KeyLogAtLsn(pub Vec<(Lsn, Value)>); + +/// The result of bottom-most compaction. +#[derive(Debug)] +#[cfg_attr(test, derive(PartialEq))] +pub(crate) struct KeyHistoryRetention { + /// Stores logs to reconstruct the value at the given LSN, that is to say, logs <= LSN or image == LSN. + pub(crate) below_horizon: Vec<(Lsn, KeyLogAtLsn)>, + /// Stores logs to reconstruct the value at any LSN above the horizon, that is to say, log > LSN. + pub(crate) above_horizon: KeyLogAtLsn, +} + +impl KeyHistoryRetention { + async fn pipe_to( + self, + key: Key, + delta_writer: &mut Vec<(Key, Lsn, Value)>, + image_writer: &mut ImageLayerWriter, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + let mut first_batch = true; + for (_, KeyLogAtLsn(logs)) in self.below_horizon { + if first_batch { + if logs.len() == 1 && logs[0].1.is_image() { + let Value::Image(img) = &logs[0].1 else { + unreachable!() + }; + image_writer.put_image(key, img.clone(), ctx).await?; + } else { + for (lsn, val) in logs { + delta_writer.push((key, lsn, val)); + } + } + first_batch = false; + } else { + for (lsn, val) in logs { + delta_writer.push((key, lsn, val)); + } + } + } + let KeyLogAtLsn(above_horizon_logs) = self.above_horizon; + for (lsn, val) in above_horizon_logs { + delta_writer.push((key, lsn, val)); + } + Ok(()) + } +} + impl Timeline { /// TODO: cancellation + /// + /// Returns whether the compaction has pending tasks. 
pub(crate) async fn compact_legacy( self: &Arc, cancel: &CancellationToken, flags: EnumSet, ctx: &RequestContext, - ) -> Result<(), CompactionError> { + ) -> Result { if flags.contains(CompactFlags::EnhancedGcBottomMostCompaction) { - return self.compact_with_gc(cancel, ctx).await; + self.compact_with_gc(cancel, ctx) + .await + .map_err(CompactionError::Other)?; + return Ok(false); } // High level strategy for compaction / image creation: @@ -102,7 +162,7 @@ impl Timeline { // Define partitioning schema if needed // FIXME: the match should only cover repartitioning, not the next steps - let partition_count = match self + let (partition_count, has_pending_tasks) = match self .repartition( self.get_last_record_lsn(), self.get_compaction_target_size(), @@ -119,30 +179,35 @@ impl Timeline { // 2. Compact let timer = self.metrics.compact_time_histo.start_timer(); - self.compact_level0(target_file_size, ctx).await?; + let fully_compacted = self.compact_level0(target_file_size, ctx).await?; timer.stop_and_record(); - // 3. Create new image layers for partitions that have been modified - // "enough". let mut partitioning = dense_partitioning; partitioning .parts .extend(sparse_partitioning.into_dense().parts); - let image_layers = self - .create_image_layers( - &partitioning, - lsn, - if flags.contains(CompactFlags::ForceImageLayerCreation) { - ImageLayerCreationMode::Force - } else { - ImageLayerCreationMode::Try - }, - &image_ctx, - ) - .await?; - self.upload_new_image_layers(image_layers)?; - partitioning.parts.len() + // 3. Create new image layers for partitions that have been modified + // "enough". Skip image layer creation if L0 compaction cannot keep up. 
+ if fully_compacted { + let image_layers = self + .create_image_layers( + &partitioning, + lsn, + if flags.contains(CompactFlags::ForceImageLayerCreation) { + ImageLayerCreationMode::Force + } else { + ImageLayerCreationMode::Try + }, + &image_ctx, + ) + .await?; + + self.upload_new_image_layers(image_layers)?; + } else { + info!("skipping image layer generation due to L0 compaction did not include all layers."); + } + (partitioning.parts.len(), !fully_compacted) } Err(err) => { // no partitioning? This is normal, if the timeline was just created @@ -154,7 +219,7 @@ impl Timeline { if !self.cancel.is_cancelled() { tracing::error!("could not compact, repartitioning keyspace failed: {err:?}"); } - 1 + (1, false) } }; @@ -167,7 +232,7 @@ impl Timeline { self.compact_shard_ancestors(rewrite_max, ctx).await?; } - Ok(()) + Ok(has_pending_tasks) } /// Check for layers that are elegible to be rewritten: @@ -182,7 +247,7 @@ impl Timeline { self: &Arc, rewrite_max: usize, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), CompactionError> { let mut drop_layers = Vec::new(); let mut layers_to_rewrite: Vec = Vec::new(); @@ -303,7 +368,8 @@ impl Timeline { layer.layer_desc().image_layer_lsn(), ctx, ) - .await?; + .await + .map_err(CompactionError::Other)?; // Safety of layer rewrites: // - We are writing to a different local file path than we are reading from, so the old Layer @@ -318,14 +384,20 @@ impl Timeline { // - We do not run concurrently with other kinds of compaction, so the only layer map writes we race with are: // - GC, which at worst witnesses us "undelete" a layer that they just deleted. // - ingestion, which only inserts layers, therefore cannot collide with us. 
- let resident = layer.download_and_keep_resident().await?; + let resident = layer + .download_and_keep_resident() + .await + .map_err(CompactionError::input_layer_download_failed)?; let keys_written = resident .filter(&self.shard_identity, &mut image_layer_writer, ctx) .await?; if keys_written > 0 { - let new_layer = image_layer_writer.finish(self, ctx).await?; + let new_layer = image_layer_writer + .finish(self, ctx) + .await + .map_err(CompactionError::Other)?; tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes", layer.metadata().file_size, new_layer.metadata().file_size); @@ -353,7 +425,13 @@ impl Timeline { // necessary for correctness, but it simplifies testing, and avoids proceeding with another // Timeline's compaction while this timeline's uploads may be generating lots of disk I/O // load. - self.remote_client.wait_completion().await?; + match self.remote_client.wait_completion().await { + Ok(()) => (), + Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)), + Err(WaitCompletionError::UploadQueueShutDownOrStopped) => { + return Err(CompactionError::ShuttingDown) + } + } fail::fail_point!("compact-shard-ancestors-persistent"); @@ -361,15 +439,16 @@ impl Timeline { } /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as - /// as Level 1 files. + /// as Level 1 files. Returns whether the L0 layers are fully compacted. 
async fn compact_level0( self: &Arc, target_file_size: u64, ctx: &RequestContext, - ) -> Result<(), CompactionError> { + ) -> Result { let CompactLevel0Phase1Result { new_layers, deltas_to_compact, + fully_compacted, } = { let phase1_span = info_span!("compact_level0_phase1"); let ctx = ctx.attached_child(); @@ -392,12 +471,12 @@ impl Timeline { if new_layers.is_empty() && deltas_to_compact.is_empty() { // nothing to do - return Ok(()); + return Ok(true); } self.finish_compact_batch(&new_layers, &Vec::new(), &deltas_to_compact) .await?; - Ok(()) + Ok(fully_compacted) } /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment. @@ -411,7 +490,7 @@ impl Timeline { stats.read_lock_held_spawn_blocking_startup_micros = stats.read_lock_acquisition_micros.till_now(); // set by caller let layers = guard.layer_map(); - let level0_deltas = layers.get_level0_deltas()?; + let level0_deltas = layers.get_level0_deltas(); let mut level0_deltas = level0_deltas .into_iter() .map(|x| guard.get_from_desc(&x)) @@ -464,14 +543,25 @@ impl Timeline { ) as u64 * std::cmp::max(self.get_checkpoint_distance(), DEFAULT_CHECKPOINT_DISTANCE); - deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?); + let mut fully_compacted = true; + + deltas_to_compact.push( + first_level0_delta + .download_and_keep_resident() + .await + .map_err(CompactionError::input_layer_download_failed)?, + ); for l in level0_deltas_iter { let lsn_range = &l.layer_desc().lsn_range; if lsn_range.start != prev_lsn_end { break; } - deltas_to_compact.push(l.download_and_keep_resident().await?); + deltas_to_compact.push( + l.download_and_keep_resident() + .await + .map_err(CompactionError::input_layer_download_failed)?, + ); deltas_to_compact_bytes += l.metadata().file_size; prev_lsn_end = lsn_range.end; @@ -482,6 +572,7 @@ impl Timeline { "L0 compaction picker hit max delta layer size limit: {}", delta_size_limit ); + fully_compacted = false; // Proceed with 
compaction, but only a subset of L0s break; @@ -530,7 +621,7 @@ impl Timeline { let mut all_keys = Vec::new(); for l in deltas_to_compact.iter() { - all_keys.extend(l.load_keys(ctx).await?); + all_keys.extend(l.load_keys(ctx).await.map_err(CompactionError::Other)?); } // FIXME: should spawn_blocking the rest of this function @@ -652,7 +743,7 @@ impl Timeline { key, lsn, ref val, .. } in all_values_iter { - let value = val.load(ctx).await?; + let value = val.load(ctx).await.map_err(CompactionError::Other)?; let same_key = prev_key.map_or(false, |prev_key| prev_key == key); // We need to check key boundaries once we reach next key or end of layer with the same key if !same_key || lsn == dup_end_lsn { @@ -709,7 +800,8 @@ impl Timeline { .take() .unwrap() .finish(prev_key.unwrap().next(), self, ctx) - .await?, + .await + .map_err(CompactionError::Other)?, ); writer = None; @@ -747,7 +839,8 @@ impl Timeline { }, ctx, ) - .await?, + .await + .map_err(CompactionError::Other)?, ); } @@ -755,7 +848,8 @@ impl Timeline { .as_mut() .unwrap() .put_value(key, lsn, value, ctx) - .await?; + .await + .map_err(CompactionError::Other)?; } else { debug!( "Dropping key {} during compaction (it belongs on shard {:?})", @@ -771,7 +865,12 @@ impl Timeline { prev_key = Some(key); } if let Some(writer) = writer { - new_layers.push(writer.finish(prev_key.unwrap().next(), self, ctx).await?); + new_layers.push( + writer + .finish(prev_key.unwrap().next(), self, ctx) + .await + .map_err(CompactionError::Other)?, + ); } // Sync layers @@ -835,6 +934,7 @@ impl Timeline { .into_iter() .map(|x| x.drop_eviction_guard()) .collect::>(), + fully_compacted, }) } } @@ -843,6 +943,9 @@ impl Timeline { struct CompactLevel0Phase1Result { new_layers: Vec, deltas_to_compact: Vec, + // Whether we have included all L0 layers, or selected only part of them due to the + // L0 compaction size limit. 
+ fully_compacted: bool, } #[derive(Default)] @@ -953,7 +1056,7 @@ impl Timeline { let guard = self.layers.read().await; let layers = guard.layer_map(); - let l0_deltas = layers.get_level0_deltas()?; + let l0_deltas = layers.get_level0_deltas(); drop(guard); // As an optimization, if we find that there are too few L0 layers, @@ -983,12 +1086,197 @@ impl Timeline { fanout, ctx, ) - .await?; + .await + // TODO: compact_tiered needs to return CompactionError + .map_err(CompactionError::Other)?; adaptor.flush_updates().await?; Ok(()) } + /// Take a list of images and deltas, produce images and deltas according to GC horizon and retain_lsns. + /// + /// It takes a key, the values of the key within the compaction process, a GC horizon, and all retain_lsns below the horizon. + /// For now, it requires the `accumulated_values` contains the full history of the key (i.e., the key with the lowest LSN is + /// an image or a WAL not requiring a base image). This restriction will be removed once we implement gc-compaction on branch. + /// + /// The function returns the deltas and the base image that need to be placed at each of the retain LSN. For example, we have: + /// + /// A@0x10, +B@0x20, +C@0x30, +D@0x40, +E@0x50, +F@0x60 + /// horizon = 0x50, retain_lsn = 0x20, 0x40, delta_threshold=3 + /// + /// The function will produce: + /// + /// ```plain + /// 0x20(retain_lsn) -> img=AB@0x20 always produce a single image below the lowest retain LSN + /// 0x40(retain_lsn) -> deltas=[+C@0x30, +D@0x40] two deltas since the last base image, keeping the deltas + /// 0x50(horizon) -> deltas=[ABCDE@0x50] three deltas since the last base image, generate an image but put it in the delta + /// above_horizon -> deltas=[+F@0x60] full history above the horizon + /// ``` + /// + /// Note that `accumulated_values` must be sorted by LSN and should belong to a single key. 
+ pub(crate) async fn generate_key_retention( + self: &Arc, + key: Key, + history: &[(Key, Lsn, Value)], + horizon: Lsn, + retain_lsn_below_horizon: &[Lsn], + delta_threshold_cnt: usize, + ) -> anyhow::Result { + // Pre-checks for the invariants + if cfg!(debug_assertions) { + for (log_key, _, _) in history { + assert_eq!(log_key, &key, "mismatched key"); + } + for i in 1..history.len() { + assert!(history[i - 1].1 <= history[i].1, "unordered LSN"); + if history[i - 1].1 == history[i].1 { + assert!( + matches!(history[i - 1].2, Value::Image(_)), + "unordered delta/image, or duplicated delta" + ); + } + } + // There was an assertion for no base image that checks if the first + // record in the history is `will_init` before, but it was removed. + // This is explained in the test cases for generate_key_retention. + // Search "incomplete history" for more information. + for lsn in retain_lsn_below_horizon { + assert!(lsn < &horizon, "retain lsn must be below horizon") + } + for i in 1..retain_lsn_below_horizon.len() { + assert!( + retain_lsn_below_horizon[i - 1] <= retain_lsn_below_horizon[i], + "unordered LSN" + ); + } + } + // Step 1: split history into len(retain_lsn_below_horizon) + 2 buckets, where the last bucket is for all deltas above the horizon, + // and the second-to-last bucket is for the horizon. Each bucket contains lsn_last_bucket < deltas <= lsn_this_bucket. 
+ let (mut split_history, lsn_split_points) = { + let mut split_history = Vec::new(); + split_history.resize_with(retain_lsn_below_horizon.len() + 2, Vec::new); + let mut lsn_split_points = Vec::with_capacity(retain_lsn_below_horizon.len() + 1); + for lsn in retain_lsn_below_horizon { + lsn_split_points.push(*lsn); + } + lsn_split_points.push(horizon); + let mut current_idx = 0; + for item @ (_, lsn, _) in history { + while current_idx < lsn_split_points.len() && *lsn > lsn_split_points[current_idx] { + current_idx += 1; + } + split_history[current_idx].push(item); + } + (split_history, lsn_split_points) + }; + // Step 2: filter out duplicated records due to the k-merge of image/delta layers + for split_for_lsn in &mut split_history { + let mut prev_lsn = None; + let mut new_split_for_lsn = Vec::with_capacity(split_for_lsn.len()); + for record @ (_, lsn, _) in std::mem::take(split_for_lsn) { + if let Some(prev_lsn) = &prev_lsn { + if *prev_lsn == lsn { + // The case that we have an LSN with both data from the delta layer and the image layer. As + // `ValueWrapper` ensures that an image is ordered before a delta at the same LSN, we simply + // drop this delta and keep the image. + // + // For example, we have delta layer key1@0x10, key1@0x20, and image layer key1@0x10, we will + // keep the image for key1@0x10 and the delta for key1@0x20. key1@0x10 delta will be simply + // dropped. 
+ continue; + } + } + prev_lsn = Some(lsn); + new_split_for_lsn.push(record); + } + *split_for_lsn = new_split_for_lsn; + } + // Step 3: generate images when necessary + let mut retention = Vec::with_capacity(split_history.len()); + let mut records_since_last_image = 0; + let batch_cnt = split_history.len(); + assert!( + batch_cnt >= 2, + "should have at least below + above horizon batches" + ); + let mut replay_history: Vec<(Key, Lsn, Value)> = Vec::new(); + for (i, split_for_lsn) in split_history.into_iter().enumerate() { + records_since_last_image += split_for_lsn.len(); + let generate_image = if i == 0 { + // We always generate images for the first batch (below horizon / lowest retain_lsn) + true + } else if i == batch_cnt - 1 { + // Do not generate images for the last batch (above horizon) + false + } else if records_since_last_image >= delta_threshold_cnt { + // Generate images when there are too many records + true + } else { + false + }; + replay_history.extend(split_for_lsn.iter().map(|x| (*x).clone())); + // Only retain the items after the last image record + for idx in (0..replay_history.len()).rev() { + if replay_history[idx].2.will_init() { + replay_history = replay_history[idx..].to_vec(); + break; + } + } + if let Some((_, _, val)) = replay_history.first() { + assert!(val.will_init(), "invalid history, no base image"); + } + if generate_image && records_since_last_image > 0 { + records_since_last_image = 0; + let history = std::mem::take(&mut replay_history); + let mut img = None; + let mut records = Vec::with_capacity(history.len()); + if let (_, lsn, Value::Image(val)) = history.first().as_ref().unwrap() { + img = Some((*lsn, val.clone())); + for (_, lsn, val) in history.into_iter().skip(1) { + let Value::WalRecord(rec) = val else { + panic!("invalid record") + }; + records.push((lsn, rec)); + } + } else { + for (_, lsn, val) in history.into_iter() { + let Value::WalRecord(rec) = val else { + panic!("invalid record") + }; + records.push((lsn, 
rec)); + } + } + records.reverse(); + let state = ValueReconstructState { img, records }; + let request_lsn = lsn_split_points[i]; // last batch does not generate image so i is always in range + let img = self.reconstruct_value(key, request_lsn, state).await?; + replay_history.push((key, request_lsn, Value::Image(img.clone()))); + retention.push(vec![(request_lsn, Value::Image(img))]); + } else { + retention.push( + split_for_lsn + .iter() + .map(|(_, lsn, value)| (*lsn, value.clone())) + .collect(), + ); + } + } + let mut result = Vec::with_capacity(retention.len()); + assert_eq!(retention.len(), lsn_split_points.len() + 1); + for (idx, logs) in retention.into_iter().enumerate() { + if idx == lsn_split_points.len() { + return Ok(KeyHistoryRetention { + below_horizon: result, + above_horizon: KeyLogAtLsn(logs), + }); + } else { + result.push((lsn_split_points[idx], KeyLogAtLsn(logs))); + } + } + unreachable!() + } + /// An experimental compaction building block that combines compaction with garbage collection. /// /// The current implementation picks all delta + image layers that are below or intersecting with @@ -999,8 +1287,7 @@ impl Timeline { self: &Arc, _cancel: &CancellationToken, ctx: &RequestContext, - ) -> Result<(), CompactionError> { - use crate::tenant::storage_layer::ValueReconstructState; + ) -> anyhow::Result<()> { use std::collections::BTreeSet; info!("running enhanced gc bottom-most compaction"); @@ -1013,30 +1300,51 @@ impl Timeline { // The layer selection has the following properties: // 1. If a layer is in the selection, all layers below it are in the selection. // 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection. 
- let (layer_selection, gc_cutoff) = { + let (layer_selection, gc_cutoff, retain_lsns_below_horizon) = { let guard = self.layers.read().await; let layers = guard.layer_map(); let gc_info = self.gc_info.read().unwrap(); - if !gc_info.retain_lsns.is_empty() || !gc_info.leases.is_empty() { - return Err(CompactionError::Other(anyhow!( - "enhanced legacy compaction currently does not support retain_lsns (branches)" - ))); - } + let mut retain_lsns_below_horizon = Vec::new(); let gc_cutoff = gc_info.cutoffs.select_min(); + for (lsn, _timeline_id) in &gc_info.retain_lsns { + if lsn < &gc_cutoff { + retain_lsns_below_horizon.push(*lsn); + } + } + for lsn in gc_info.leases.keys() { + if lsn < &gc_cutoff { + retain_lsns_below_horizon.push(*lsn); + } + } let mut selected_layers = Vec::new(); - // TODO: consider retain_lsns drop(gc_info); for desc in layers.iter_historic_layers() { if desc.get_lsn_range().start <= gc_cutoff { selected_layers.push(guard.get_from_desc(&desc)); } } - (selected_layers, gc_cutoff) + retain_lsns_below_horizon.sort(); + (selected_layers, gc_cutoff, retain_lsns_below_horizon) }; + let lowest_retain_lsn = retain_lsns_below_horizon + .first() + .copied() + .unwrap_or(gc_cutoff); + if cfg!(debug_assertions) { + assert_eq!( + lowest_retain_lsn, + retain_lsns_below_horizon + .iter() + .min() + .copied() + .unwrap_or(gc_cutoff) + ); + } info!( - "picked {} layers for compaction with gc_cutoff={}", + "picked {} layers for compaction with gc_cutoff={} lowest_retain_lsn={}", layer_selection.len(), - gc_cutoff + gc_cutoff, + lowest_retain_lsn ); // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs. // Also, collect the layer information to decide when to split the new delta layers. 
@@ -1072,61 +1380,13 @@ impl Timeline { let mut accumulated_values = Vec::new(); let mut last_key: Option = None; - /// Take a list of images and deltas, produce an image at the GC horizon, and a list of deltas above the GC horizon. - async fn flush_accumulated_states( - tline: &Arc, - key: Key, - accumulated_values: &[(Key, Lsn, crate::repository::Value)], - horizon: Lsn, - ) -> anyhow::Result<(Vec<(Key, Lsn, crate::repository::Value)>, bytes::Bytes)> { - let mut base_image = None; - let mut keys_above_horizon = Vec::new(); - let mut delta_above_base_image = Vec::new(); - // We have a list of deltas/images. We want to create image layers while collect garbages. - for (key, lsn, val) in accumulated_values.iter().rev() { - if *lsn > horizon { - if let Some((_, prev_lsn, _)) = keys_above_horizon.last_mut() { - if *prev_lsn == *lsn { - // The case that we have an LSN with both data from the delta layer and the image layer. As - // `ValueWrapper` ensures that an image is ordered before a delta at the same LSN, we simply - // drop this delta and keep the image. - // - // For example, we have delta layer key1@0x10, key1@0x20, and image layer key1@0x10, we will - // keep the image for key1@0x10 and the delta for key1@0x20. key1@0x10 delta will be simply - // dropped. 
- continue; - } - } - keys_above_horizon.push((*key, *lsn, val.clone())); - } else if *lsn <= horizon { - match val { - crate::repository::Value::Image(image) => { - base_image = Some((*lsn, image.clone())); - break; - } - crate::repository::Value::WalRecord(wal) => { - delta_above_base_image.push((*lsn, wal.clone())); - } - } - } - } - // do not reverse delta_above_base_image, reconstruct state expects reversely-ordered records - keys_above_horizon.reverse(); - let state = ValueReconstructState { - img: base_image, - records: delta_above_base_image, - }; - let img = tline.reconstruct_value(key, horizon, state).await?; - Ok((keys_above_horizon, img)) - } - async fn flush_deltas( deltas: &mut Vec<(Key, Lsn, crate::repository::Value)>, last_key: Key, delta_split_points: &[Key], current_delta_split_point: &mut usize, tline: &Arc, - gc_cutoff: Lsn, + lowest_retain_lsn: Lsn, ctx: &RequestContext, ) -> anyhow::Result> { // Check if we need to split the delta layer. We split at the original delta layer boundary to avoid @@ -1161,7 +1421,7 @@ impl Timeline { tline.timeline_id, tline.tenant_shard_id, deltas.first().unwrap().0, - gc_cutoff..end_lsn, + lowest_retain_lsn..end_lsn, ctx, ) .await?; @@ -1178,7 +1438,7 @@ impl Timeline { self.timeline_id, self.tenant_shard_id, &(Key::MIN..Key::MAX), // covers the full key range - gc_cutoff, + lowest_retain_lsn, ctx, ) .await?; @@ -1195,12 +1455,19 @@ impl Timeline { accumulated_values.push((key, lsn, val)); } else { let last_key = last_key.as_mut().unwrap(); - let (deltas, image) = - flush_accumulated_states(self, *last_key, &accumulated_values, gc_cutoff) - .await?; + let retention = self + .generate_key_retention( + *last_key, + &accumulated_values, + gc_cutoff, + &retain_lsns_below_horizon, + COMPACTION_DELTA_THRESHOLD, + ) + .await?; // Put the image into the image layer. Currently we have a single big layer for the compaction. 
- image_layer_writer.put_image(*last_key, image, ctx).await?; - delta_values.extend(deltas); + retention + .pipe_to(*last_key, &mut delta_values, &mut image_layer_writer, ctx) + .await?; delta_layers.extend( flush_deltas( &mut delta_values, @@ -1208,7 +1475,7 @@ impl Timeline { &delta_split_points, &mut current_delta_split_point, self, - gc_cutoff, + lowest_retain_lsn, ctx, ) .await?, @@ -1221,11 +1488,19 @@ impl Timeline { let last_key = last_key.expect("no keys produced during compaction"); // TODO: move this part to the loop body - let (deltas, image) = - flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff).await?; + let retention = self + .generate_key_retention( + last_key, + &accumulated_values, + gc_cutoff, + &retain_lsns_below_horizon, + COMPACTION_DELTA_THRESHOLD, + ) + .await?; // Put the image into the image layer. Currently we have a single big layer for the compaction. - image_layer_writer.put_image(last_key, image, ctx).await?; - delta_values.extend(deltas); + retention + .pipe_to(last_key, &mut delta_values, &mut image_layer_writer, ctx) + .await?; delta_layers.extend( flush_deltas( &mut delta_values, @@ -1233,7 +1508,7 @@ impl Timeline { &delta_split_points, &mut current_delta_split_point, self, - gc_cutoff, + lowest_retain_lsn, ctx, ) .await?, @@ -1281,7 +1556,7 @@ impl TimelineAdaptor { } } - pub async fn flush_updates(&mut self) -> anyhow::Result<()> { + pub async fn flush_updates(&mut self) -> Result<(), CompactionError> { let layers_to_delete = { let guard = self.timeline.layers.read().await; self.layers_to_delete diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index d32945d9e4..ab6a5f20ba 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -148,14 +148,14 @@ async fn cleanup_remaining_timeline_fs_traces( /// For more context see comments in [`DeleteTimelineFlow::prepare`] async fn remove_timeline_from_tenant( tenant: 
&Tenant, - timeline_id: TimelineId, + timeline: &Timeline, _: &DeletionGuard, // using it as a witness ) -> anyhow::Result<()> { // Remove the timeline from the map. let mut timelines = tenant.timelines.lock().unwrap(); let children_exist = timelines .iter() - .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id)); + .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline.timeline_id)); // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`. // We already deleted the layer files, so it's probably best to panic. // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart) @@ -164,7 +164,7 @@ async fn remove_timeline_from_tenant( } timelines - .remove(&timeline_id) + .remove(&timeline.timeline_id) .expect("timeline that we were deleting was concurrently removed from 'timelines' map"); drop(timelines); @@ -391,7 +391,6 @@ impl DeleteTimelineFlow { Some(tenant_shard_id), Some(timeline_id), "timeline_delete", - false, async move { if let Err(err) = Self::background(guard, conf, &tenant, &timeline).await { error!("Error: {err:#}"); @@ -415,7 +414,7 @@ impl DeleteTimelineFlow { pausable_failpoint!("in_progress_delete"); - remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?; + remove_timeline_from_tenant(tenant, timeline, &guard).await?; *guard = Self::Finished; diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 49ce3db3e6..ee5f8cd52a 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -26,7 +26,7 @@ pub(crate) enum Error { #[error("flushing failed")] FlushAncestor(#[source] FlushLayerError), #[error("layer download failed")] - RewrittenDeltaDownloadFailed(#[source] anyhow::Error), + RewrittenDeltaDownloadFailed(#[source] crate::tenant::storage_layer::layer::DownloadError), #[error("copying LSN prefix locally failed")] 
CopyDeltaPrefix(#[source] anyhow::Error), #[error("upload rewritten layer")] diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 8a8c38d0ce..fec66aabc1 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -65,7 +65,6 @@ impl Timeline { "layer eviction for {}/{}", self.tenant_shard_id, self.timeline_id ), - false, async move { tokio::select! { _ = self_clone.cancel.cancelled() => { return Ok(()); } @@ -226,7 +225,7 @@ impl Timeline { continue; } - let last_activity_ts = layer.access_stats().latest_activity_or_now(); + let last_activity_ts = layer.access_stats().latest_activity(); let no_activity_for = match now.duration_since(last_activity_ts) { Ok(d) => d, diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index a43ff873ac..1e4edd34ad 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -255,6 +255,14 @@ impl LayerManager { new_layer.layer_desc().lsn_range ); + // Transfer visibility hint from old to new layer, since the new layer covers the same key space. This is not guaranteed to + // be accurate (as the new layer may cover a different subset of the key range), but is a sensible default, and prevents + // always marking rewritten layers as visible. + new_layer + .as_ref() + .access_stats() + .set_visibility(old_layer.access_stats().visibility()); + // Safety: we may never rewrite the same file in-place. Callers are responsible // for ensuring that they only rewrite layers after something changes the path, // such as an increment in the generation number.
diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index f7440ecdae..592f41cb21 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -130,7 +130,7 @@ pub(super) enum UploadQueueStopped { } #[derive(thiserror::Error, Debug)] -pub(crate) enum NotInitialized { +pub enum NotInitialized { #[error("queue is in state Uninitialized")] Uninitialized, #[error("queue is in state Stopped")] diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index dff3a8f52d..804c7fca97 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -618,7 +618,7 @@ impl WalIngest { // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set 0 } else { - std::mem::size_of::() * xlrec.ntuples as usize + size_of::() * xlrec.ntuples as usize }; assert_eq!(offset_array_len, buf.remaining()); @@ -685,7 +685,7 @@ impl WalIngest { // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set 0 } else { - std::mem::size_of::() * xlrec.ntuples as usize + size_of::() * xlrec.ntuples as usize }; assert_eq!(offset_array_len, buf.remaining()); @@ -752,7 +752,7 @@ impl WalIngest { // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set 0 } else { - std::mem::size_of::() * xlrec.ntuples as usize + size_of::() * xlrec.ntuples as usize }; assert_eq!(offset_array_len, buf.remaining()); @@ -920,7 +920,7 @@ impl WalIngest { // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set 0 } else { - std::mem::size_of::() * xlrec.ntuples as usize + size_of::() * xlrec.ntuples as usize }; assert_eq!(offset_array_len, buf.remaining()); diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 3197a7e715..d107cdc1c2 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -597,7 +597,7 @@ _PG_init(void) pg_init_libpagestore(); pg_init_walproposer(); - WalSender_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; + WalSender_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; 
LogicalFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; InitLogicalReplicationMonitor(); diff --git a/pgxn/neon/neon_walreader.c b/pgxn/neon/neon_walreader.c index 0f76514b86..b575712dbe 100644 --- a/pgxn/neon/neon_walreader.c +++ b/pgxn/neon/neon_walreader.c @@ -220,7 +220,8 @@ NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size cou return NEON_WALREAD_ERROR; } /* we'll poll immediately */ - state->rem_state = RS_CONNECTING_READ; + state->rem_state = RS_CONNECTING_WRITE; + return NEON_WALREAD_WOULDBLOCK; } if (state->rem_state == RS_CONNECTING_READ || state->rem_state == RS_CONNECTING_WRITE) diff --git a/pgxn/neon_test_utils/neon_test_utils--1.3.sql b/pgxn/neon_test_utils/neon_test_utils--1.3.sql index 3b8794a8cf..9a9b41c3a3 100644 --- a/pgxn/neon_test_utils/neon_test_utils--1.3.sql +++ b/pgxn/neon_test_utils/neon_test_utils--1.3.sql @@ -7,6 +7,12 @@ AS 'MODULE_PATHNAME', 'test_consume_xids' LANGUAGE C STRICT PARALLEL UNSAFE; +CREATE FUNCTION test_consume_oids(oid int) +RETURNS VOID +AS 'MODULE_PATHNAME', 'test_consume_oids' +LANGUAGE C STRICT +PARALLEL UNSAFE; + CREATE FUNCTION test_consume_cpu(seconds int) RETURNS VOID AS 'MODULE_PATHNAME', 'test_consume_cpu' diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index 650ef7405d..0b5499ca53 100644 --- a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -35,6 +35,7 @@ PG_MODULE_MAGIC; extern void _PG_init(void); PG_FUNCTION_INFO_V1(test_consume_xids); +PG_FUNCTION_INFO_V1(test_consume_oids); PG_FUNCTION_INFO_V1(test_consume_cpu); PG_FUNCTION_INFO_V1(test_consume_memory); PG_FUNCTION_INFO_V1(test_release_memory); @@ -74,6 +75,26 @@ _PG_init(void) #define neon_read_at_lsn neon_read_at_lsn_ptr
+/*
+ * test_consume_oids(int4), for rapidly consuming OIDs, to test wraparound.
+ * Unlike test_consume_xids which is passed number of xids to be consumed,
+ * this function is given the target Oid.
+ */
+Datum
+test_consume_oids(PG_FUNCTION_ARGS)
+{
+	Oid			target = (Oid) PG_GETARG_INT32(0);
+
+	/*
+	 * Reaching the target can take billions of iterations (and never ends if
+	 * that value is not handed out), so keep the loop cancellable.
+	 */
+	while (target != GetNewObjectId())
+		CHECK_FOR_INTERRUPTS();
+
+	PG_RETURN_VOID();
+}
+
 /* * test_consume_xids(int4), for rapidly consuming XIDs, to test wraparound. */ diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index f757a15fbb..67c4dd019e 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -717,8 +717,10 @@ mod tests { _ => panic!("wrong message"), } }); - let endpoint_rate_limiter = - Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET)); + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( + EndpointRateLimiter::DEFAULT, + 64, + )); let _creds = auth_quirks( &mut ctx, @@ -767,8 +769,10 @@ mod tests { frontend::password_message(b"my-secret-password", &mut write).unwrap(); client.write_all(&write).await.unwrap(); }); - let endpoint_rate_limiter = - Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET)); + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( + EndpointRateLimiter::DEFAULT, + 64, + )); let _creds = auth_quirks( &mut ctx, @@ -818,8 +822,10 @@ mod tests { client.write_all(&write).await.unwrap(); }); - let endpoint_rate_limiter = - Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET)); + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( + EndpointRateLimiter::DEFAULT, + 64, + )); let creds = auth_quirks( &mut ctx, diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 7f4cb2c010..c1fd6dfd80 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -22,7 +22,9 @@ use proxy::http; use proxy::http::health_server::AppMetrics; use proxy::metrics::Metrics; use proxy::rate_limiter::EndpointRateLimiter; +use proxy::rate_limiter::LeakyBucketConfig; use proxy::rate_limiter::RateBucketInfo; +use proxy::rate_limiter::WakeComputeRateLimiter; use
proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; use proxy::redis::elasticache; @@ -176,6 +178,9 @@ struct ProxyCliArgs { /// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections) #[clap(long)] redis_notifications: Option, + /// what from the available authentications type to use for the regional redis we have. Supported are "irsa" and "plain". + #[clap(long, default_value = "irsa")] + redis_auth_type: String, /// redis host for streaming connections (might be different from the notifications host) #[clap(long)] redis_host: Option, @@ -319,24 +324,38 @@ async fn main() -> anyhow::Result<()> { ), aws_credentials_provider, )); - let regional_redis_client = match (args.redis_host, args.redis_port) { - (Some(host), Some(port)) => Some( - ConnectionWithCredentialsProvider::new_with_credentials_provider( - host, - port, - elasticache_credentials_provider.clone(), + let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) { + ("plain", redis_url) => match redis_url { + None => { + bail!("plain auth requires redis_notifications to be set"); + } + Some(url) => Some( + ConnectionWithCredentialsProvider::new_with_static_credentials(url.to_string()), ), - ), - (None, None) => { - warn!("Redis events from console are disabled"); - None - } + }, + ("irsa", _) => match (&args.redis_host, args.redis_port) { + (Some(host), Some(port)) => Some( + ConnectionWithCredentialsProvider::new_with_credentials_provider( + host.to_string(), + port, + elasticache_credentials_provider.clone(), + ), + ), + (None, None) => { + warn!("irsa auth requires redis-host and redis-port to be set, continuing without regional_redis_client"); + None + } + _ => { + bail!("redis-host and redis-port must be specified together"); + } + }, _ => { - bail!("redis-host and redis-port must be specified together"); + bail!("unknown auth type given"); } }; + let 
redis_notifications_client = if let Some(url) = args.redis_notifications { - Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url)) + Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url.to_string())) } else { regional_redis_client.clone() }; @@ -373,9 +392,24 @@ async fn main() -> anyhow::Result<()> { proxy::metrics::CancellationSource::FromClient, )); - let mut endpoint_rps_limit = args.endpoint_rps_limit.clone(); - RateBucketInfo::validate(&mut endpoint_rps_limit)?; - let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(endpoint_rps_limit)); + // bit of a hack - find the min rps and max rps supported and turn it into + // leaky bucket config instead + let max = args + .endpoint_rps_limit + .iter() + .map(|x| x.rps()) + .max_by(f64::total_cmp) + .unwrap_or(EndpointRateLimiter::DEFAULT.max); + let rps = args + .endpoint_rps_limit + .iter() + .map(|x| x.rps()) + .min_by(f64::total_cmp) + .unwrap_or(EndpointRateLimiter::DEFAULT.rps); + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( + LeakyBucketConfig { rps, max }, + 64, + )); // client facing tasks. 
these will exit on error or on cancellation // cancellation returns Ok(()) @@ -577,7 +611,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); RateBucketInfo::validate(&mut wake_compute_rps_limit)?; let wake_compute_endpoint_rate_limiter = - Arc::new(EndpointRateLimiter::new(wake_compute_rps_limit)); + Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); let api = console::provider::neon::Api::new( endpoint, caches, diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index a6e67be22f..768cd2fdfa 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -12,7 +12,7 @@ use crate::{ console::messages::{ColdStartInfo, Reason}, http, metrics::{CacheOutcome, Metrics}, - rate_limiter::EndpointRateLimiter, + rate_limiter::WakeComputeRateLimiter, scram, EndpointCacheKey, }; use crate::{cache::Cached, context::RequestMonitoring}; @@ -26,7 +26,7 @@ pub struct Api { endpoint: http::Endpoint, pub caches: &'static ApiCaches, pub locks: &'static ApiLocks, - pub wake_compute_endpoint_rate_limiter: Arc, + pub wake_compute_endpoint_rate_limiter: Arc, jwt: String, } @@ -36,7 +36,7 @@ impl Api { endpoint: http::Endpoint, caches: &'static ApiCaches, locks: &'static ApiLocks, - wake_compute_endpoint_rate_limiter: Arc, + wake_compute_endpoint_rate_limiter: Arc, ) -> Self { let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") { Ok(v) => v, diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index cbfc9f1358..d96dd0947b 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -1,7 +1,7 @@ //! Man-in-the-middle tests //! //! Channel binding should prevent a proxy server -//! - that has access to create valid certificates - +//! *that has access to create valid certificates* //! from controlling the TLS connection. 
use std::fmt::Debug; diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index be9072dd8c..222cd431d2 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -3,4 +3,8 @@ mod limiter; pub use limit_algorithm::{ aimd::Aimd, DynamicLimiter, Outcome, RateLimitAlgorithm, RateLimiterConfig, Token, }; -pub use limiter::{BucketRateLimiter, EndpointRateLimiter, GlobalRateLimiter, RateBucketInfo}; +pub use limiter::{BucketRateLimiter, GlobalRateLimiter, RateBucketInfo, WakeComputeRateLimiter}; +mod leaky_bucket; +pub use leaky_bucket::{ + EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter, LeakyBucketState, +}; diff --git a/proxy/src/rate_limiter/leaky_bucket.rs b/proxy/src/rate_limiter/leaky_bucket.rs new file mode 100644 index 0000000000..2d5e056540 --- /dev/null +++ b/proxy/src/rate_limiter/leaky_bucket.rs @@ -0,0 +1,171 @@ +use std::{ + hash::Hash, + sync::atomic::{AtomicUsize, Ordering}, +}; + +use ahash::RandomState; +use dashmap::DashMap; +use rand::{thread_rng, Rng}; +use tokio::time::Instant; +use tracing::info; + +use crate::intern::EndpointIdInt; + +// Simple per-endpoint rate limiter. +pub type EndpointRateLimiter = LeakyBucketRateLimiter; + +pub struct LeakyBucketRateLimiter { + map: DashMap, + config: LeakyBucketConfig, + access_count: AtomicUsize, +} + +impl LeakyBucketRateLimiter { + pub const DEFAULT: LeakyBucketConfig = LeakyBucketConfig { + rps: 600.0, + max: 1500.0, + }; + + pub fn new_with_shards(config: LeakyBucketConfig, shards: usize) -> Self { + Self { + map: DashMap::with_hasher_and_shard_amount(RandomState::new(), shards), + config, + access_count: AtomicUsize::new(0), + } + } + + /// Check that number of connections to the endpoint is below `max_rps` rps. 
+ pub fn check(&self, key: K, n: u32) -> bool { + let now = Instant::now(); + + if self.access_count.fetch_add(1, Ordering::AcqRel) % 2048 == 0 { + self.do_gc(now); + } + + let mut entry = self.map.entry(key).or_insert_with(|| LeakyBucketState { + time: now, + filled: 0.0, + }); + + entry.check(&self.config, now, n as f64) + } + + fn do_gc(&self, now: Instant) { + info!( + "cleaning up bucket rate limiter, current size = {}", + self.map.len() + ); + let n = self.map.shards().len(); + let shard = thread_rng().gen_range(0..n); + self.map.shards()[shard] + .write() + .retain(|_, value| !value.get_mut().update(&self.config, now)); + } +} + +pub struct LeakyBucketConfig { + pub rps: f64, + pub max: f64, +} + +pub struct LeakyBucketState { + filled: f64, + time: Instant, +} + +impl LeakyBucketConfig { + pub fn new(rps: f64, max: f64) -> Self { + assert!(rps > 0.0, "rps must be positive"); + assert!(max > 0.0, "max must be positive"); + Self { rps, max } + } +} + +impl LeakyBucketState { + pub fn new() -> Self { + Self { + filled: 0.0, + time: Instant::now(), + } + } + + /// updates the timer and returns true if the bucket is empty + fn update(&mut self, info: &LeakyBucketConfig, now: Instant) -> bool { + let drain = now.duration_since(self.time); + let drain = drain.as_secs_f64() * info.rps; + + self.filled = (self.filled - drain).clamp(0.0, info.max); + self.time = now; + + self.filled == 0.0 + } + + pub fn check(&mut self, info: &LeakyBucketConfig, now: Instant, n: f64) -> bool { + self.update(info, now); + + if self.filled + n > info.max { + return false; + } + self.filled += n; + + true + } +} + +impl Default for LeakyBucketState { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use tokio::time::Instant; + + use super::{LeakyBucketConfig, LeakyBucketState}; + + #[tokio::test(start_paused = true)] + async fn check() { + let info = LeakyBucketConfig::new(500.0, 2000.0); + let mut bucket = 
LeakyBucketState::new(); + + // should work for 2000 requests this second + for _ in 0..2000 { + assert!(bucket.check(&info, Instant::now(), 1.0)); + } + assert!(!bucket.check(&info, Instant::now(), 1.0)); + assert_eq!(bucket.filled, 2000.0); + + // in 1ms we should drain 0.5 tokens. + // make sure we don't lose any tokens + tokio::time::advance(Duration::from_millis(1)).await; + assert!(!bucket.check(&info, Instant::now(), 1.0)); + tokio::time::advance(Duration::from_millis(1)).await; + assert!(bucket.check(&info, Instant::now(), 1.0)); + + // in 10ms we should drain 5 tokens + tokio::time::advance(Duration::from_millis(10)).await; + for _ in 0..5 { + assert!(bucket.check(&info, Instant::now(), 1.0)); + } + assert!(!bucket.check(&info, Instant::now(), 1.0)); + + // in 10s we should drain 5000 tokens + // but cap is only 2000 + tokio::time::advance(Duration::from_secs(10)).await; + for _ in 0..2000 { + assert!(bucket.check(&info, Instant::now(), 1.0)); + } + assert!(!bucket.check(&info, Instant::now(), 1.0)); + + // should sustain 500rps + for _ in 0..2000 { + tokio::time::advance(Duration::from_millis(10)).await; + for _ in 0..5 { + assert!(bucket.check(&info, Instant::now(), 1.0)); + } + } + } +} diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index b8c9490696..5db4efed37 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -61,7 +61,7 @@ impl GlobalRateLimiter { // Purposefully ignore user name and database name as clients can reconnect // with different names, so we'll end up sending some http requests to // the control plane. 
-pub type EndpointRateLimiter = BucketRateLimiter; +pub type WakeComputeRateLimiter = BucketRateLimiter; pub struct BucketRateLimiter { map: DashMap, Hasher>, @@ -103,7 +103,7 @@ pub struct RateBucketInfo { impl std::fmt::Display for RateBucketInfo { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let rps = (self.max_rpi as u64) * 1000 / self.interval.as_millis() as u64; + let rps = self.rps().floor() as u64; write!(f, "{rps}@{}", humantime::format_duration(self.interval)) } } @@ -140,6 +140,10 @@ impl RateBucketInfo { Self::new(200, Duration::from_secs(600)), ]; + pub fn rps(&self) -> f64 { + (self.max_rpi as f64) / self.interval.as_secs_f64() + } + pub fn validate(info: &mut [Self]) -> anyhow::Result<()> { info.sort_unstable_by_key(|info| info.interval); let invalid = info @@ -245,7 +249,7 @@ mod tests { use rustc_hash::FxHasher; use tokio::time; - use super::{BucketRateLimiter, EndpointRateLimiter}; + use super::{BucketRateLimiter, WakeComputeRateLimiter}; use crate::{intern::EndpointIdInt, rate_limiter::RateBucketInfo, EndpointId}; #[test] @@ -293,7 +297,7 @@ mod tests { .map(|s| s.parse().unwrap()) .collect(); RateBucketInfo::validate(&mut rates).unwrap(); - let limiter = EndpointRateLimiter::new(rates); + let limiter = WakeComputeRateLimiter::new(rates); let endpoint = EndpointId::from("ep-my-endpoint-1234"); let endpoint = EndpointIdInt::from(endpoint); diff --git a/proxy/src/scram/countmin.rs b/proxy/src/scram/countmin.rs index f2b794e5fe..e8e7ef5c86 100644 --- a/proxy/src/scram/countmin.rs +++ b/proxy/src/scram/countmin.rs @@ -158,7 +158,7 @@ mod tests { let N = 1021 * 4096; let sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q); - let memory = std::mem::size_of::() * sketch.buckets.len(); + let memory = size_of::() * sketch.buckets.len(); let time = sketch.depth; (memory, time) } diff --git a/rust-toolchain.toml b/rust-toolchain.toml index dcae25a287..3510359591 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ 
-1,5 +1,5 @@ [toolchain] -channel = "1.79.0" +channel = "1.80.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index cd3c7fe526..d574bb438f 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -27,7 +27,7 @@ pub const SK_FORMAT_VERSION: u32 = 9; pub const CONTROL_FILE_NAME: &str = "safekeeper.control"; // needed to atomically update the state using `rename` const CONTROL_FILE_NAME_PARTIAL: &str = "safekeeper.control.partial"; -pub const CHECKSUM_SIZE: usize = std::mem::size_of::(); +pub const CHECKSUM_SIZE: usize = size_of::(); /// Storage should keep actual state inside of it. It should implement Deref /// trait to access state fields and have persist method for updating that state. diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index f45bfb95fa..2c519433ef 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -143,7 +143,12 @@ impl postgres_backend::Handler self.tenant_id.unwrap_or(TenantId::from([0u8; 16])), self.timeline_id.unwrap_or(TimelineId::from([0u8; 16])), ); - tracing::Span::current().record("ttid", tracing::field::display(ttid)); + tracing::Span::current() + .record("ttid", tracing::field::display(ttid)) + .record( + "application_name", + tracing::field::debug(self.appname.clone()), + ); Ok(()) } else { diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 7ecee178f3..234273e133 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -545,7 +545,10 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { &cancel, ) .await? 
- .keys; + .keys + .into_iter() + .map(|o| o.key) + .collect::>(); if files.is_empty() { return Ok(()); // done } @@ -613,7 +616,7 @@ pub async fn copy_s3_segments( let uploaded_segments = &files .iter() - .filter_map(|file| file.object_name().map(ToOwned::to_owned)) + .filter_map(|o| o.key.object_name().map(ToOwned::to_owned)) .collect::>(); debug!( diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index 091571111e..16f7748eb4 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -43,7 +43,7 @@ pub async fn task_main( error!("connection handler exited: {}", err); } } - .instrument(info_span!("", cid = %conn_id, ttid = field::Empty)), + .instrument(info_span!("", cid = %conn_id, ttid = field::Empty, application_name = field::Empty)), ); } } diff --git a/safekeeper/tests/walproposer_sim/walproposer_disk.rs b/safekeeper/tests/walproposer_sim/walproposer_disk.rs index aa329bd2f0..123cd6bad6 100644 --- a/safekeeper/tests/walproposer_sim/walproposer_disk.rs +++ b/safekeeper/tests/walproposer_sim/walproposer_disk.rs @@ -172,7 +172,7 @@ fn write_walrecord_to_disk( let mut freespace = insert_freespace(curr_ptr); let mut written: usize = 0; - assert!(freespace >= std::mem::size_of::()); + assert!(freespace >= size_of::()); for mut rdata in rdatas { while rdata.len() >= freespace { diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs index 14cda0a289..1bb9c17f30 100644 --- a/storage_controller/src/heartbeater.rs +++ b/storage_controller/src/heartbeater.rs @@ -22,7 +22,8 @@ struct HeartbeaterTask { state: HashMap, - max_unavailable_interval: Duration, + max_offline_interval: Duration, + max_warming_up_interval: Duration, jwt_token: Option, } @@ -31,7 +32,9 @@ pub(crate) enum PageserverState { Available { last_seen_at: Instant, utilization: PageserverUtilization, - new: bool, + }, + WarmingUp { + started_at: Instant, }, Offline, } @@ -57,12 +60,18 @@ pub(crate) struct Heartbeater { impl 
Heartbeater { pub(crate) fn new( jwt_token: Option, - max_unavailable_interval: Duration, + max_offline_interval: Duration, + max_warming_up_interval: Duration, cancel: CancellationToken, ) -> Self { let (sender, receiver) = tokio::sync::mpsc::unbounded_channel::(); - let mut heartbeater = - HeartbeaterTask::new(receiver, jwt_token, max_unavailable_interval, cancel); + let mut heartbeater = HeartbeaterTask::new( + receiver, + jwt_token, + max_offline_interval, + max_warming_up_interval, + cancel, + ); tokio::task::spawn(async move { heartbeater.run().await }); Self { sender } @@ -88,14 +97,16 @@ impl HeartbeaterTask { fn new( receiver: tokio::sync::mpsc::UnboundedReceiver, jwt_token: Option, - max_unavailable_interval: Duration, + max_offline_interval: Duration, + max_warming_up_interval: Duration, cancel: CancellationToken, ) -> Self { Self { receiver, cancel, state: HashMap::new(), - max_unavailable_interval, + max_offline_interval, + max_warming_up_interval, jwt_token, } } @@ -128,16 +139,15 @@ impl HeartbeaterTask { heartbeat_futs.push({ let jwt_token = self.jwt_token.clone(); let cancel = self.cancel.clone(); - let new_node = !self.state.contains_key(node_id); // Clone the node and mark it as available such that the request // goes through to the pageserver even when the node is marked offline. // This doesn't impact the availability observed by [`crate::service::Service`]. 
- let mut node = node.clone(); - node.set_availability(NodeAvailability::Active(UtilizationScore::worst())); + let mut node_clone = node.clone(); + node_clone.set_availability(NodeAvailability::Active(UtilizationScore::worst())); async move { - let response = node + let response = node_clone .with_client_retries( |client| async move { client.get_utilization().await }, &jwt_token, @@ -161,7 +171,12 @@ impl HeartbeaterTask { PageserverState::Available { last_seen_at: Instant::now(), utilization, - new: new_node, + } + } else if let NodeAvailability::WarmingUp(last_seen_at) = + node.get_availability() + { + PageserverState::WarmingUp { + started_at: last_seen_at, } } else { PageserverState::Offline @@ -187,53 +202,67 @@ impl HeartbeaterTask { } } } + + let mut warming_up = 0; + let mut offline = 0; + for state in new_state.values() { + match state { + PageserverState::WarmingUp { .. } => { + warming_up += 1; + } + PageserverState::Offline { .. } => offline += 1, + PageserverState::Available { .. } => {} + } + } + tracing::info!( - "Heartbeat round complete for {} nodes, {} offline", + "Heartbeat round complete for {} nodes, {} warming-up, {} offline", new_state.len(), - new_state - .values() - .filter(|s| match s { - PageserverState::Available { .. } => { - false - } - PageserverState::Offline => true, - }) - .count() + warming_up, + offline ); let mut deltas = Vec::new(); let now = Instant::now(); - for (node_id, ps_state) in new_state { + for (node_id, ps_state) in new_state.iter_mut() { use std::collections::hash_map::Entry::*; - let entry = self.state.entry(node_id); + let entry = self.state.entry(*node_id); let mut needs_update = false; match entry { Occupied(ref occ) => match (occ.get(), &ps_state) { (PageserverState::Offline, PageserverState::Offline) => {} (PageserverState::Available { last_seen_at, .. 
}, PageserverState::Offline) => { - if now - *last_seen_at >= self.max_unavailable_interval { - deltas.push((node_id, ps_state.clone())); + if now - *last_seen_at >= self.max_offline_interval { + deltas.push((*node_id, ps_state.clone())); needs_update = true; } } + (_, PageserverState::WarmingUp { started_at }) => { + if now - *started_at >= self.max_warming_up_interval { + *ps_state = PageserverState::Offline; + } + + deltas.push((*node_id, ps_state.clone())); + needs_update = true; + } _ => { - deltas.push((node_id, ps_state.clone())); + deltas.push((*node_id, ps_state.clone())); needs_update = true; } }, Vacant(_) => { // This is a new node. Don't generate a delta for it. - deltas.push((node_id, ps_state.clone())); + deltas.push((*node_id, ps_state.clone())); } } match entry { Occupied(mut occ) if needs_update => { - (*occ.get_mut()) = ps_state; + (*occ.get_mut()) = ps_state.clone(); } Vacant(vac) => { - vac.insert(ps_state); + vac.insert(ps_state.clone()); } _ => {} } diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 8fb4be93e0..c77918827f 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -3,7 +3,7 @@ use crate::metrics::{ METRICS_REGISTRY, }; use crate::reconciler::ReconcileError; -use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT}; +use crate::service::{LeadershipStatus, Service, STARTUP_RECONCILE_TIMEOUT}; use anyhow::Context; use futures::Future; use hyper::header::CONTENT_TYPE; @@ -607,6 +607,13 @@ async fn handle_tenant_update_policy(mut req: Request) -> Result) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + json_response(StatusCode::OK, state.service.step_down().await) +} + async fn handle_tenant_drop(req: Request) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; @@ -734,6 +741,47 @@ struct RequestMeta { at: Instant, } +pub fn 
prologue_leadership_status_check_middleware< + B: hyper::body::HttpBody + Send + Sync + 'static, +>() -> Middleware { + Middleware::pre(move |req| async move { + let state = get_state(&req); + let leadership_status = state.service.get_leadership_status(); + + enum AllowedRoutes<'a> { + All, + Some(Vec<&'a str>), + } + + let allowed_routes = match leadership_status { + LeadershipStatus::Leader => AllowedRoutes::All, + LeadershipStatus::SteppedDown => { + // TODO: does it make sense to allow /status here? + AllowedRoutes::Some(["/control/v1/step_down", "/status", "/metrics"].to_vec()) + } + LeadershipStatus::Candidate => { + AllowedRoutes::Some(["/ready", "/status", "/metrics"].to_vec()) + } + }; + + let uri = req.uri().to_string(); + match allowed_routes { + AllowedRoutes::All => Ok(req), + AllowedRoutes::Some(allowed) if allowed.contains(&uri.as_str()) => Ok(req), + _ => { + tracing::info!( + "Request {} not allowed due to current leadership state", + req.uri() + ); + + Err(ApiError::ResourceUnavailable( + format!("Current leadership status is {leadership_status}").into(), + )) + } + } + }) +} + fn prologue_metrics_middleware( ) -> Middleware { Middleware::pre(move |req| async move { @@ -820,6 +868,7 @@ pub fn make_router( build_info: BuildInfo, ) -> RouterBuilder { let mut router = endpoint::make_router() + .middleware(prologue_leadership_status_check_middleware()) .middleware(prologue_metrics_middleware()) .middleware(epilogue_metrics_middleware()); if auth.is_some() { @@ -971,6 +1020,9 @@ pub fn make_router( RequestName("control_v1_tenant_policy"), ) }) + .put("/control/v1/step_down", |r| { + named_request_span(r, handle_step_down, RequestName("control_v1_step_down")) + }) // Tenant operations // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity. 
diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 789f96beb3..adbf5c6496 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -10,7 +10,8 @@ use storage_controller::http::make_router; use storage_controller::metrics::preinitialize_metrics; use storage_controller::persistence::Persistence; use storage_controller::service::{ - Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, + Config, Service, MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT, + RECONCILER_CONCURRENCY_DEFAULT, }; use tokio::signal::unix::SignalKind; use tokio_util::sync::CancellationToken; @@ -61,7 +62,12 @@ struct Cli { /// Grace period before marking unresponsive pageserver offline #[arg(long)] - max_unavailable_interval: Option, + max_offline_interval: Option, + + /// More tolerant grace period before marking unresponsive pageserver offline used + /// around pageserver restarts + #[arg(long)] + max_warming_up_interval: Option, /// Size threshold for automatically splitting shards (disabled by default) #[arg(long)] @@ -254,10 +260,14 @@ async fn async_main() -> anyhow::Result<()> { jwt_token: secrets.jwt_token, control_plane_jwt_token: secrets.control_plane_jwt_token, compute_hook_url: args.compute_hook_url, - max_unavailable_interval: args - .max_unavailable_interval + max_offline_interval: args + .max_offline_interval .map(humantime::Duration::into) - .unwrap_or(MAX_UNAVAILABLE_INTERVAL_DEFAULT), + .unwrap_or(MAX_OFFLINE_INTERVAL_DEFAULT), + max_warming_up_interval: args + .max_warming_up_interval + .map(humantime::Duration::into) + .unwrap_or(MAX_WARMING_UP_INTERVAL_DEFAULT), reconciler_concurrency: args .reconciler_concurrency .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT), diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index ac9f22c739..a1a4b8543d 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -13,7 +13,10 @@ use 
metrics::NeonMetrics; use once_cell::sync::Lazy; use std::sync::Mutex; -use crate::persistence::{DatabaseError, DatabaseOperation}; +use crate::{ + persistence::{DatabaseError, DatabaseOperation}, + service::LeadershipStatus, +}; pub(crate) static METRICS_REGISTRY: Lazy = Lazy::new(StorageControllerMetrics::default); @@ -81,6 +84,8 @@ pub(crate) struct StorageControllerMetricGroup { #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] pub(crate) storage_controller_database_query_latency: measured::HistogramVec, + + pub(crate) storage_controller_leadership_status: measured::GaugeVec, } impl StorageControllerMetrics { @@ -156,6 +161,12 @@ pub(crate) struct DatabaseQueryLatencyLabelGroup { pub(crate) operation: DatabaseOperation, } +#[derive(measured::LabelGroup)] +#[label(set = LeadershipStatusGroupSet)] +pub(crate) struct LeadershipStatusGroup { + pub(crate) status: LeadershipStatus, +} + #[derive(FixedCardinalityLabel, Clone, Copy)] pub(crate) enum ReconcileOutcome { #[label(rename = "ok")] diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index fff44aaf26..ea765ca123 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -3,7 +3,7 @@ use std::{str::FromStr, time::Duration}; use pageserver_api::{ controller_api::{ NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy, - TenantLocateResponseShard, UtilizationScore, + TenantLocateResponseShard, }, shard::TenantShardId, }; @@ -46,6 +46,8 @@ pub(crate) struct Node { /// whether/how they changed it. 
pub(crate) enum AvailabilityTransition { ToActive, + ToWarmingUpFromActive, + ToWarmingUpFromOffline, ToOffline, Unchanged, } @@ -90,22 +92,34 @@ impl Node { } } + pub(crate) fn get_availability(&self) -> NodeAvailability { + self.availability + } + pub(crate) fn set_availability(&mut self, availability: NodeAvailability) { + use AvailabilityTransition::*; + use NodeAvailability::WarmingUp; + match self.get_availability_transition(availability) { - AvailabilityTransition::ToActive => { + ToActive => { // Give the node a new cancellation token, effectively resetting it to un-cancelled. Any // users of previously-cloned copies of the node will still see the old cancellation // state. For example, Reconcilers in flight will have to complete and be spawned // again to realize that the node has become available. self.cancel = CancellationToken::new(); } - AvailabilityTransition::ToOffline => { + ToOffline | ToWarmingUpFromActive => { // Fire the node's cancellation token to cancel any in-flight API requests to it self.cancel.cancel(); } - AvailabilityTransition::Unchanged => {} + Unchanged | ToWarmingUpFromOffline => {} + } + + if let (WarmingUp(crnt), WarmingUp(proposed)) = (self.availability, availability) { + self.availability = WarmingUp(std::cmp::max(crnt, proposed)); + } else { + self.availability = availability; } - self.availability = availability; } /// Without modifying the availability of the node, convert the intended availability @@ -120,16 +134,10 @@ impl Node { match (self.availability, availability) { (Offline, Active(_)) => ToActive, (Active(_), Offline) => ToOffline, - // Consider the case when the storage controller handles the re-attach of a node - // before the heartbeats detect that the node is back online. We still need - // [`Service::node_configure`] to attempt reconciliations for shards with an - // unknown observed location. - // The unsavoury match arm below handles this situation. 
- (Active(lhs), Active(rhs)) - if lhs == UtilizationScore::worst() && rhs < UtilizationScore::worst() => - { - ToActive - } + (Active(_), WarmingUp(_)) => ToWarmingUpFromActive, + (WarmingUp(_), Offline) => ToOffline, + (WarmingUp(_), Active(_)) => ToActive, + (Offline, WarmingUp(_)) => ToWarmingUpFromOffline, _ => Unchanged, } } @@ -147,7 +155,7 @@ impl Node { pub(crate) fn may_schedule(&self) -> MaySchedule { let score = match self.availability { NodeAvailability::Active(score) => score, - NodeAvailability::Offline => return MaySchedule::No, + NodeAvailability::Offline | NodeAvailability::WarmingUp(_) => return MaySchedule::No, }; match self.scheduling { diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 886ceae90f..12dea2c7ef 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -12,6 +12,7 @@ use std::collections::HashMap; use std::sync::Arc; use std::time::{Duration, Instant}; use tokio_util::sync::CancellationToken; +use utils::failpoint_support; use utils::generation::Generation; use utils::id::{NodeId, TimelineId}; use utils::lsn::Lsn; @@ -749,6 +750,8 @@ impl Reconciler { self.location_config(&node, conf, None, false).await?; } + failpoint_support::sleep_millis_async!("sleep-on-reconcile-epilogue"); + Ok(()) } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index a163453dca..821f45d0c0 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -15,6 +15,7 @@ use crate::{ }, compute_hook::NotifyError, id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard}, + metrics::LeadershipStatusGroup, persistence::{AbortShardSplitStatus, TenantFilter}, reconciler::{ReconcileError, ReconcileUnits}, scheduler::{MaySchedule, ScheduleContext, ScheduleMode}, @@ -81,6 +82,7 @@ use crate::{ ReconcilerWaiter, TenantShard, }, }; +use serde::{Deserialize, Serialize}; // For operations that should 
be quick, like attaching a new tenant const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5); @@ -100,9 +102,13 @@ pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); /// How long a node may be unresponsive to heartbeats before we declare it offline. /// This must be long enough to cover node restarts as well as normal operations: in future -/// it should be separated into distinct timeouts for startup vs. normal operation -/// (``) -pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(300); +pub const MAX_OFFLINE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30); + +/// How long a node may be unresponsive to heartbeats during start up before we declare it +/// offline. This is much more lenient than [`MAX_OFFLINE_INTERVAL_DEFAULT`] since the pageserver's +/// handling of the re-attach response may take a long time and blocks heartbeats from +/// being handled on the pageserver side. +pub const MAX_WARMING_UP_INTERVAL_DEFAULT: Duration = Duration::from_secs(300); #[derive(Clone, strum_macros::Display)] enum TenantOperations { @@ -127,6 +133,24 @@ enum NodeOperations { Delete, } +/// The leadership status for the storage controller process. +/// Allowed transitions are: +/// 1. Leader -> SteppedDown +/// 2. Candidate -> Leader +#[derive(Copy, Clone, strum_macros::Display, measured::FixedCardinalityLabel)] +#[strum(serialize_all = "snake_case")] +pub(crate) enum LeadershipStatus { + /// This is the steady state where the storage controller can produce + /// side effects in the cluster. + Leader, + /// We've been notified to step down by another candidate. No reconciliations + /// take place in this state. + SteppedDown, + /// Initial state for a new storage controller instance. Will attempt to assume leadership. + #[allow(unused)] + Candidate, +} + pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128; // Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately. 
@@ -136,6 +160,8 @@ const MAX_DELAYED_RECONCILES: usize = 10000; // Top level state available to all HTTP handlers struct ServiceState { + leadership_status: LeadershipStatus, + tenants: BTreeMap, nodes: Arc>, @@ -198,7 +224,21 @@ impl ServiceState { scheduler: Scheduler, delayed_reconcile_rx: tokio::sync::mpsc::Receiver, ) -> Self { + let status = &crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_leadership_status; + + status.set( + LeadershipStatusGroup { + status: LeadershipStatus::Leader, + }, + 1, + ); + Self { + // TODO: Starting up as Leader is a transient state. Once we enable rolling + // upgrades on the k8s side, we should start up as Candidate. + leadership_status: LeadershipStatus::Leader, tenants, nodes: Arc::new(nodes), scheduler, @@ -216,6 +256,37 @@ impl ServiceState { ) { (&mut self.nodes, &mut self.tenants, &mut self.scheduler) } + + fn get_leadership_status(&self) -> LeadershipStatus { + self.leadership_status + } + + fn step_down(&mut self) { + self.leadership_status = LeadershipStatus::SteppedDown; + + let status = &crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_leadership_status; + + status.set( + LeadershipStatusGroup { + status: LeadershipStatus::SteppedDown, + }, + 1, + ); + status.set( + LeadershipStatusGroup { + status: LeadershipStatus::Leader, + }, + 0, + ); + status.set( + LeadershipStatusGroup { + status: LeadershipStatus::Candidate, + }, + 0, + ); + } } #[derive(Clone)] @@ -236,7 +307,12 @@ pub struct Config { /// Grace period within which a pageserver does not respond to heartbeats, but is still /// considered active. Once the grace period elapses, the next heartbeat failure will /// mark the pagseserver offline. - pub max_unavailable_interval: Duration, + pub max_offline_interval: Duration, + + /// Extended grace period within which pageserver may not respond to heartbeats. 
+ /// This extended grace period kicks in after the node has been drained for restart + /// and/or upon handling the re-attach request from a node. + pub max_warming_up_interval: Duration, /// How many Reconcilers may be spawned concurrently pub reconciler_concurrency: usize, @@ -269,7 +345,7 @@ pub struct Service { config: Config, persistence: Arc, compute_hook: Arc, - result_tx: tokio::sync::mpsc::UnboundedSender, + result_tx: tokio::sync::mpsc::UnboundedSender, heartbeater: Heartbeater, @@ -299,9 +375,15 @@ pub struct Service { // Process shutdown will fire this token cancel: CancellationToken, + // Child token of [`Service::cancel`] used by reconcilers + reconcilers_cancel: CancellationToken, + // Background tasks will hold this gate gate: Gate, + // Reconcilers background tasks will hold this gate + reconcilers_gate: Gate, + /// This waits for initial reconciliation with pageservers to complete. Until this barrier /// passes, it isn't safe to do any actions that mutate tenants. pub(crate) startup_complete: Barrier, @@ -388,6 +470,30 @@ struct ShardUpdate { generation: Option, } +enum StopReconciliationsReason { + ShuttingDown, + SteppingDown, +} + +impl std::fmt::Display for StopReconciliationsReason { + fn fmt(&self, writer: &mut std::fmt::Formatter) -> std::fmt::Result { + let s = match self { + Self::ShuttingDown => "Shutting down", + Self::SteppingDown => "Stepping down", + }; + write!(writer, "{}", s) + } +} + +pub(crate) enum ReconcileResultRequest { + ReconcileResult(ReconcileResult), + Stop, +} + +// TODO: move this into the storcon peer client when that gets added +#[derive(Serialize, Deserialize, Debug, Default)] +pub(crate) struct GlobalObservedState(HashMap); + impl Service { pub fn get_config(&self) -> &Config { &self.config @@ -587,6 +693,9 @@ impl Service { online_nodes.insert(node_id, utilization); } PageserverState::Offline => {} + PageserverState::WarmingUp { .. 
} => { + unreachable!("Nodes are never marked warming-up during startup reconcile") + } } } } @@ -741,7 +850,7 @@ impl Service { const BACKGROUND_RECONCILE_PERIOD: Duration = Duration::from_secs(20); let mut interval = tokio::time::interval(BACKGROUND_RECONCILE_PERIOD); - while !self.cancel.is_cancelled() { + while !self.reconcilers_cancel.is_cancelled() { tokio::select! { _ = interval.tick() => { let reconciles_spawned = self.reconcile_all(); @@ -754,7 +863,7 @@ impl Service { } } } - _ = self.cancel.cancelled() => return + _ = self.reconcilers_cancel.cancelled() => return } } } @@ -779,61 +888,54 @@ impl Service { let res = self.heartbeater.heartbeat(nodes).await; if let Ok(deltas) = res { for (node_id, state) in deltas.0 { - let (new_node, new_availability) = match state { - PageserverState::Available { - utilization, new, .. - } => ( - new, - NodeAvailability::Active(UtilizationScore( - utilization.utilization_score, - )), + let new_availability = match state { + PageserverState::Available { utilization, .. } => NodeAvailability::Active( + UtilizationScore(utilization.utilization_score), ), - PageserverState::Offline => (false, NodeAvailability::Offline), + PageserverState::WarmingUp { started_at } => { + NodeAvailability::WarmingUp(started_at) + } + PageserverState::Offline => { + // The node might have been placed in the WarmingUp state + // while the heartbeat round was on-going. Hence, filter out + // offline transitions for WarmingUp nodes that are still within + // their grace period. 
+ if let Ok(NodeAvailability::WarmingUp(started_at)) = + self.get_node(node_id).await.map(|n| n.get_availability()) + { + let now = Instant::now(); + if now - started_at >= self.config.max_warming_up_interval { + NodeAvailability::Offline + } else { + NodeAvailability::WarmingUp(started_at) + } + } else { + NodeAvailability::Offline + } + } }; - if new_node { - // When the heartbeats detect a newly added node, we don't wish - // to attempt to reconcile the shards assigned to it. The node - // is likely handling it's re-attach response, so reconciling now - // would be counterproductive. - // - // Instead, update the in-memory state with the details learned about the - // node. - let mut locked = self.inner.write().unwrap(); - let (nodes, _tenants, scheduler) = locked.parts_mut(); + // This is the code path for genuine availability transitions (i.e node + // goes unavailable and/or comes back online). + let res = self + .node_configure(node_id, Some(new_availability), None) + .await; - let mut new_nodes = (**nodes).clone(); - - if let Some(node) = new_nodes.get_mut(&node_id) { - node.set_availability(new_availability); - scheduler.node_upsert(node); + match res { + Ok(()) => {} + Err(ApiError::NotFound(_)) => { + // This should be rare, but legitimate since the heartbeats are done + // on a snapshot of the nodes. + tracing::info!("Node {} was not found after heartbeat round", node_id); } - - locked.nodes = Arc::new(new_nodes); - } else { - // This is the code path for geniune availability transitions (i.e node - // goes unavailable and/or comes back online). - let res = self - .node_configure(node_id, Some(new_availability), None) - .await; - - match res { - Ok(()) => {} - Err(ApiError::NotFound(_)) => { - // This should be rare, but legitimate since the heartbeats are done - // on a snapshot of the nodes. 
- tracing::info!( - "Node {} was not found after heartbeat round", - node_id - ); - } - Err(err) => { - tracing::error!( - "Failed to update node {} after heartbeat round: {}", - node_id, - err - ); - } + Err(err) => { + // Transition to active involves reconciling: if a node responds to a heartbeat then + // becomes unavailable again, we may get an error here. + tracing::error!( + "Failed to update node {} after heartbeat round: {}", + node_id, + err + ); } } } @@ -932,7 +1034,7 @@ impl Service { async fn process_results( &self, - mut result_rx: tokio::sync::mpsc::UnboundedReceiver, + mut result_rx: tokio::sync::mpsc::UnboundedReceiver, mut bg_compute_hook_result_rx: tokio::sync::mpsc::Receiver< Result<(), (TenantShardId, NotifyError)>, >, @@ -942,8 +1044,8 @@ impl Service { tokio::select! { r = result_rx.recv() => { match r { - Some(result) => {self.process_result(result);}, - None => {break;} + Some(ReconcileResultRequest::ReconcileResult(result)) => {self.process_result(result);}, + None | Some(ReconcileResultRequest::Stop) => {break;} } } _ = async{ @@ -969,9 +1071,6 @@ impl Service { } }; } - - // We should only fall through on shutdown - assert!(self.cancel.is_cancelled()); } async fn process_aborts( @@ -1148,9 +1247,12 @@ impl Service { tokio::sync::mpsc::channel(MAX_DELAYED_RECONCILES); let cancel = CancellationToken::new(); + let reconcilers_cancel = cancel.child_token(); + let heartbeater = Heartbeater::new( config.jwt_token.clone(), - config.max_unavailable_interval, + config.max_offline_interval, + config.max_warming_up_interval, cancel.clone(), ); let this = Arc::new(Self { @@ -1172,7 +1274,9 @@ impl Service { abort_tx, startup_complete: startup_complete.clone(), cancel, + reconcilers_cancel, gate: Gate::default(), + reconcilers_gate: Gate::default(), tenant_op_locks: Default::default(), node_op_locks: Default::default(), }); @@ -1662,21 +1766,23 @@ impl Service { | NodeSchedulingPolicy::Filling ); - if !node.is_available() || reset_scheduling { - let 
mut new_nodes = (**nodes).clone(); - if let Some(node) = new_nodes.get_mut(&reattach_req.node_id) { - if !node.is_available() { - node.set_availability(NodeAvailability::Active(UtilizationScore::worst())); - } - - if reset_scheduling { - node.set_scheduling(NodeSchedulingPolicy::Active); - } - - scheduler.node_upsert(node); - let new_nodes = Arc::new(new_nodes); - *nodes = new_nodes; + let mut new_nodes = (**nodes).clone(); + if let Some(node) = new_nodes.get_mut(&reattach_req.node_id) { + if reset_scheduling { + node.set_scheduling(NodeSchedulingPolicy::Active); } + + tracing::info!("Marking {} warming-up on reattach", reattach_req.node_id); + node.set_availability(NodeAvailability::WarmingUp(std::time::Instant::now())); + + scheduler.node_upsert(node); + let new_nodes = Arc::new(new_nodes); + *nodes = new_nodes; + } else { + tracing::error!( + "Reattaching node {} was removed while processing the request", + reattach_req.node_id + ); } } @@ -4717,6 +4823,15 @@ impl Service { // TODO: in the background, we should balance work back onto this pageserver } + // No action required for the intermediate unavailable state. + // When we transition into active or offline from the unavailable state, + // the correct handling above will kick in. + AvailabilityTransition::ToWarmingUpFromActive => { + tracing::info!("Node {} transition to unavailable from active", node_id); + } + AvailabilityTransition::ToWarmingUpFromOffline => { + tracing::info!("Node {} transition to unavailable from offline", node_id); + } AvailabilityTransition::Unchanged => { tracing::debug!("Node {} no availability change during config", node_id); } @@ -5115,7 +5230,7 @@ impl Service { } }; - let Ok(gate_guard) = self.gate.enter() else { + let Ok(gate_guard) = self.reconcilers_gate.enter() else { // Gate closed: we're shutting down, drop out. 
return None; }; @@ -5128,7 +5243,7 @@ impl Service { &self.persistence, units, gate_guard, - &self.cancel, + &self.reconcilers_cancel, ) } @@ -5574,18 +5689,27 @@ impl Service { Ok(std::cmp::max(waiter_count, reconciles_spawned)) } + async fn stop_reconciliations(&self, reason: StopReconciliationsReason) { + // Cancel all on-going reconciles and wait for them to exit the gate. + tracing::info!("{reason}: cancelling and waiting for in-flight reconciles"); + self.reconcilers_cancel.cancel(); + self.reconcilers_gate.close().await; + + // Signal the background loop in [`Service::process_results`] to exit once + // it has processed the results from all the reconciles we cancelled earlier. + tracing::info!("{reason}: processing results from previously in-flight reconciles"); + self.result_tx.send(ReconcileResultRequest::Stop).ok(); + self.result_tx.closed().await; + } + pub async fn shutdown(&self) { - // Note that this already stops processing any results from reconciles: so - // we do not expect that our [`TenantShard`] objects will reach a neat - // final state. + self.stop_reconciliations(StopReconciliationsReason::ShuttingDown) + .await; + + // Background tasks hold gate guards: this notifies them of the cancellation and + // waits for them all to complete. + tracing::info!("Shutting down: cancelling and waiting for background tasks to exit"); self.cancel.cancel(); - - // The cancellation tokens in [`crate::reconciler::Reconciler`] are children - // of our cancellation token, so we do not need to explicitly cancel each of - // them. - - // Background tasks and reconcilers hold gate guards: this waits for them all - // to complete. 
self.gate.close().await; } @@ -5970,4 +6094,27 @@ impl Service { Ok(()) } + + pub(crate) fn get_leadership_status(&self) -> LeadershipStatus { + self.inner.read().unwrap().get_leadership_status() + } + + pub(crate) async fn step_down(&self) -> GlobalObservedState { + tracing::info!("Received step down request from peer"); + + self.inner.write().unwrap().step_down(); + // TODO: would it make sense to have a time-out for this? + self.stop_reconciliations(StopReconciliationsReason::SteppingDown) + .await; + + let mut global_observed = GlobalObservedState::default(); + let locked = self.inner.read().unwrap(); + for (tid, tenant_shard) in locked.tenants.iter() { + global_observed + .0 + .insert(*tid, tenant_shard.observed.clone()); + } + + global_observed + } } diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index ee2ba6c4ee..e250f29f98 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -9,6 +9,7 @@ use crate::{ persistence::TenantShardPersistence, reconciler::ReconcileUnits, scheduler::{AffinityScore, MaySchedule, RefCountUpdate, ScheduleContext}, + service::ReconcileResultRequest, }; use pageserver_api::controller_api::{ NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, @@ -17,7 +18,7 @@ use pageserver_api::{ models::{LocationConfig, LocationConfigMode, TenantConfig}, shard::{ShardIdentity, TenantShardId}, }; -use serde::Serialize; +use serde::{Deserialize, Serialize}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::{instrument, Instrument}; @@ -283,7 +284,7 @@ impl Drop for IntentState { } } -#[derive(Default, Clone, Serialize)] +#[derive(Default, Clone, Serialize, Deserialize, Debug)] pub(crate) struct ObservedState { pub(crate) locations: HashMap, } @@ -297,7 +298,7 @@ pub(crate) struct ObservedState { /// what it is (e.g. 
we failed partway through configuring it) /// * Instance exists with conf==Some: this tells us what we last successfully configured on this node, /// and that configuration will still be present unless something external interfered. -#[derive(Clone, Serialize)] +#[derive(Clone, Serialize, Deserialize, Debug)] pub(crate) struct ObservedStateLocation { /// If None, it means we do not know the status of this shard's location on this node, but /// we know that we might have some state on this node. @@ -1059,7 +1060,7 @@ impl TenantShard { #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] pub(crate) fn spawn_reconciler( &mut self, - result_tx: &tokio::sync::mpsc::UnboundedSender, + result_tx: &tokio::sync::mpsc::UnboundedSender, pageservers: &Arc>, compute_hook: &Arc, service_config: &service::Config, @@ -1183,7 +1184,9 @@ impl TenantShard { pending_compute_notification: reconciler.compute_notify_failure, }; - result_tx.send(result).ok(); + result_tx + .send(ReconcileResultRequest::ReconcileResult(result)) + .ok(); } .instrument(reconciler_span), ); diff --git a/storage_scrubber/Cargo.toml b/storage_scrubber/Cargo.toml index 5233afbebe..7d5b7d10b9 100644 --- a/storage_scrubber/Cargo.toml +++ b/storage_scrubber/Cargo.toml @@ -49,6 +49,5 @@ tracing.workspace = true tracing-subscriber.workspace = true clap.workspace = true tracing-appender = "0.2" -histogram = "0.7" futures.workspace = true diff --git a/storage_scrubber/README.md b/storage_scrubber/README.md index 0930f343ec..9fbd92feef 100644 --- a/storage_scrubber/README.md +++ b/storage_scrubber/README.md @@ -45,7 +45,11 @@ processing by the `purge-garbage` subcommand. 
Example: -`env AWS_PROFILE=dev REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- find-garbage --node-kind=pageserver --depth=tenant --output-path=eu-west-1-garbage.json` +`env AWS_PROFILE=dev REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=[client_key] CLOUD_ADMIN_API_URL=[url] cargo run --release -- find-garbage --node-kind=pageserver --depth=tenant --output-path=eu-west-1-garbage.json` + +Note that `CLOUD_ADMIN_API_TOKEN` can be obtained from https://console-stage.neon.build/app/settings/api-keys (for staging) or https://console.neon.tech/app/settings/api-keys (for production). Despite the confusing env var name, this is not the control plane admin JWT key. Though anyone can generate that API key, you still need admin permission in order to access all projects in the region. + +And note that `CLOUD_ADMIN_API_URL` should include the region in the admin URL due to the control plane / console split. For example, `https://console-stage.neon.build/regions/aws-us-east-2/api/v1/admin` for the staging us-east-2 region. #### `purge-garbage` @@ -61,7 +65,7 @@ to pass them on the command line Example: -`env AWS_PROFILE=dev cargo run --release -- purge-garbage --node-kind=pageserver --depth=tenant --input-path=eu-west-1-garbage.json` +`env AWS_PROFILE=dev cargo run --release -- purge-garbage --input-path=eu-west-1-garbage.json` Add the `--delete` argument before `purge-garbage` to enable deletion. This is intentionally not provided inline in the example above to avoid accidents.
Without the `--delete` flag diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index f687b24320..a35a58aedd 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -2,6 +2,7 @@ use std::collections::{HashMap, HashSet}; use anyhow::Context; use aws_sdk_s3::Client; +use pageserver::tenant::layer_map::LayerMap; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver_api::shard::ShardIndex; use tracing::{error, info, warn}; @@ -12,7 +13,7 @@ use crate::cloud_admin_api::BranchData; use crate::metadata_stream::stream_listing; use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId}; use futures_util::StreamExt; -use pageserver::tenant::remote_timeline_client::parse_remote_index_path; +use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path}; use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::IndexPart; use remote_storage::RemotePath; @@ -41,7 +42,9 @@ impl TimelineAnalysis { } } -pub(crate) fn branch_cleanup_and_check_errors( +pub(crate) async fn branch_cleanup_and_check_errors( + s3_client: &Client, + target: &RootTarget, id: &TenantShardTimelineId, tenant_objects: &mut TenantObjectListing, s3_active_branch: Option<&BranchData>, @@ -84,16 +87,19 @@ pub(crate) fn branch_cleanup_and_check_errors( .push(format!("index_part.json version: {}", index_part.version())) } - if &index_part.version() != IndexPart::KNOWN_VERSIONS.last().unwrap() { - result.warnings.push(format!( + let mut newest_versions = IndexPart::KNOWN_VERSIONS.iter().rev().take(2); + if !newest_versions.any(|ip| ip == &index_part.version()) { + info!( "index_part.json version is not latest: {}", index_part.version() - )) + ); } if index_part.metadata.disk_consistent_lsn() != index_part.duplicated_disk_consistent_lsn() { + // Tech debt: let's get rid of one of these, they are redundant + // https://github.com/neondatabase/neon/issues/8343 
result.errors.push(format!( "Mismatching disk_consistent_lsn in TimelineMetadata ({}) and in the index_part ({})", index_part.metadata.disk_consistent_lsn(), @@ -102,8 +108,16 @@ pub(crate) fn branch_cleanup_and_check_errors( } if index_part.layer_metadata.is_empty() { - // not an error, can happen for branches with zero writes, but notice that - info!("index_part.json has no layers"); + if index_part.metadata.ancestor_timeline().is_none() { + // The initial timeline with no ancestor should ALWAYS have layers. + result.errors.push( + "index_part.json has no layers (ancestor_timeline=None)" + .to_string(), + ); + } else { + // Not an error, can happen for branches with zero writes, but notice that + info!("index_part.json has no layers (ancestor_timeline exists)"); + } } for (layer, metadata) in index_part.layer_metadata { @@ -114,16 +128,41 @@ pub(crate) fn branch_cleanup_and_check_errors( } if !tenant_objects.check_ref(id.timeline_id, &layer, &metadata) { - // FIXME: this will emit false positives if an index was - // uploaded concurrently with our scan. To make this check - // correct, we need to try sending a HEAD request for the - // layer we think is missing. - result.errors.push(format!( - "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage", - layer, - metadata.generation.get_suffix(), - metadata.shard - )) + let path = remote_layer_path( + &id.tenant_shard_id.tenant_id, + &id.timeline_id, + metadata.shard, + &layer, + metadata.generation, + ); + + // HEAD request used here to address a race condition when an index was uploaded concurrently + // with our scan. We check if the object is uploaded to S3 after taking the listing snapshot. + let response = s3_client + .head_object() + .bucket(target.bucket_name()) + .key(path.get_path().as_str()) + .send() + .await; + + if response.is_err() { + // Object is not present. 
+ let is_l0 = LayerMap::is_l0(layer.key_range()); + + let msg = format!( + "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {})", + layer, + metadata.generation.get_suffix(), + metadata.shard, + is_l0, + ); + + if is_l0 { + result.warnings.push(msg); + } else { + result.errors.push(msg); + } + } } } } @@ -303,6 +342,9 @@ pub(crate) async fn list_timeline_blobs( tracing::debug!("initdb archive {key}"); initdb_archive = true; } + Some("initdb-preserved.tar.zst") => { + tracing::info!("initdb archive preserved {key}"); + } Some(maybe_layer_name) => match parse_layer_object_name(maybe_layer_name) { Ok((new_layer, gen)) => { tracing::debug!("Parsed layer key: {} {:?}", new_layer, gen); diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs index 0450851988..333269ec7e 100644 --- a/storage_scrubber/src/garbage.rs +++ b/storage_scrubber/src/garbage.rs @@ -8,21 +8,19 @@ use std::{ }; use anyhow::Context; -use aws_sdk_s3::{ - types::{Delete, ObjectIdentifier}, - Client, -}; use futures_util::TryStreamExt; use pageserver_api::shard::TenantShardId; +use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePath}; use serde::{Deserialize, Serialize}; use tokio_stream::StreamExt; +use tokio_util::sync::CancellationToken; use utils::id::TenantId; use crate::{ cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData}, - init_remote, - metadata_stream::{stream_listing, stream_tenant_timelines, stream_tenants}, - BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId, TraversingDepth, + init_remote, init_remote_generic, + metadata_stream::{stream_tenant_timelines, stream_tenants}, + BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth, }; #[derive(Serialize, Deserialize, Debug)] @@ -324,41 +322,45 @@ impl std::fmt::Display for PurgeMode { } pub async fn get_tenant_objects( - s3_client: &Arc, - target: RootTarget, + s3_client: 
&GenericRemoteStorage, tenant_shard_id: TenantShardId, -) -> anyhow::Result> { +) -> anyhow::Result> { tracing::debug!("Listing objects in tenant {tenant_shard_id}"); + let tenant_root = super::remote_tenant_path(&tenant_shard_id); + // TODO: apply extra validation based on object modification time. Don't purge // tenants where any timeline's index_part.json has been touched recently. - let mut tenant_root = target.tenant_root(&tenant_shard_id); - - // Remove delimiter, so that object listing lists all keys in the prefix and not just - // common prefixes. - tenant_root.delimiter = String::new(); - - let key_stream = stream_listing(s3_client, &tenant_root); - key_stream.try_collect().await + let list = s3_client + .list( + Some(&tenant_root), + ListingMode::NoDelimiter, + None, + &CancellationToken::new(), + ) + .await?; + Ok(list.keys) } pub async fn get_timeline_objects( - s3_client: &Arc, - target: RootTarget, + s3_client: &GenericRemoteStorage, ttid: TenantShardTimelineId, -) -> anyhow::Result> { +) -> anyhow::Result> { tracing::debug!("Listing objects in timeline {ttid}"); - let mut timeline_root = target.timeline_root(&ttid); + let timeline_root = super::remote_timeline_path_id(&ttid); // TODO: apply extra validation based on object modification time. Don't purge // timelines whose index_part.json has been touched recently. - // Remove delimiter, so that object listing lists all keys in the prefix and not just - // common prefixes. - timeline_root.delimiter = String::new(); - let key_stream = stream_listing(s3_client, &timeline_root); - - key_stream.try_collect().await + let list = s3_client + .list( + Some(&timeline_root), + ListingMode::NoDelimiter, + None, + &CancellationToken::new(), + ) + .await?; + Ok(list.keys) } const MAX_KEYS_PER_DELETE: usize = 1000; @@ -369,16 +371,19 @@ const MAX_KEYS_PER_DELETE: usize = 1000; /// MAX_KEYS_PER_DELETE keys are left. /// `num_deleted` returns number of deleted keys. 
async fn do_delete( - s3_client: &Arc, - bucket_name: &str, - keys: &mut Vec, + remote_client: &GenericRemoteStorage, + keys: &mut Vec, dry_run: bool, drain: bool, progress_tracker: &mut DeletionProgressTracker, ) -> anyhow::Result<()> { + let cancel = CancellationToken::new(); while (!keys.is_empty() && drain) || (keys.len() >= MAX_KEYS_PER_DELETE) { let request_keys = keys.split_off(keys.len() - (std::cmp::min(MAX_KEYS_PER_DELETE, keys.len()))); + + let request_keys: Vec = request_keys.into_iter().map(|o| o.key).collect(); + let num_deleted = request_keys.len(); if dry_run { tracing::info!("Dry-run deletion of objects: "); @@ -386,14 +391,10 @@ async fn do_delete( tracing::info!(" {k:?}"); } } else { - let delete_request = s3_client - .delete_objects() - .bucket(bucket_name) - .delete(Delete::builder().set_objects(Some(request_keys)).build()?); - delete_request - .send() + remote_client + .delete_objects(&request_keys, &cancel) .await - .context("DeleteObjects request")?; + .context("deletetion request")?; progress_tracker.register(num_deleted); } } @@ -431,8 +432,13 @@ pub async fn purge_garbage( input_path ); - let (s3_client, target) = - init_remote(garbage_list.bucket_config.clone(), garbage_list.node_kind).await?; + let remote_client = + init_remote_generic(garbage_list.bucket_config.clone(), garbage_list.node_kind).await?; + + assert_eq!( + &garbage_list.bucket_config.bucket, + remote_client.bucket_name().unwrap() + ); // Sanity checks on the incoming list if garbage_list.active_tenant_count == 0 { @@ -464,16 +470,13 @@ pub async fn purge_garbage( let items = tokio_stream::iter(filtered_items.map(Ok)); let get_objects_results = items.map_ok(|i| { - let s3_client = s3_client.clone(); - let target = target.clone(); + let remote_client = remote_client.clone(); async move { match i.entity { GarbageEntity::Tenant(tenant_id) => { - get_tenant_objects(&s3_client, target, tenant_id).await - } - GarbageEntity::Timeline(ttid) => { - get_timeline_objects(&s3_client, 
target, ttid).await + get_tenant_objects(&remote_client, tenant_id).await } + GarbageEntity::Timeline(ttid) => get_timeline_objects(&remote_client, ttid).await, } } }); @@ -487,8 +490,7 @@ pub async fn purge_garbage( objects_to_delete.append(&mut object_list); if objects_to_delete.len() >= MAX_KEYS_PER_DELETE { do_delete( - &s3_client, - &garbage_list.bucket_config.bucket, + &remote_client, &mut objects_to_delete, dry_run, false, @@ -499,8 +501,7 @@ pub async fn purge_garbage( } do_delete( - &s3_client, - &garbage_list.bucket_config.bucket, + &remote_client, &mut objects_to_delete, dry_run, true, diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index a0b6d7ea30..5c64e7e459 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -22,9 +22,13 @@ use aws_sdk_s3::Client; use camino::{Utf8Path, Utf8PathBuf}; use clap::ValueEnum; +use pageserver::tenant::remote_timeline_client::{remote_tenant_path, remote_timeline_path}; use pageserver::tenant::TENANTS_SEGMENT_NAME; use pageserver_api::shard::TenantShardId; -use remote_storage::RemotePath; +use remote_storage::{ + GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config, + DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, +}; use reqwest::Url; use serde::{Deserialize, Serialize}; use tokio::io::AsyncReadExt; @@ -215,6 +219,10 @@ impl RootTarget { } } +pub fn remote_timeline_path_id(id: &TenantShardTimelineId) -> RemotePath { + remote_timeline_path(&id.tenant_shard_id, &id.timeline_id) +} + #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct BucketConfig { @@ -296,7 +304,7 @@ pub fn init_logging(file_name: &str) -> Option { } } -pub async fn init_s3_client(bucket_region: Region) -> Client { +async fn init_s3_client(bucket_region: Region) -> Client { let config = aws_config::defaults(aws_config::BehaviorVersion::v2024_03_28()) .region(bucket_region) .load() @@ -304,6 +312,13 @@ pub async 
fn init_s3_client(bucket_region: Region) -> Client { Client::new(&config) } +fn default_prefix_in_bucket(node_kind: NodeKind) -> &'static str { + match node_kind { + NodeKind::Pageserver => "pageserver/v1/", + NodeKind::Safekeeper => "wal/", + } +} + async fn init_remote( bucket_config: BucketConfig, node_kind: NodeKind, @@ -311,18 +326,17 @@ async fn init_remote( let bucket_region = Region::new(bucket_config.region); let delimiter = "/".to_string(); let s3_client = Arc::new(init_s3_client(bucket_region).await); + let default_prefix = default_prefix_in_bucket(node_kind).to_string(); let s3_root = match node_kind { NodeKind::Pageserver => RootTarget::Pageserver(S3Target { bucket_name: bucket_config.bucket, - prefix_in_bucket: bucket_config - .prefix_in_bucket - .unwrap_or("pageserver/v1".to_string()), + prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or(default_prefix), delimiter, }), NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target { bucket_name: bucket_config.bucket, - prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or("wal/".to_string()), + prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or(default_prefix), delimiter, }), }; @@ -330,6 +344,31 @@ async fn init_remote( Ok((s3_client, s3_root)) } +async fn init_remote_generic( + bucket_config: BucketConfig, + node_kind: NodeKind, +) -> anyhow::Result { + let endpoint = env::var("AWS_ENDPOINT_URL").ok(); + let default_prefix = default_prefix_in_bucket(node_kind).to_string(); + let prefix_in_bucket = Some(bucket_config.prefix_in_bucket.unwrap_or(default_prefix)); + let storage = S3Config { + bucket_name: bucket_config.bucket, + bucket_region: bucket_config.region, + prefix_in_bucket, + endpoint, + concurrency_limit: DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT + .try_into() + .unwrap(), + max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, + upload_storage_class: None, + }; + let storage_config = RemoteStorageConfig { + storage: RemoteStorageKind::AwsS3(storage), + timeout: 
RemoteStorageConfig::DEFAULT_TIMEOUT, + }; + GenericRemoteStorage::from_config(&storage_config).await +} + async fn list_objects_with_retries( s3_client: &Client, s3_target: &S3Target, diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs index df4f29acf7..fbd60f93bb 100644 --- a/storage_scrubber/src/scan_pageserver_metadata.rs +++ b/storage_scrubber/src/scan_pageserver_metadata.rs @@ -8,12 +8,11 @@ use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; use aws_sdk_s3::Client; use futures_util::{StreamExt, TryStreamExt}; -use histogram::Histogram; use pageserver::tenant::remote_timeline_client::remote_layer_path; -use pageserver::tenant::IndexPart; use pageserver_api::shard::TenantShardId; use serde::Serialize; use utils::id::TenantId; +use utils::shard::ShardCount; #[derive(Serialize)] pub struct MetadataSummary { @@ -24,66 +23,6 @@ pub struct MetadataSummary { with_warnings: HashSet, with_orphans: HashSet, indices_by_version: HashMap, - - layer_count: MinMaxHisto, - timeline_size_bytes: MinMaxHisto, - layer_size_bytes: MinMaxHisto, -} - -/// A histogram plus minimum and maximum tracking -#[derive(Serialize)] -struct MinMaxHisto { - #[serde(skip)] - histo: Histogram, - min: u64, - max: u64, -} - -impl MinMaxHisto { - fn new() -> Self { - Self { - histo: histogram::Histogram::builder() - .build() - .expect("Bad histogram params"), - min: u64::MAX, - max: 0, - } - } - - fn sample(&mut self, v: u64) -> Result<(), histogram::Error> { - self.min = std::cmp::min(self.min, v); - self.max = std::cmp::max(self.max, v); - let r = self.histo.increment(v, 1); - - if r.is_err() { - tracing::warn!("Bad histogram sample: {v}"); - } - - r - } - - fn oneline(&self) -> String { - let percentiles = match self.histo.percentiles(&[1.0, 10.0, 50.0, 90.0, 99.0]) { - Ok(p) => p, - Err(e) => return format!("No data: {}", e), - }; - 
- let percentiles: Vec = percentiles - .iter() - .map(|p| p.bucket().low() + p.bucket().high() / 2) - .collect(); - - format!( - "min {}, 1% {}, 10% {}, 50% {}, 90% {}, 99% {}, max {}", - self.min, - percentiles[0], - percentiles[1], - percentiles[2], - percentiles[3], - percentiles[4], - self.max, - ) - } } impl MetadataSummary { @@ -96,25 +35,9 @@ impl MetadataSummary { with_warnings: HashSet::new(), with_orphans: HashSet::new(), indices_by_version: HashMap::new(), - layer_count: MinMaxHisto::new(), - timeline_size_bytes: MinMaxHisto::new(), - layer_size_bytes: MinMaxHisto::new(), } } - fn update_histograms(&mut self, index_part: &IndexPart) -> Result<(), histogram::Error> { - self.layer_count - .sample(index_part.layer_metadata.len() as u64)?; - let mut total_size: u64 = 0; - for meta in index_part.layer_metadata.values() { - total_size += meta.file_size; - self.layer_size_bytes.sample(meta.file_size)?; - } - self.timeline_size_bytes.sample(total_size)?; - - Ok(()) - } - fn update_data(&mut self, data: &S3TimelineBlobData) { self.timeline_shard_count += 1; if let BlobDataParseResult::Parsed { @@ -127,14 +50,6 @@ impl MetadataSummary { .indices_by_version .entry(index_part.version()) .or_insert(0) += 1; - - if let Err(e) = self.update_histograms(index_part) { - // Value out of range? 
Warn that the results are untrustworthy - tracing::warn!( - "Error updating histograms, summary stats may be wrong: {}", - e - ); - } } } @@ -169,9 +84,6 @@ With errors: {} With warnings: {} With orphan layers: {} Index versions: {version_summary} -Timeline size bytes: {} -Layer size bytes: {} -Timeline layer count: {} ", self.tenant_count, self.timeline_count, @@ -179,9 +91,6 @@ Timeline layer count: {} self.with_errors.len(), self.with_warnings.len(), self.with_orphans.len(), - self.timeline_size_bytes.oneline(), - self.layer_size_bytes.oneline(), - self.layer_count.oneline(), ) } @@ -235,33 +144,60 @@ pub async fn scan_metadata( let mut tenant_objects = TenantObjectListing::default(); let mut tenant_timeline_results = Vec::new(); - fn analyze_tenant( + async fn analyze_tenant( + s3_client: &Client, + target: &RootTarget, tenant_id: TenantId, summary: &mut MetadataSummary, mut tenant_objects: TenantObjectListing, timelines: Vec<(TenantShardTimelineId, S3TimelineBlobData)>, + highest_shard_count: ShardCount, ) { summary.tenant_count += 1; let mut timeline_ids = HashSet::new(); let mut timeline_generations = HashMap::new(); for (ttid, data) in timelines { - timeline_ids.insert(ttid.timeline_id); - // Stash the generation of each timeline, for later use identifying orphan layers - if let BlobDataParseResult::Parsed { - index_part: _index_part, - index_part_generation, - s3_layers: _s3_layers, - } = &data.blob_data - { - timeline_generations.insert(ttid, *index_part_generation); - } + if ttid.tenant_shard_id.shard_count == highest_shard_count { + // Only analyze `TenantShardId`s with highest shard count. - // Apply checks to this timeline shard's metadata, and in the process update `tenant_objects` - // reference counts for layers across the tenant. 
- let analysis = - branch_cleanup_and_check_errors(&ttid, &mut tenant_objects, None, None, Some(data)); - summary.update_analysis(&ttid, &analysis); + // Stash the generation of each timeline, for later use identifying orphan layers + if let BlobDataParseResult::Parsed { + index_part, + index_part_generation, + s3_layers: _s3_layers, + } = &data.blob_data + { + if index_part.deleted_at.is_some() { + // skip deleted timeline. + tracing::info!("Skip analysis of {} b/c timeline is already deleted", ttid); + continue; + } + timeline_generations.insert(ttid, *index_part_generation); + } + + // Apply checks to this timeline shard's metadata, and in the process update `tenant_objects` + // reference counts for layers across the tenant. + let analysis = branch_cleanup_and_check_errors( + s3_client, + target, + &ttid, + &mut tenant_objects, + None, + None, + Some(data), + ) + .await; + summary.update_analysis(&ttid, &analysis); + + timeline_ids.insert(ttid.timeline_id); + } else { + tracing::info!( + "Skip analysis of {} b/c a lower shard count than {}", + ttid, + highest_shard_count.0, + ); + } } summary.timeline_count += timeline_ids.len(); @@ -309,18 +245,35 @@ pub async fn scan_metadata( // all results for the same tenant will be adjacent. We accumulate these, // and then call `analyze_tenant` to flush, when we see the next tenant ID. 
let mut summary = MetadataSummary::new(); + let mut highest_shard_count = ShardCount::MIN; while let Some(i) = timelines.next().await { let (ttid, data) = i?; summary.update_data(&data); match tenant_id { - None => tenant_id = Some(ttid.tenant_shard_id.tenant_id), + None => { + tenant_id = Some(ttid.tenant_shard_id.tenant_id); + highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count); + } Some(prev_tenant_id) => { if prev_tenant_id != ttid.tenant_shard_id.tenant_id { + // New tenant: analyze this tenant's timelines, clear accumulated tenant_timeline_results let tenant_objects = std::mem::take(&mut tenant_objects); let timelines = std::mem::take(&mut tenant_timeline_results); - analyze_tenant(prev_tenant_id, &mut summary, tenant_objects, timelines); + analyze_tenant( + &s3_client, + &target, + prev_tenant_id, + &mut summary, + tenant_objects, + timelines, + highest_shard_count, + ) + .await; tenant_id = Some(ttid.tenant_shard_id.tenant_id); + highest_shard_count = ttid.tenant_shard_id.shard_count; + } else { + highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count); } } } @@ -338,11 +291,15 @@ pub async fn scan_metadata( if !tenant_timeline_results.is_empty() { analyze_tenant( + &s3_client, + &target, tenant_id.expect("Must be set if results are present"), &mut summary, tenant_objects, tenant_timeline_results, - ); + highest_shard_count, + ) + .await; } Ok(summary) diff --git a/test_runner/README.md b/test_runner/README.md index 7d95634ea8..e2f26a19ce 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -81,7 +81,7 @@ should go. Useful parameters and commands: `--preserve-database-files` to preserve pageserver (layer) and safekeer (segment) timeline files on disk -after running a test suite. Such files might be large, so removed by default; but might be useful for debugging or creation of svg images with layer file contents. +after running a test suite. 
Such files might be large, so removed by default; but might be useful for debugging or creation of svg images with layer file contents. If `NeonEnvBuilder#preserve_database_files` set to `True` for a particular test, the whole `repo` directory will be attached to Allure report (thus uploaded to S3) as `everything.tar.zst` for this test. Let stdout, stderr and `INFO` log messages go to the terminal instead of capturing them: `./scripts/pytest -s --log-cli-level=INFO ...` diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index 038f557cc8..0c36cd6ef7 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -222,6 +222,8 @@ class NeonBenchmarker: function by the zenbenchmark fixture """ + PROPERTY_PREFIX = "neon_benchmarker_" + def __init__(self, property_recorder: Callable[[str, object], None]): # property recorder here is a pytest fixture provided by junitxml module # https://docs.pytest.org/en/6.2.x/reference.html#pytest.junitxml.record_property @@ -238,7 +240,7 @@ class NeonBenchmarker: Record a benchmark result. 
""" # just to namespace the value - name = f"neon_benchmarker_{metric_name}" + name = f"{self.PROPERTY_PREFIX}_{metric_name}" self.property_recorder( name, { @@ -249,6 +251,18 @@ class NeonBenchmarker: }, ) + @classmethod + def records( + cls, user_properties: list[tuple[str, object]] + ) -> Iterator[tuple[str, dict[str, object]]]: + """ + Yield all records related to benchmarks + """ + for property_name, recorded_property in user_properties: + if property_name.startswith(cls.PROPERTY_PREFIX): + assert isinstance(recorded_property, dict) + yield recorded_property["name"], recorded_property + @contextmanager def record_duration(self, metric_name: str) -> Iterator[None]: """ @@ -425,10 +439,11 @@ def zenbenchmark( yield benchmarker results = {} - for _, recorded_property in request.node.user_properties: + for _, recorded_property in NeonBenchmarker.records(request.node.user_properties): name = recorded_property["name"] value = str(recorded_property["value"]) - if (unit := recorded_property["unit"].strip()) != "": + unit = str(recorded_property["unit"]).strip() + if unit != "": value += f" {unit}" results[name] = value @@ -477,7 +492,7 @@ def pytest_terminal_summary( for test_report in terminalreporter.stats.get("passed", []): result_entry = [] - for _, recorded_property in test_report.user_properties: + for _, recorded_property in NeonBenchmarker.records(test_report.user_properties): if not is_header_printed: terminalreporter.section("Benchmark results", "-") is_header_printed = True diff --git a/test_runner/fixtures/common_types.py b/test_runner/fixtures/common_types.py index 147264762c..b63dfd4e47 100644 --- a/test_runner/fixtures/common_types.py +++ b/test_runner/fixtures/common_types.py @@ -143,6 +143,9 @@ class TimelineId(Id): def __repr__(self) -> str: return f'TimelineId("{self.id.hex()}")' + def __str__(self) -> str: + return self.id.hex() + # Workaround for compat with python 3.9, which does not have `typing.Self` TTenantShardId = TypeVar("TTenantShardId", 
bound="TenantShardId") diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 4836d42db5..509f41366b 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -133,6 +133,8 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = ( *histogram("pageserver_remote_operation_seconds"), *histogram("pageserver_io_operations_seconds"), "pageserver_tenant_states_count", + "pageserver_circuit_breaker_broken_total", + "pageserver_circuit_breaker_unbroken_total", ) PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 9e39457c06..c5fffc2af6 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -523,7 +523,7 @@ class NeonEnvBuilder: self.preserve_database_files = preserve_database_files self.initial_tenant = initial_tenant or TenantId.generate() self.initial_timeline = initial_timeline or TimelineId.generate() - self.scrub_on_exit = False + self.enable_scrub_on_exit = True self.test_output_dir = test_output_dir self.test_overlay_dir = test_overlay_dir self.overlay_mounts_created_by_us: List[Tuple[str, Path]] = [] @@ -542,21 +542,6 @@ class NeonEnvBuilder: f"Overriding pageserver default compaction algorithm to {self.pageserver_default_tenant_config_compaction_algorithm}" ) - self.pageserver_get_vectored_impl: Optional[str] = None - if os.getenv("PAGESERVER_GET_VECTORED_IMPL", "") == "vectored": - self.pageserver_get_vectored_impl = "vectored" - log.debug('Overriding pageserver get_vectored_impl config to "vectored"') - - self.pageserver_get_impl: Optional[str] = None - if os.getenv("PAGESERVER_GET_IMPL", "") == "vectored": - self.pageserver_get_impl = "vectored" - log.debug('Overriding pageserver get_impl config to "vectored"') - - self.pageserver_validate_vectored_get: Optional[bool] = None - if (validate := os.getenv("PAGESERVER_VALIDATE_VEC_GET")) is not None: - 
self.pageserver_validate_vectored_get = bool(validate) - log.debug(f'Overriding pageserver validate_vectored_get config to "{validate}"') - self.pageserver_aux_file_policy = pageserver_aux_file_policy self.safekeeper_extra_opts = safekeeper_extra_opts @@ -852,6 +837,13 @@ class NeonEnvBuilder: ) ident_state_dir.rmdir() # should be empty since we moved `upper` out + def disable_scrub_on_exit(self): + """ + Some tests intentionally leave the remote storage contents empty or corrupt, + so it doesn't make sense to do the usual scrub at the end of the test. + """ + self.enable_scrub_on_exit = False + def overlay_cleanup_teardown(self): """ Unmount the overlayfs mounts created by `self.overlay_mount()`. @@ -877,23 +869,6 @@ class NeonEnvBuilder: # assert all overlayfs mounts in our test directory are gone assert [] == list(overlayfs.iter_mounts_beneath(self.test_overlay_dir)) - def enable_scrub_on_exit(self): - """ - Call this if you would like the fixture to automatically run - storage_scrubber at the end of the test, as a bidirectional test - that the scrubber is working properly, and that the code within - the test didn't produce any invalid remote state. - """ - - if not isinstance(self.pageserver_remote_storage, S3Storage): - # The scrubber can't talk to e.g. LocalFS -- it needs - # an HTTP endpoint (mock is fine) to connect to. 
- raise RuntimeError( - "Cannot scrub with remote_storage={self.pageserver_remote_storage}, require an S3 endpoint" - ) - - self.scrub_on_exit = True - def enable_pageserver_remote_storage( self, remote_storage_kind: RemoteStorageKind, @@ -995,7 +970,12 @@ class NeonEnvBuilder: ) cleanup_error = None - if self.scrub_on_exit: + # If we are running with S3Storage (required by the scrubber), check that whatever the test + # did does not generate any corruption + if ( + isinstance(self.env.pageserver_remote_storage, S3Storage) + and self.enable_scrub_on_exit + ): try: self.env.storage_scrubber.scan_metadata() except Exception as e: @@ -1162,12 +1142,6 @@ class NeonEnv: } if self.pageserver_virtual_file_io_engine is not None: ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine - if config.pageserver_get_vectored_impl is not None: - ps_cfg["get_vectored_impl"] = config.pageserver_get_vectored_impl - if config.pageserver_get_impl is not None: - ps_cfg["get_impl"] = config.pageserver_get_impl - if config.pageserver_validate_vectored_get is not None: - ps_cfg["validate_vectored_get"] = config.pageserver_validate_vectored_get if config.pageserver_default_tenant_config_compaction_algorithm is not None: tenant_config = ps_cfg.setdefault("tenant_config", {}) tenant_config[ @@ -1420,7 +1394,7 @@ def _shared_simple_env( pg_distrib_dir=pg_distrib_dir, pg_version=pg_version, run_id=run_id, - preserve_database_files=pytestconfig.getoption("--preserve-database-files"), + preserve_database_files=cast(bool, pytestconfig.getoption("--preserve-database-files")), test_name=request.node.name, test_output_dir=test_output_dir, pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, @@ -1467,6 +1441,7 @@ def neon_env_builder( pageserver_virtual_file_io_engine: str, pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]], pageserver_aux_file_policy: Optional[AuxFileStore], + record_property: Callable[[str, object], None], ) -> 
Iterator[NeonEnvBuilder]: """ Fixture to create a Neon environment for test. @@ -1495,7 +1470,7 @@ def neon_env_builder( pg_version=pg_version, broker=default_broker, run_id=run_id, - preserve_database_files=pytestconfig.getoption("--preserve-database-files"), + preserve_database_files=cast(bool, pytestconfig.getoption("--preserve-database-files")), pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, test_name=request.node.name, test_output_dir=test_output_dir, @@ -1504,6 +1479,9 @@ def neon_env_builder( pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, ) as builder: yield builder + # Propagate `preserve_database_files` to make it possible to use in other fixtures, + # like `test_output_dir` fixture for attaching all database files to Allure report. + record_property("preserve_database_files", builder.preserve_database_files) @dataclass @@ -2148,6 +2126,23 @@ class StorageControllerApiException(Exception): self.status_code = status_code +# See libs/pageserver_api/src/controller_api.rs +# for the rust definitions of the enums below +# TODO: Replace with `StrEnum` when we upgrade to python 3.11 +class PageserverAvailability(str, Enum): + ACTIVE = "Active" + UNAVAILABLE = "Unavailable" + OFFLINE = "Offline" + + +class PageserverSchedulingPolicy(str, Enum): + ACTIVE = "Active" + DRAINING = "Draining" + FILLING = "Filling" + PAUSE = "Pause" + PAUSE_FOR_RESTART = "PauseForRestart" + + class NeonStorageController(MetricsGetter, LogUtils): def __init__(self, env: NeonEnv, auth_enabled: bool): self.env = env @@ -2531,26 +2526,54 @@ class NeonStorageController(MetricsGetter, LogUtils): ) log.info("storage controller passed consistency check") + def node_registered(self, node_id: int) -> bool: + """ + Returns true if the storage controller can confirm + it knows of pageserver with 'node_id' + """ + try: + self.node_status(node_id) + except StorageControllerApiException as e: + if e.status_code == 404: 
+ return False + else: + raise e + + return True + def poll_node_status( - self, node_id: int, desired_scheduling_policy: str, max_attempts: int, backoff: int + self, + node_id: int, + desired_availability: Optional[PageserverAvailability], + desired_scheduling_policy: Optional[PageserverSchedulingPolicy], + max_attempts: int, + backoff: int, ): """ - Poll the node status until it reaches 'desired_scheduling_policy' or 'max_attempts' have been exhausted + Poll the node status until it reaches 'desired_scheduling_policy' and 'desired_availability' + or 'max_attempts' have been exhausted """ - log.info(f"Polling {node_id} for {desired_scheduling_policy} scheduling policy") + log.info( + f"Polling {node_id} for {desired_scheduling_policy} scheduling policy and {desired_availability} availability" + ) while max_attempts > 0: try: status = self.node_status(node_id) policy = status["scheduling"] - if policy == desired_scheduling_policy: + availability = status["availability"] + if (desired_scheduling_policy is None or policy == desired_scheduling_policy) and ( + desired_availability is None or availability == desired_availability + ): return else: max_attempts -= 1 - log.info(f"Status call returned {policy=} ({max_attempts} attempts left)") + log.info( + f"Status call returned {policy=} {availability=} ({max_attempts} attempts left)" + ) if max_attempts == 0: raise AssertionError( - f"Status for {node_id=} did not reach {desired_scheduling_policy=}" + f"Status for {node_id=} did not reach {desired_scheduling_policy=} {desired_availability=}" ) time.sleep(backoff) @@ -2563,6 +2586,17 @@ class NeonStorageController(MetricsGetter, LogUtils): time.sleep(backoff) + def step_down(self): + log.info("Asking storage controller to step down") + response = self.request( + "PUT", + f"{self.env.storage_controller_api}/control/v1/step_down", + headers=self.headers(TokenScope.ADMIN), + ) + + response.raise_for_status() + return response.json() + def configure_failpoints(self, 
config_strings: Tuple[str, str] | List[Tuple[str, str]]): if isinstance(config_strings, tuple): pairs = [config_strings] @@ -2694,6 +2728,14 @@ class NeonPageserver(PgProtocol, LogUtils): self.id, extra_env_vars=extra_env_vars, timeout_in_seconds=timeout_in_seconds ) self.running = True + + if self.env.storage_controller.running and self.env.storage_controller.node_registered( + self.id + ): + self.env.storage_controller.poll_node_status( + self.id, PageserverAvailability.ACTIVE, None, max_attempts=20, backoff=1 + ) + return self def stop(self, immediate: bool = False) -> "NeonPageserver": @@ -4440,7 +4482,16 @@ def test_output_dir( yield test_dir - allure_attach_from_dir(test_dir) + preserve_database_files = False + for k, v in request.node.user_properties: + # NB: the neon_env_builder fixture uses this fixture (test_output_dir). + # So, neon_env_builder's cleanup runs before here. + # The cleanup propagates NeonEnvBuilder.preserve_database_files into this user property. + if k == "preserve_database_files": + assert isinstance(v, bool) + preserve_database_files = v + + allure_attach_from_dir(test_dir, preserve_database_files) class FileAndThreadLock: diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index c5b09e3608..dff002bd4b 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -102,6 +102,7 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [ # failing to connect to them. 
".*Call to node.*management API.*failed.*receive body.*", ".*Call to node.*management API.*failed.*ReceiveBody.*", + ".*Failed to update node .+ after heartbeat round.*error sending request for url.*", # Many tests will start up with a node offline ".*startup_reconcile: Could not scan node.*", # Tests run in dev mode diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index c7cea4ec04..c6df6b5baf 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -662,6 +662,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): force_repartition=False, force_image_layer_creation=False, wait_until_uploaded=False, + compact: Optional[bool] = None, ): self.is_testing_enabled_or_skip() query = {} @@ -672,6 +673,9 @@ class PageserverHttpClient(requests.Session, MetricsGetter): if wait_until_uploaded: query["wait_until_uploaded"] = "true" + if compact is not None: + query["compact"] = "true" if compact else "false" + log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}") res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint", diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py index c437258c6f..3e0ffabf74 100644 --- a/test_runner/fixtures/pageserver/many_tenants.py +++ b/test_runner/fixtures/pageserver/many_tenants.py @@ -1,5 +1,4 @@ import concurrent.futures -import time from typing import Any, Callable, Dict, Tuple import fixtures.pageserver.remote_storage @@ -9,9 +8,6 @@ from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, ) -from fixtures.pageserver.utils import ( - wait_until_tenant_state, -) from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind @@ -46,38 +42,33 @@ def single_timeline( log.info(f"duplicating template tenant {ncopies} times in S3") tenants = fixtures.pageserver.remote_storage.duplicate_tenant(env, 
template_tenant, ncopies) + # In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done. + # However, on-demand downloads are quite slow ATM. + # => do the on-demand downloads in Python. + log.info("python-side on-demand download the layer files into local tenant dir") + tenant_timelines = list(map(lambda tenant: (tenant, template_timeline), tenants)) + fixtures.pageserver.remote_storage.copy_all_remote_layer_files_to_local_tenant_dir( + env, tenant_timelines + ) + log.info("attach duplicated tenants to pageserver") # In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done. # However, on-demand downloads are quite slow ATM. # => do the on-demand downloads in Python. assert ps_http.tenant_list() == [] - # make the attach fail after it created enough on-disk state to retry loading - # the tenant next startup, but before it can start background loops that would start download - ps_http.configure_failpoints(("attach-before-activate", "return")) - env.pageserver.allowed_errors.append( - ".*attach failed, setting tenant state to Broken: attach-before-activate.*" - ) - def attach_broken(tenant): + def attach(tenant): env.pageserver.tenant_attach( tenant, config=template_config.copy(), generation=100, override_storage_controller_generation=True, ) - time.sleep(0.1) - wait_until_tenant_state(ps_http, tenant, "Broken", 10) with concurrent.futures.ThreadPoolExecutor(max_workers=22) as executor: - executor.map(attach_broken, tenants) + executor.map(attach, tenants) - env.pageserver.stop( - immediate=True - ) # clears the failpoint as a side-effect; immediate to avoid hitting neon_local's timeout - tenant_timelines = list(map(lambda tenant: (tenant, template_timeline), tenants)) - log.info("python-side on-demand download the layer files into local tenant dir") - fixtures.pageserver.remote_storage.copy_all_remote_layer_files_to_local_tenant_dir( - env, tenant_timelines - ) + # Benchmarks will 
start the pageserver explicitly themselves + env.pageserver.stop() return env diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index 6f6526d3fc..0f2a997b1e 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -12,8 +12,9 @@ import boto3 import toml from mypy_boto3_s3 import S3Client -from fixtures.common_types import TenantId, TimelineId +from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.log_helper import log +from fixtures.pageserver.common_types import IndexPartDump TIMELINE_INDEX_PART_FILE_NAME = "index_part.json" TENANT_HEATMAP_FILE_NAME = "heatmap-v1.json" @@ -265,9 +266,38 @@ class S3Storage: def tenants_path(self) -> str: return f"{self.prefix_in_bucket}/tenants" - def tenant_path(self, tenant_id: TenantId) -> str: + def tenant_path(self, tenant_id: Union[TenantShardId, TenantId]) -> str: return f"{self.tenants_path()}/{tenant_id}" + def timeline_path( + self, tenant_id: Union[TenantShardId, TenantId], timeline_id: TimelineId + ) -> str: + return f"{self.tenant_path(tenant_id)}/timelines/{timeline_id}" + + def get_latest_index_key(self, index_keys: List[str]) -> str: + """ + Gets the latest index file key. + + @param index_keys: A list of index keys of different generations. + """ + + def parse_gen(index_key: str) -> int: + parts = index_key.split("index_part.json-") + return int(parts[-1], base=16) if len(parts) == 2 else -1 + + return max(index_keys, key=parse_gen) + + def download_index_part(self, index_key: str) -> IndexPartDump: + """ + Downloads the index content from remote storage. + + @param index_key: index key in remote storage. 
+ """ + response = self.client.get_object(Bucket=self.bucket_name, Key=index_key) + body = response["Body"].read().decode("utf-8") + log.info(f"index_part.json: {body}") + return IndexPartDump.from_json(json.loads(body)) + def heatmap_key(self, tenant_id: TenantId) -> str: return f"{self.tenant_path(tenant_id)}/{TENANT_HEATMAP_FILE_NAME}" diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 0989dc1893..7f54eb0b0a 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -240,9 +240,18 @@ ATTACHMENT_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg] ) -def allure_attach_from_dir(dir: Path): +def allure_attach_from_dir(dir: Path, preserve_database_files: bool = False): """Attach all non-empty files from `dir` that matches `ATTACHMENT_NAME_REGEX` to Allure report""" + if preserve_database_files: + zst_file = dir.with_suffix(".tar.zst") + with zst_file.open("wb") as zst: + cctx = zstandard.ZstdCompressor() + with cctx.stream_writer(zst) as compressor: + with tarfile.open(fileobj=compressor, mode="w") as tar: + tar.add(dir, arcname="") + allure.attach.file(zst_file, "everything.tar.zst", "application/zstd", "tar.zst") + for attachment in Path(dir).glob("**/*"): if ATTACHMENT_NAME_REGEX.fullmatch(attachment.name) and attachment.stat().st_size > 0: name = str(attachment.relative_to(dir)) diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py index b41ae60197..3258d4dcfa 100644 --- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py +++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py @@ -17,13 +17,11 @@ from performance.pageserver.util import ( @pytest.mark.parametrize("duration", [30]) @pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)]) @pytest.mark.parametrize("n_tenants", [10]) -@pytest.mark.parametrize("get_vectored_impl", 
["sequential", "vectored"]) @pytest.mark.timeout(1000) def test_basebackup_with_high_slru_count( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, pg_bin: PgBin, - get_vectored_impl: str, n_tenants: int, pgbench_scale: int, duration: int, @@ -47,7 +45,7 @@ def test_basebackup_with_high_slru_count( max_file_descriptors = 500000 neon_env_builder.pageserver_config_override = ( f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}; " - f"get_vectored_impl='{get_vectored_impl}'; validate_vectored_get=false" + f"get_vectored_impl='vectored'; validate_vectored_get=false" ) params.update( { diff --git a/test_runner/performance/test_compaction.py b/test_runner/performance/test_compaction.py index 077b76104c..3c6f0b0131 100644 --- a/test_runner/performance/test_compaction.py +++ b/test_runner/performance/test_compaction.py @@ -135,7 +135,7 @@ def test_compaction_l0_memory(neon_compare: NeonCompare): # To be fixed in https://github.com/neondatabase/neon/issues/8184, after which # this memory estimate can be revised far downwards to something that doesn't scale # linearly with the layer sizes. 
- MEMORY_ESTIMATE = (initial_l0s_size - final_l0s_size) * 1.25 + MEMORY_ESTIMATE = (initial_l0s_size - final_l0s_size) * 1.5 # If we find that compaction is using more memory, this may indicate a regression assert compaction_mapped_rss < MEMORY_ESTIMATE diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index 3a6113706f..281c9271e9 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -8,7 +8,12 @@ import pytest from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + PageserverAvailability, + PageserverSchedulingPolicy, +) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pg_version import PgVersion @@ -106,7 +111,8 @@ def test_storage_controller_many_tenants( # Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts. # TODO: tune this down as restarts get faster (https://github.com/neondatabase/neon/pull/7553), to # guard against regressions in restart time. 
- "max_unavailable": "300s" + "max_offline": "30s", + "max_warming_up": "300s", } neon_env_builder.control_plane_compute_hook_api = ( compute_reconfigure_listener.control_plane_compute_hook_api @@ -274,7 +280,11 @@ def test_storage_controller_many_tenants( ) env.storage_controller.poll_node_status( - ps.id, "PauseForRestart", max_attempts=24, backoff=5 + ps.id, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.PAUSE_FOR_RESTART, + max_attempts=24, + backoff=5, ) shard_counts = get_consistent_node_shard_counts(env, total_shards) @@ -285,12 +295,24 @@ def test_storage_controller_many_tenants( assert sum(shard_counts.values()) == total_shards ps.restart() - env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=24, backoff=1) + env.storage_controller.poll_node_status( + ps.id, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.ACTIVE, + max_attempts=24, + backoff=1, + ) env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2 ) - env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=24, backoff=5) + env.storage_controller.poll_node_status( + ps.id, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.ACTIVE, + max_attempts=24, + backoff=5, + ) shard_counts = get_consistent_node_shard_counts(env, total_shards) log.info(f"Shard counts after filling node {ps.id}: {shard_counts}") diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock index 32c1c52eea..354fc15745 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock @@ -421,9 +421,9 @@ checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "openssl" -version = "0.10.64" +version = "0.10.66" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"95a0481286a310808298130d22dd1fef0fa571e05a8f44ec801801e84b216b1f" +checksum = "9529f4786b70a3e8c61e11179af17ab6188ad8d0ded78c5529441ed39d4bd9c1" dependencies = [ "bitflags 2.6.0", "cfg-if", @@ -453,9 +453,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.102" +version = "0.9.103" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c597637d56fbc83893a35eb0dd04b2b8e7a50c91e64e9493e398b5df4fb45fa2" +checksum = "7f9e8deee91df40a943c71b917e5874b951d32a802526c85721ce3b776c929d6" dependencies = [ "cc", "libc", diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 61afd820ca..5ec9a22ba1 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -17,22 +17,17 @@ from fixtures.pg_version import PgVersion # Test restarting page server, while safekeeper and compute node keep # running. def test_local_corruption(neon_env_builder: NeonEnvBuilder): - if neon_env_builder.pageserver_get_impl == "vectored": - reconstruct_function_name = "get_values_reconstruct_data" - else: - reconstruct_function_name = "get_value_reconstruct_data" - env = neon_env_builder.init_start() env.pageserver.allowed_errors.extend( [ - f".*{reconstruct_function_name} for layer .*", + ".*get_values_reconstruct_data for layer .*", ".*could not find data for key.*", ".*is not active. Current state: Broken.*", ".*will not become active. 
Current state: Broken.*", ".*failed to load metadata.*", ".*load failed.*load local timeline.*", - ".*layer loading failed permanently: load layer: .*", + ".*: layer load failed, assuming permanent failure:.*", ] ) @@ -79,7 +74,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): # (We don't check layer file contents on startup, when loading the timeline) # # This will change when we implement checksums for layers - with pytest.raises(Exception, match=f"{reconstruct_function_name} for layer ") as err: + with pytest.raises(Exception, match="get_values_reconstruct_data for layer ") as err: pg1.start() log.info( f"As expected, compute startup failed for timeline {tenant1}/{timeline1} with corrupt layers: {err}" diff --git a/test_runner/regress/test_change_pageserver.py b/test_runner/regress/test_change_pageserver.py index 4d2cdb8e32..34791e5988 100644 --- a/test_runner/regress/test_change_pageserver.py +++ b/test_runner/regress/test_change_pageserver.py @@ -3,9 +3,16 @@ import asyncio from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.remote_storage import RemoteStorageKind +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response -def test_change_pageserver(neon_env_builder: NeonEnvBuilder): +def test_change_pageserver(neon_env_builder: NeonEnvBuilder, make_httpserver): + """ + A relatively low level test of reconfiguring a compute's pageserver at runtime. Usually this + is all done via the storage controller, but this test will disable the storage controller's compute + notifications, and instead update endpoints directly. 
+ """ num_connections = 3 neon_env_builder.num_pageservers = 2 @@ -14,9 +21,24 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): ) env = neon_env_builder.init_start() + neon_env_builder.control_plane_compute_hook_api = ( + f"http://{make_httpserver.host}:{make_httpserver.port}/notify-attach" + ) + + def ignore_notify(request: Request): + # This test does direct updates to compute configuration: disable the storage controller's notification + log.info(f"Ignoring storage controller compute notification: {request.json}") + return Response(status=200) + + make_httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler( + ignore_notify + ) + env.neon_cli.create_branch("test_change_pageserver") endpoint = env.endpoints.create_start("test_change_pageserver") + # Put this tenant into a dual-attached state + assert env.get_tenant_pageserver(env.initial_tenant) == env.pageservers[0] alt_pageserver_id = env.pageservers[1].id env.pageservers[1].tenant_attach(env.initial_tenant) @@ -72,6 +94,7 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): env.pageservers[ 0 ].stop() # Stop the old pageserver just to make sure we're reading from the new one + env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"}) execute("SELECT count(*) FROM foo") assert fetchone() == (100000,) @@ -82,9 +105,10 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): # # Since we're dual-attached, need to tip-off storage controller to treat the one we're # about to start as the attached pageserver - env.storage_controller.attach_hook_issue(env.initial_tenant, env.pageservers[0].id) env.pageservers[0].start() env.pageservers[1].stop() + env.storage_controller.node_configure(env.pageservers[1].id, {"availability": "Offline"}) + env.storage_controller.reconcile_until_idle() endpoint.reconfigure(pageserver_id=env.pageservers[0].id) @@ -92,10 +116,9 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): assert 
fetchone() == (100000,) env.pageservers[0].stop() - # Since we're dual-attached, need to tip-off storage controller to treat the one we're - # about to start as the attached pageserver - env.storage_controller.attach_hook_issue(env.initial_tenant, env.pageservers[1].id) env.pageservers[1].start() + env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"}) + env.storage_controller.reconcile_until_idle() # Test a (former) bug where a child process spins without updating its connection string # by executing a query separately. This query will hang until we issue the reconfigure. diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 65649e0c0a..411b20b2c4 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -227,12 +227,6 @@ def test_forward_compatibility( ) try: - # Previous version neon_local and pageserver are not aware - # of the new config. - # TODO: remove these once the previous version of neon local supports them - neon_env_builder.pageserver_get_impl = None - neon_env_builder.pageserver_validate_vectored_get = None - neon_env_builder.num_safekeepers = 3 # Use previous version's production binaries (pageserver, safekeeper, pg_distrib_dir, etc.). 
diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 3c834f430b..85616c3fe2 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -21,6 +21,10 @@ from fixtures.utils import human_bytes, wait_until GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy" +# access times in the pageserver are stored at a very low resolution: to generate meaningfully different +# values, tests must inject sleeps +ATIME_RESOLUTION = 2 + @pytest.mark.parametrize("config_level_override", [None, 400]) def test_min_resident_size_override_handling( @@ -67,14 +71,11 @@ def test_min_resident_size_override_handling( @enum.unique class EvictionOrder(str, enum.Enum): - ABSOLUTE_ORDER = "absolute" RELATIVE_ORDER_EQUAL = "relative_equal" RELATIVE_ORDER_SPARE = "relative_spare" def config(self) -> Dict[str, Any]: - if self == EvictionOrder.ABSOLUTE_ORDER: - return {"type": "AbsoluteAccessed"} - elif self == EvictionOrder.RELATIVE_ORDER_EQUAL: + if self == EvictionOrder.RELATIVE_ORDER_EQUAL: return { "type": "RelativeAccessed", "args": {"highest_layer_count_loses_first": False}, @@ -384,7 +385,7 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv): @pytest.mark.parametrize( "order", - [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL], + [EvictionOrder.RELATIVE_ORDER_EQUAL], ) def test_pageserver_evicts_until_pressure_is_relieved( eviction_env: EvictionEnv, order: EvictionOrder @@ -418,7 +419,7 @@ def test_pageserver_evicts_until_pressure_is_relieved( @pytest.mark.parametrize( "order", - [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL], + [EvictionOrder.RELATIVE_ORDER_EQUAL], ) def test_pageserver_respects_overridden_resident_size( eviction_env: EvictionEnv, order: EvictionOrder @@ -495,7 +496,7 @@ def test_pageserver_respects_overridden_resident_size( 
@pytest.mark.parametrize( "order", - [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL], + [EvictionOrder.RELATIVE_ORDER_EQUAL], ) def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: EvictionOrder): """ @@ -526,7 +527,6 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: E @pytest.mark.parametrize( "order", [ - EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL, EvictionOrder.RELATIVE_ORDER_SPARE, ], @@ -550,6 +550,7 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder): (tenant_id, timeline_id) = warm # make picked tenant more recently used than the other one + time.sleep(ATIME_RESOLUTION) env.warm_up_tenant(tenant_id) # Build up enough pressure to require evictions from both tenants, @@ -572,63 +573,38 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder): later_tenant_usage < du_by_timeline[tenant] ), "all tenants should have lost some layers" - warm_size = later_du_by_timeline[warm] - cold_size = later_du_by_timeline[cold] + # with relative order what matters is the amount of layers, with a + # fudge factor of whether the eviction bothers tenants with highest + # layer count the most. last accessed times between tenants does not + # matter. + assert order in [EvictionOrder.RELATIVE_ORDER_EQUAL, EvictionOrder.RELATIVE_ORDER_SPARE] + layers_now = env.count_layers_per_tenant(env.pageserver) - if order == EvictionOrder.ABSOLUTE_ORDER: - # bounds for warmed_size - warm_lower = 0.5 * du_by_timeline[warm] + expected_ratio = later_total_on_disk / total_on_disk + log.info( + f"freed up {100 * expected_ratio}%, expecting the layer counts to decrease in similar ratio" + ) - # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room. - # So, check for up to 3 here. 
- warm_upper = warm_lower + 3 * env.layer_size + for tenant_id, original_count in tenant_layers.items(): + count_now = layers_now[tenant_id] + ratio = count_now / original_count + abs_diff = abs(ratio - expected_ratio) + assert original_count > count_now - cold_upper = 2 * env.layer_size - log.info(f"tenants: warm={warm[0]}, cold={cold[0]}") + expectation = 0.065 log.info( - f"expecting for warm tenant: {human_bytes(warm_lower)} < {human_bytes(warm_size)} < {human_bytes(warm_upper)}" + f"tenant {tenant_id} layer count {original_count} -> {count_now}, ratio: {ratio}, expecting {abs_diff} < {expectation}" ) - log.info(f"expecting for cold tenant: {human_bytes(cold_size)} < {human_bytes(cold_upper)}") - - assert warm_size > warm_lower, "warmed up tenant should be at about half size (lower)" - assert warm_size < warm_upper, "warmed up tenant should be at about half size (upper)" - - assert ( - cold_size < cold_upper - ), "the cold tenant should be evicted to its min_resident_size, i.e., max layer file size" - else: - # with relative order what matters is the amount of layers, with a - # fudge factor of whether the eviction bothers tenants with highest - # layer count the most. last accessed times between tenants does not - # matter. 
- layers_now = env.count_layers_per_tenant(env.pageserver) - - expected_ratio = later_total_on_disk / total_on_disk - log.info( - f"freed up {100 * expected_ratio}%, expecting the layer counts to decrease in similar ratio" - ) - - for tenant_id, original_count in tenant_layers.items(): - count_now = layers_now[tenant_id] - ratio = count_now / original_count - abs_diff = abs(ratio - expected_ratio) - assert original_count > count_now - - expectation = 0.06 - log.info( - f"tenant {tenant_id} layer count {original_count} -> {count_now}, ratio: {ratio}, expecting {abs_diff} < {expectation}" - ) - # in this test case both relative_spare and relative_equal produce - # the same outcomes; this must be a quantization effect of similar - # sizes (-s4 and -s6) and small (5MB) layer size. - # for pg15 and pg16 the absdiff is < 0.01, for pg14 it is closer to 0.02 - assert abs_diff < expectation + # in this test case both relative_spare and relative_equal produce + # the same outcomes; this must be a quantization effect of similar + # sizes (-s4 and -s6) and small (5MB) layer size. + # for pg15 and pg16 the absdiff is < 0.01, for pg14 it is closer to 0.02 + assert abs_diff < expectation @pytest.mark.parametrize( "order", [ - EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL, EvictionOrder.RELATIVE_ORDER_SPARE, ], @@ -651,6 +627,10 @@ def test_fast_growing_tenant(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, or for scale in [1, 1, 1, 4]: timelines.append((pgbench_init_tenant(layer_size, scale, env, pg_bin), scale)) + # Eviction times are stored at a low resolution. We must ensure that the time between + # tenants is long enough for the pageserver to distinguish them. 
+ time.sleep(ATIME_RESOLUTION) + env.neon_cli.safekeeper_stop() for (tenant_id, timeline_id), scale in timelines: @@ -680,14 +660,7 @@ def test_fast_growing_tenant(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, or ), "rest of the assertions expect 3 + 1 timelines, ratios, scales, all in order" log.info(f"{ratios}") - if order == EvictionOrder.ABSOLUTE_ORDER: - # first tenant loses most - assert ratios[0] <= ratios[1], "first should lose the most" - assert ratios[1] < ratios[2], "second should lose some" - assert ratios[1] < 1.0 - assert ratios[2] <= ratios[3], "third might not lose" - assert ratios[3] == 1.0, "tenant created last does not lose" - elif order == EvictionOrder.RELATIVE_ORDER_EQUAL: + if order == EvictionOrder.RELATIVE_ORDER_EQUAL: assert all([x for x in ratios if x < 1.0]), "all tenants lose layers" elif order == EvictionOrder.RELATIVE_ORDER_SPARE: # with different layer sizes and pg versions, there are different combinations @@ -750,7 +723,7 @@ def test_statvfs_error_handling(eviction_env: EvictionEnv): "type": "Failure", "mocked_error": "EIO", }, - eviction_order=EvictionOrder.ABSOLUTE_ORDER, + eviction_order=EvictionOrder.RELATIVE_ORDER_SPARE, ) env.neon_env.pageserver.assert_log_contains(".*statvfs failed.*EIO") @@ -784,7 +757,7 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv): # This avoids accounting for metadata files & tenant conf in the tests. "name_filter": ".*__.*", }, - eviction_order=EvictionOrder.ABSOLUTE_ORDER, + eviction_order=EvictionOrder.RELATIVE_ORDER_SPARE, ) wait_until( @@ -837,7 +810,7 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): # This avoids accounting for metadata files & tenant conf in the tests. 
"name_filter": ".*__.*", }, - eviction_order=EvictionOrder.ABSOLUTE_ORDER, + eviction_order=EvictionOrder.RELATIVE_ORDER_SPARE, ) wait_until( diff --git a/test_runner/regress/test_oid_overflow.py b/test_runner/regress/test_oid_overflow.py new file mode 100644 index 0000000000..a94ae99ed9 --- /dev/null +++ b/test_runner/regress/test_oid_overflow.py @@ -0,0 +1,45 @@ +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder + + +def test_oid_overflow(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + + endpoint = env.endpoints.create_start("main") + + conn = endpoint.connect() + cur = conn.cursor() + + cur.execute("CREATE EXTENSION neon_test_utils") + + cur.execute("CREATE TABLE t1(x integer)") + cur.execute("INSERT INTO t1 values (1)") + cur.execute("CREATE TABLE t2(x integer)") + cur.execute("INSERT INTO t2 values (2)") + + cur.execute("SELECT x from t1") + assert cur.fetchone() == (1,) + cur.execute("SELECT x from t2") + assert cur.fetchone() == (2,) + + cur.execute("VACUUM FULL t1") + cur.execute("VACUUM FULL t1") + cur.execute("vacuum pg_class") + cur.execute("SELECT relfilenode FROM pg_class where relname='t1'") + oid = cur.fetchall()[0][0] + log.info(f"t1.relfilenode={oid}") + + cur.execute("set statement_timeout=0") + cur.execute(f"select test_consume_oids({oid-1})") + cur.execute("VACUUM FULL t2") + + cur.execute("SELECT relfilenode FROM pg_class where relname='t2'") + oid = cur.fetchall()[0][0] + log.info(f"t2.relfilenode={oid}") + + cur.execute("SELECT clear_buffer_cache()") + + cur.execute("SELECT x from t1") + assert cur.fetchone() == (1,) + cur.execute("SELECT x from t2") + assert cur.fetchone() == (2,) diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 041942cda3..8941ddd281 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -596,19 +596,26 @@ def 
test_multi_attach( for ps in pageservers: ps.stop() - # Returning to a normal healthy state: all pageservers will start, but only the one most - # recently attached via the control plane will re-attach on startup + # Returning to a normal healthy state: all pageservers will start for ps in pageservers: ps.start() - with pytest.raises(PageserverApiException): - _detail = http_clients[0].timeline_detail(tenant_id, timeline_id) - with pytest.raises(PageserverApiException): - _detail = http_clients[1].timeline_detail(tenant_id, timeline_id) - _detail = http_clients[2].timeline_detail(tenant_id, timeline_id) + # Pageservers are marked offline by the storage controller during the rolling restart + # above. This may trigger a rescheduling, so there's no guarantee that the tenant + # shard ends up attached to the most recent ps. + raised = 0 + serving_ps_idx = None + for idx, http_client in enumerate(http_clients): + try: + _detail = http_client.timeline_detail(tenant_id, timeline_id) + serving_ps_idx = idx + except PageserverApiException: + raised += 1 + + assert raised == 2 and serving_ps_idx is not None # All data we wrote while multi-attached remains readable - workload.validate(pageservers[2].id) + workload.validate(pageservers[serving_ps_idx].id) def test_upgrade_generationless_local_file_paths( diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index 4ce53df214..68a45f957c 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -13,7 +13,10 @@ from fixtures.utils import wait_until # running. def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.enable_scrub_on_exit() + + # We inject a delay of 15 seconds for tenant activation below. + # Hence, bump the max delay here to not skip over the activation.
+ neon_env_builder.pageserver_config_override = 'background_task_maximum_delay="20s"' env = neon_env_builder.init_start() @@ -70,7 +73,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): # pageserver does if a compute node connects and sends a request for the tenant # while it's still in Loading state. (It waits for the loading to finish, and then # processes the request.) - tenant_load_delay_ms = 5000 + tenant_load_delay_ms = 15000 env.pageserver.stop() env.pageserver.start( extra_env_vars={"FAILPOINTS": f"before-attaching-tenant=return({tenant_load_delay_ms})"} @@ -157,7 +160,6 @@ def test_pageserver_chaos( pytest.skip("times out in debug builds") neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.enable_scrub_on_exit() if shard_count is not None: neon_env_builder.num_pageservers = shard_count diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 58d61eab0d..53f69b5b26 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -122,7 +122,12 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver, "scheduling": "Stop", }, ) - env.storage_controller.allowed_errors.append(".*Scheduling is disabled by policy Stop.*") + env.storage_controller.allowed_errors.extend( + [ + ".*Scheduling is disabled by policy Stop.*", + ".*Skipping reconcile for policy Stop.*", + ] + ) # We use a fixed seed to make the test reproducible: we want a randomly # chosen order, but not to change the order every time we run the test. 
@@ -385,6 +390,9 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): # (reproduce https://github.com/neondatabase/neon/issues/6802) pageserver_b.http_client().tenant_delete(tenant_id) + # We deleted our only tenant, and the scrubber fails if it detects nothing + neon_env_builder.disable_scrub_on_exit() + def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): """ @@ -584,6 +592,9 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ) workload.stop() + # We deleted our only tenant, and the scrubber fails if it detects nothing + neon_env_builder.disable_scrub_on_exit() + def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): """ diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index d5b5ac3f75..6f7ea0092a 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -138,7 +138,6 @@ def test_pg_regress( neon_env_builder.num_pageservers = shard_count neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.enable_scrub_on_exit() env = neon_env_builder.init_start( initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count, @@ -202,7 +201,6 @@ def test_isolation( if shard_count is not None: neon_env_builder.num_pageservers = shard_count neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.enable_scrub_on_exit() env = neon_env_builder.init_start( initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count ) @@ -265,7 +263,6 @@ def test_sql_regress( if shard_count is not None: neon_env_builder.num_pageservers = shard_count neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.enable_scrub_on_exit() env = neon_env_builder.init_start( initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count ) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 90c6e26d01..7f30b2d7a7 100644 --- 
a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -47,9 +47,6 @@ def test_sharding_smoke( # Use S3-compatible remote storage so that we can scrub: this test validates # that the scrubber doesn't barf when it sees a sharded tenant. neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.enable_scrub_on_exit() - - neon_env_builder.preserve_database_files = True env = neon_env_builder.init_start( initial_tenant_shard_count=shard_count, initial_tenant_shard_stripe_size=stripe_size @@ -128,7 +125,6 @@ def test_sharding_smoke( # Check the scrubber isn't confused by sharded content, then disable # it during teardown because we'll have deleted by then env.storage_scrubber.scan_metadata() - neon_env_builder.scrub_on_exit = False env.storage_controller.pageserver_api().tenant_delete(tenant_id) assert_prefix_empty( @@ -200,14 +196,15 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint: # disable background compaction and GC. We invoke it manually when we want it to happen. "gc_period": "0s", "compaction_period": "0s", - # create image layers eagerly, so that GC can remove some layers - "image_creation_threshold": 1, + # Disable automatic creation of image layers, as we will create them explicitly when we want them + "image_creation_threshold": 9999, "image_layer_creation_check_threshold": 0, } neon_env_builder.storage_controller_config = { # Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts. 
- "max_unavailable": "300s" + "max_offline": "30s", + "max_warming_up": "300s", } env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) @@ -226,7 +223,7 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint: # Do a full image layer generation before splitting, so that when we compact after splitting # we should only see sizes decrease (from post-split drops/rewrites), not increase (from image layer generation) - env.get_tenant_pageserver(tenant_id).http_client().timeline_compact( + env.get_tenant_pageserver(tenant_id).http_client().timeline_checkpoint( tenant_id, timeline_id, force_image_layer_creation=True, wait_until_uploaded=True ) @@ -372,9 +369,6 @@ def test_sharding_split_smoke( # Use S3-compatible remote storage so that we can scrub: this test validates # that the scrubber doesn't barf when it sees a sharded tenant. neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.enable_scrub_on_exit() - - neon_env_builder.preserve_database_files = True non_default_tenant_config = {"gc_horizon": 77 * 1024 * 1024} diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 741f16685e..da638ac233 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -12,6 +12,8 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, + PageserverAvailability, + PageserverSchedulingPolicy, PgBin, StorageControllerApiException, TokenScope, @@ -918,6 +920,8 @@ def test_storage_controller_tenant_deletion( class Failure: pageserver_id: int + offline_timeout: int + must_detect_after: int def apply(self, env: NeonEnv): raise NotImplementedError() @@ -930,9 +934,11 @@ class Failure: class NodeStop(Failure): - def __init__(self, pageserver_ids, immediate): + def __init__(self, pageserver_ids, immediate, offline_timeout, must_detect_after): self.pageserver_ids = 
pageserver_ids self.immediate = immediate + self.offline_timeout = offline_timeout + self.must_detect_after = must_detect_after def apply(self, env: NeonEnv): for ps_id in self.pageserver_ids: @@ -948,10 +954,42 @@ class NodeStop(Failure): return self.pageserver_ids +class NodeRestartWithSlowReattach(Failure): + def __init__(self, pageserver_id, offline_timeout, must_detect_after): + self.pageserver_id = pageserver_id + self.offline_timeout = offline_timeout + self.must_detect_after = must_detect_after + self.thread = None + + def apply(self, env: NeonEnv): + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.stop(immediate=False) + + def start_ps(): + pageserver.start( + extra_env_vars={"FAILPOINTS": "control-plane-client-re-attach=return(30000)"} + ) + + self.thread = threading.Thread(target=start_ps) + self.thread.start() + + def clear(self, env: NeonEnv): + if self.thread is not None: + self.thread.join() + + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.http_client().configure_failpoints(("control-plane-client-re-attach", "off")) + + def nodes(self): + return [self.pageserver_id] + + class PageserverFailpoint(Failure): - def __init__(self, failpoint, pageserver_id): + def __init__(self, failpoint, pageserver_id, offline_timeout, must_detect_after): self.failpoint = failpoint self.pageserver_id = pageserver_id + self.offline_timeout = offline_timeout + self.must_detect_after = must_detect_after def apply(self, env: NeonEnv): pageserver = env.get_pageserver(self.pageserver_id) @@ -987,15 +1025,28 @@ def build_node_to_tenants_map(env: NeonEnv) -> dict[int, list[TenantId]]: @pytest.mark.parametrize( "failure", [ - NodeStop(pageserver_ids=[1], immediate=False), - NodeStop(pageserver_ids=[1], immediate=True), - NodeStop(pageserver_ids=[1, 2], immediate=True), - PageserverFailpoint(pageserver_id=1, failpoint="get-utilization-http-handler"), + NodeStop(pageserver_ids=[1], immediate=False, offline_timeout=20, must_detect_after=5), + 
NodeStop(pageserver_ids=[1], immediate=True, offline_timeout=20, must_detect_after=5), + NodeStop(pageserver_ids=[1, 2], immediate=True, offline_timeout=20, must_detect_after=5), + PageserverFailpoint( + pageserver_id=1, + failpoint="get-utilization-http-handler", + offline_timeout=20, + must_detect_after=5, + ), + # Instrument a scenario where the node is slow to re-attach. The re-attach request itself + # should serve as a signal to the storage controller to use a more lenient heartbeat timeout. + NodeRestartWithSlowReattach(pageserver_id=1, offline_timeout=60, must_detect_after=15), ], ) def test_storage_controller_heartbeats( neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, failure: Failure ): + neon_env_builder.storage_controller_config = { + "max_offline": "10s", + "max_warming_up": "20s", + } + neon_env_builder.num_pageservers = 2 env = neon_env_builder.init_configs() env.start() @@ -1061,9 +1112,12 @@ def test_storage_controller_heartbeats( if node["id"] in offline_node_ids: assert node["availability"] == "Offline" - # A node is considered offline if the last successful heartbeat - # was more than 10 seconds ago (hardcoded in the storage controller). - wait_until(20, 1, nodes_offline) + start = time.time() + wait_until(failure.offline_timeout, 1, nodes_offline) + detected_after = time.time() - start + log.info(f"Detected node failures after {detected_after}s") + + assert detected_after >= failure.must_detect_after # .. 
expecting the tenant on the offline node to be migrated def tenant_migrated(): @@ -1546,7 +1600,13 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2 ) - env.storage_controller.poll_node_status(ps.id, "PauseForRestart", max_attempts=6, backoff=5) + env.storage_controller.poll_node_status( + ps.id, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.PAUSE_FOR_RESTART, + max_attempts=6, + backoff=5, + ) shard_counts = get_node_shard_counts(env, tenant_ids) log.info(f"Shard counts after draining node {ps.id}: {shard_counts}") @@ -1556,12 +1616,24 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): assert sum(shard_counts.values()) == total_shards ps.restart() - env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=10, backoff=1) + env.storage_controller.poll_node_status( + ps.id, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.ACTIVE, + max_attempts=10, + backoff=1, + ) env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2 ) - env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=6, backoff=5) + env.storage_controller.poll_node_status( + ps.id, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.ACTIVE, + max_attempts=6, + backoff=5, + ) shard_counts = get_node_shard_counts(env, tenant_ids) log.info(f"Shard counts after filling node {ps.id}: {shard_counts}") @@ -1606,11 +1678,23 @@ def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder): backoff=2, ) - env.storage_controller.poll_node_status(ps_id_to_drain, "Draining", max_attempts=6, backoff=2) + env.storage_controller.poll_node_status( + ps_id_to_drain, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.DRAINING, + max_attempts=6, + backoff=2, + ) 
env.storage_controller.cancel_node_drain(ps_id_to_drain) - env.storage_controller.poll_node_status(ps_id_to_drain, "Active", max_attempts=6, backoff=2) + env.storage_controller.poll_node_status( + ps_id_to_drain, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.ACTIVE, + max_attempts=6, + backoff=2, + ) @pytest.mark.parametrize("while_offline", [True, False]) @@ -1699,3 +1783,78 @@ def test_storage_controller_node_deletion( assert victim.id not in [n["id"] for n in env.storage_controller.node_list()] env.storage_controller.reconcile_all() # FIXME: workaround for optimizations happening on startup, see FIXME above. env.storage_controller.consistency_check() + + +def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder): + """ + Test the `/control/v1/step_down` storage controller API. Upon receiving such + a request, the storage controller cancels any on-going reconciles and replies + with 503 to all requests apart from `/control/v1/step_down`, `/status` and `/metrics`. + """ + env = neon_env_builder.init_configs() + env.start() + + tid = TenantId.generate() + tsid = str(TenantShardId(tid, shard_number=0, shard_count=0)) + env.storage_controller.tenant_create(tid) + + env.storage_controller.reconcile_until_idle() + env.storage_controller.configure_failpoints(("sleep-on-reconcile-epilogue", "return(10000)")) + + # Make a change to the tenant config to trigger a slow reconcile + virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) + virtual_ps_http.patch_tenant_config_client_side(tid, {"compaction_threshold": 5}, None) + env.storage_controller.allowed_errors.append( + ".*Accepted configuration update but reconciliation failed.*" + ) + + observed_state = env.storage_controller.step_down() + log.info(f"Storage controller stepped down with {observed_state=}") + + # Validate that we waited for the slow reconcile to complete + # and updated the observed state in the storcon before stepping down. 
+ node_id = str(env.pageserver.id) + assert tsid in observed_state + assert node_id in observed_state[tsid]["locations"] + assert "conf" in observed_state[tsid]["locations"][node_id] + assert "tenant_conf" in observed_state[tsid]["locations"][node_id]["conf"] + + tenant_conf = observed_state[tsid]["locations"][node_id]["conf"]["tenant_conf"] + assert "compaction_threshold" in tenant_conf + assert tenant_conf["compaction_threshold"] == 5 + + # Validate that we propagated the change to the pageserver + ps_tenant_conf = env.pageserver.http_client().tenant_config(tid) + assert "compaction_threshold" in ps_tenant_conf.effective_config + assert ps_tenant_conf.effective_config["compaction_threshold"] == 5 + + # Validate that the storcon is not replying to the usual requests + # once it has stepped down. + with pytest.raises(StorageControllerApiException, match="stepped_down"): + env.storage_controller.tenant_list() + + # Validate that we can step down multiple times and the observed state + # doesn't change. 
+ observed_state_again = env.storage_controller.step_down() + assert observed_state == observed_state_again + + assert ( + env.storage_controller.get_metric_value( + "storage_controller_leadership_status", filter={"status": "leader"} + ) + == 0 + ) + + assert ( + env.storage_controller.get_metric_value( + "storage_controller_leadership_status", filter={"status": "stepped_down"} + ) + == 1 + ) + + assert ( + env.storage_controller.get_metric_value( + "storage_controller_leadership_status", filter={"status": "candidate"} + ) + == 0 + ) diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 635690fc7f..a45430ca86 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -1,4 +1,5 @@ import os +import pprint import shutil import threading import time @@ -190,7 +191,9 @@ def test_scrubber_physical_gc_ancestors( "checkpoint_distance": f"{1024 * 1024}", "compaction_threshold": "1", "compaction_target_size": f"{1024 * 1024}", - "image_creation_threshold": "2", + # Disable automatic creation of image layers, as future image layers can result in layers in S3 that + # aren't referenced by children, earlier than the test expects such layers to exist + "image_creation_threshold": "9999", "image_layer_creation_check_threshold": "0", # Disable background compaction, we will do it explicitly "compaction_period": "0s", @@ -208,9 +211,17 @@ def test_scrubber_physical_gc_ancestors( new_shard_count = 4 assert shard_count is None or new_shard_count > shard_count shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count) + env.storage_controller.reconcile_until_idle() # Move shards to their final locations immediately - # Make sure child shards have some layers - workload.write_rows(100) + # Make sure child shards have some layers. 
Do not force upload, because the test helper calls checkpoint, which + # compacts, and we only want to do that explicitly later in the test. + workload.write_rows(100, upload=False) + for shard in shards: + ps = env.get_tenant_pageserver(shard) + log.info(f"Waiting for shard {shard} on pageserver {ps.id}") + ps.http_client().timeline_checkpoint( + shard, timeline_id, compact=False, wait_until_uploaded=True + ) # Flush deletion queue so that we don't leave any orphan layers in the parent that will confuse subsequent checks: once # a shard is split, any layers in its prefix that aren't referenced by a child will be considered GC'able, even @@ -232,7 +243,7 @@ def test_scrubber_physical_gc_ancestors( workload.churn_rows(100) for shard in shards: ps = env.get_tenant_pageserver(shard) - ps.http_client().timeline_compact(shard, timeline_id) + ps.http_client().timeline_compact(shard, timeline_id, force_image_layer_creation=True) ps.http_client().timeline_gc(shard, timeline_id, 0) # We will use a min_age_secs=1 threshold for deletion, let it pass @@ -373,3 +384,76 @@ def test_scrubber_physical_gc_ancestors_split(neon_env_builder: NeonEnvBuilder): assert gc_output["ancestor_layers_deleted"] > 0 assert gc_output["remote_storage_errors"] == 0 assert gc_output["controller_api_errors"] == 0 + + +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_scrubber_scan_pageserver_metadata( + neon_env_builder: NeonEnvBuilder, shard_count: Optional[int] +): + """ + Create some layers. Delete an object listed in index. Run scrubber and see if it detects the defect. + """ + + # Use s3_storage so we could test out scrubber. + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.num_pageservers = shard_count if shard_count is not None else 1 + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + + # Create some layers.
+ + workload = Workload(env, env.initial_tenant, env.initial_timeline) + workload.init() + + for _ in range(3): + workload.write_rows(128) + + for pageserver in env.pageservers: + pageserver.stop() + pageserver.start() + + for _ in range(3): + workload.write_rows(128) + + # Get the latest index for a particular timeline. + + tenant_shard_id = TenantShardId(env.initial_tenant, 0, shard_count if shard_count else 0) + + assert isinstance(env.pageserver_remote_storage, S3Storage) + timeline_path = env.pageserver_remote_storage.timeline_path( + tenant_shard_id, env.initial_timeline + ) + + client = env.pageserver_remote_storage.client + bucket = env.pageserver_remote_storage.bucket_name + objects = client.list_objects_v2(Bucket=bucket, Prefix=f"{timeline_path}/", Delimiter="").get( + "Contents", [] + ) + keys = [obj["Key"] for obj in objects] + index_keys = list(filter(lambda s: s.startswith(f"{timeline_path}/index_part"), keys)) + assert len(index_keys) > 0 + + latest_index_key = env.pageserver_remote_storage.get_latest_index_key(index_keys) + log.info(f"{latest_index_key=}") + + index = env.pageserver_remote_storage.download_index_part(latest_index_key) + + assert len(index.layer_metadata) > 0 + it = iter(index.layer_metadata.items()) + + scan_summary = env.storage_scrubber.scan_metadata() + assert not scan_summary["with_warnings"] + assert not scan_summary["with_errors"] + + # Delete a layer file that is listed in the index. + layer, metadata = next(it) + log.info(f"Deleting {timeline_path}/{layer.to_str()}") + delete_response = client.delete_object( + Bucket=bucket, + Key=f"{timeline_path}/{layer.to_str()}-{metadata.generation:08x}", + ) + log.info(f"delete response: {delete_response}") + + # Check scan summary. Expect it to be a L0 layer so only emit warnings. 
+ scan_summary = env.storage_scrubber.scan_metadata() + log.info(f"{pprint.pformat(scan_summary)}") + assert len(scan_summary["with_warnings"]) > 0 diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 6d20b3d0de..c343b349cf 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -315,6 +315,9 @@ def test_tenant_delete_races_timeline_creation( # Zero tenants remain (we deleted the default tenant) assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0 + # We deleted our only tenant, and the scrubber fails if it detects nothing + neon_env_builder.disable_scrub_on_exit() + def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder): """ diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py index 7bf49a0874..840c7159ad 100644 --- a/test_runner/regress/test_threshold_based_eviction.py +++ b/test_runner/regress/test_threshold_based_eviction.py @@ -48,13 +48,12 @@ def test_threshold_based_eviction( tenant_id, timeline_id = env.initial_tenant, env.initial_timeline ps_http = env.pageserver.http_client() - assert ps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == { - "kind": "NoEviction" - } + vps_http = env.storage_controller.pageserver_api() + assert vps_http.tenant_config(tenant_id).effective_config["eviction_policy"] is None - eviction_threshold = 5 - eviction_period = 1 - ps_http.set_tenant_config( + eviction_threshold = 10 + eviction_period = 2 + vps_http.set_tenant_config( tenant_id, { "eviction_policy": { @@ -64,7 +63,7 @@ def test_threshold_based_eviction( }, }, ) - assert ps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == { + assert vps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == { "kind": "LayerAccessThreshold", "threshold": f"{eviction_threshold}s", "period": 
f"{eviction_period}s", @@ -73,7 +72,7 @@ def test_threshold_based_eviction( # restart because changing tenant config is not instant env.pageserver.restart() - assert ps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == { + assert vps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == { "kind": "LayerAccessThreshold", "threshold": f"{eviction_threshold}s", "period": f"{eviction_period}s", @@ -81,7 +80,7 @@ def test_threshold_based_eviction( # create a bunch of L1s, only the least of which will need to be resident compaction_threshold = 3 # create L1 layers quickly - ps_http.patch_tenant_config_client_side( + vps_http.patch_tenant_config_client_side( tenant_id, inserts={ # Disable gc and compaction to avoid on-demand downloads from their side. @@ -154,7 +153,7 @@ def test_threshold_based_eviction( while time.time() - started_waiting_at < observation_window: current = ( time.time(), - MapInfoProjection(ps_http.layer_map_info(tenant_id, timeline_id)), + MapInfoProjection(vps_http.layer_map_info(tenant_id, timeline_id)), ) last = map_info_changes[-1] if map_info_changes else (0, None) if last[1] is None or current[1] != last[1]: diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index da37f469b3..6d96dda391 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -485,6 +485,9 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage), ) + # We deleted our only tenant, and the scrubber fails if it detects nothing + neon_env_builder.disable_scrub_on_exit() + @pytest.mark.parametrize( "stuck_failpoint", @@ -703,6 +706,9 @@ def test_timeline_delete_works_for_remote_smoke( # Assume it is mock server inconsistency and check twice. 
wait_until(2, 0.5, lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage)) + # We deleted our only tenant, and the scrubber fails if it detects nothing + neon_env_builder.disable_scrub_on_exit() + def test_delete_orphaned_objects( neon_env_builder: NeonEnvBuilder, diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index ad73770c44..dbd0e6428b 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit ad73770c446ea361f43e4f0404798b7e5e7a62d8 +Subproject commit dbd0e6428b9274d72a10ac29bd3e3162faf109d4 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 4874c8e52e..035b73a9c5 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 4874c8e52ed349a9f8290bbdcd91eb92677a5d24 +Subproject commit 035b73a9c5998f9a0ef35cc8df1bae680bf770fc diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index b810fdfcbb..b39f316137 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit b810fdfcbb59afea7ea7bbe0cf94eaccb55a2ea2 +Subproject commit b39f316137fdd29e2da15d2af2fdd1cfd18163be diff --git a/vendor/revisions.json b/vendor/revisions.json index da49ff19c3..eeebd646f5 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "v16": ["16.3", "b810fdfcbb59afea7ea7bbe0cf94eaccb55a2ea2"], - "v15": ["15.7", "4874c8e52ed349a9f8290bbdcd91eb92677a5d24"], - "v14": ["14.12", "ad73770c446ea361f43e4f0404798b7e5e7a62d8"] + "v16": ["16.3", "b39f316137fdd29e2da15d2af2fdd1cfd18163be"], + "v15": ["15.7", "035b73a9c5998f9a0ef35cc8df1bae680bf770fc"], + "v14": ["14.12", "dbd0e6428b9274d72a10ac29bd3e3162faf109d4"] }