diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000..6ba6b3c887 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +# allows for nicer hunk headers with git show +*.rs diff=rust diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index 16759ad038..d4029bd37c 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -9,8 +9,8 @@ inputs: description: 'Region ID, if not set the project will be created in the default region' default: aws-us-east-2 postgres_version: - description: 'Postgres version; default is 15' - default: '15' + description: 'Postgres version; default is 16' + default: '16' api_host: description: 'Neon API host' default: console-stage.neon.build diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml new file mode 100644 index 0000000000..35c6251304 --- /dev/null +++ b/.github/workflows/_build-and-test-locally.yml @@ -0,0 +1,291 @@ +name: Build and Test Locally + +on: + workflow_call: + inputs: + arch: + description: 'x64 or arm64' + required: true + type: string + build-tag: + description: 'build tag' + required: true + type: string + build-tools-image: + description: 'build-tools image' + required: true + type: string + build-type: + description: 'debug or release' + required: true + type: string + +defaults: + run: + shell: bash -euxo pipefail {0} + +env: + RUST_BACKTRACE: 1 + COPT: '-Werror' + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + +jobs: + build-neon: + runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }} + container: + image: ${{ inputs.build-tools-image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + # Raise locked memory limit for tokio-epoll-uring. + # On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12), + # io_uring will account the memory of the CQ and SQ as locked. + # More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391 + options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 + env: + BUILD_TYPE: ${{ inputs.build-type }} + GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }} + BUILD_TAG: ${{ inputs.build-tag }} + + steps: + - name: Fix git ownership + run: | + # Workaround for `fatal: detected dubious ownership in repository at ...` + # + # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers + # Ref https://github.com/actions/checkout/issues/785 + # + git config --global --add safe.directory ${{ github.workspace }} + git config --global --add safe.directory ${GITHUB_WORKSPACE} + for r in 14 15 16; do + git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" + git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" + done + + - uses: actions/checkout@v4 + with: + submodules: true + fetch-depth: 1 + + - name: Set pg 14 revision for caching + id: pg_v14_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT + + - name: Set pg 15 revision for caching + id: pg_v15_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT + + - name: Set pg 16 revision for caching + id: pg_v16_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT + + # Set some environment variables used by all the steps. + # + # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc. + # It also includes --features, if any + # + # CARGO_FEATURES is passed to "cargo metadata". It is separate from CARGO_FLAGS, + # because "cargo metadata" doesn't accept --release or --debug options + # + # We run tests with addtional features, that are turned off by default (e.g. in release builds), see + # corresponding Cargo.toml files for their descriptions. + - name: Set env variables + run: | + CARGO_FEATURES="--features testing" + if [[ $BUILD_TYPE == "debug" ]]; then + cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run" + CARGO_FLAGS="--locked" + elif [[ $BUILD_TYPE == "release" ]]; then + cov_prefix="" + CARGO_FLAGS="--locked --release" + fi + { + echo "cov_prefix=${cov_prefix}" + echo "CARGO_FEATURES=${CARGO_FEATURES}" + echo "CARGO_FLAGS=${CARGO_FLAGS}" + echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" + } >> $GITHUB_ENV + + - name: Cache postgres v14 build + id: cache_pg_14 + uses: actions/cache@v4 + with: + path: pg_install/v14 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + + - name: Cache postgres v15 build + id: cache_pg_15 + uses: actions/cache@v4 + with: + path: pg_install/v15 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + + - name: Cache postgres v16 build + id: cache_pg_16 + uses: actions/cache@v4 + with: + path: pg_install/v16 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + + - name: Build postgres v14 + if: steps.cache_pg_14.outputs.cache-hit != 'true' + run: mold -run make postgres-v14 -j$(nproc) + + - name: Build postgres v15 + if: steps.cache_pg_15.outputs.cache-hit != 'true' + run: mold -run make postgres-v15 -j$(nproc) + + - name: Build postgres v16 + if: steps.cache_pg_16.outputs.cache-hit != 'true' + run: mold -run make postgres-v16 -j$(nproc) + + - name: Build neon extensions + run: mold -run make neon-pg-ext -j$(nproc) + + - name: Build walproposer-lib + run: mold -run make walproposer-lib -j$(nproc) + + - name: Run cargo build + run: | + PQ_LIB_DIR=$(pwd)/pg_install/v16/lib + export PQ_LIB_DIR + ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests + + # Do install *before* running rust tests because they might recompile the + # binaries with different features/flags. + - name: Install rust binaries + run: | + # Install target binaries + mkdir -p /tmp/neon/bin/ + binaries=$( + ${cov_prefix} cargo metadata $CARGO_FEATURES --format-version=1 --no-deps | + jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name' + ) + for bin in $binaries; do + SRC=target/$BUILD_TYPE/$bin + DST=/tmp/neon/bin/$bin + cp "$SRC" "$DST" + done + + # Install test executables and write list of all binaries (for code coverage) + if [[ $BUILD_TYPE == "debug" ]]; then + # Keep bloated coverage data files away from the rest of the artifact + mkdir -p /tmp/coverage/ + + mkdir -p /tmp/neon/test_bin/ + + test_exe_paths=$( + ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES --message-format=json --no-run | + jq -r '.executable | select(. != null)' + ) + for bin in $test_exe_paths; do + SRC=$bin + DST=/tmp/neon/test_bin/$(basename $bin) + + # We don't need debug symbols for code coverage, so strip them out to make + # the artifact smaller. + strip "$SRC" -o "$DST" + echo "$DST" >> /tmp/coverage/binaries.list + done + + for bin in $binaries; do + echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list + done + fi + + - name: Run rust tests + env: + NEXTEST_RETRIES: 3 + run: | + PQ_LIB_DIR=$(pwd)/pg_install/v16/lib + export PQ_LIB_DIR + LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib + export LD_LIBRARY_PATH + + #nextest does not yet support running doctests + cargo test --doc $CARGO_FLAGS $CARGO_FEATURES + + for io_engine in std-fs tokio-epoll-uring ; do + NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES + done + + # Run separate tests for real S3 + export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty + export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests + export REMOTE_STORAGE_S3_REGION=eu-central-1 + ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_s3)' + + # Run separate tests for real Azure Blob Storage + # XXX: replace region with `eu-central-1`-like region + export ENABLE_REAL_AZURE_REMOTE_STORAGE=y + export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}" + export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}" + export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" + export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" + ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)' + + - name: Install postgres binaries + run: cp -a pg_install /tmp/neon/pg_install + + - name: Upload Neon artifact + uses: ./.github/actions/upload + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-artifact + path: /tmp/neon + + # XXX: keep this after the binaries.list is formed, so the coverage can properly work later + - name: Merge and upload coverage data + if: inputs.build-type == 'debug' + uses: ./.github/actions/save-coverage-data + + regress-tests: + # Run test on x64 only + if: inputs.arch == 'x64' + needs: [ build-neon ] + runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }} + container: + image: ${{ inputs.build-tools-image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + # for changed limits, see comments on `options:` earlier in this file + options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 + strategy: + fail-fast: false + matrix: + pg_version: [ v14, v15, v16 ] + steps: + - uses: actions/checkout@v4 + with: + submodules: true + fetch-depth: 1 + + - name: Pytest regression tests + uses: ./.github/actions/run-python-test-set + timeout-minutes: 60 + with: + build_type: ${{ inputs.build-type }} + test_selection: regress + needs_postgres_source: true + run_with_real_s3: true + real_s3_bucket: neon-github-ci-tests + real_s3_region: eu-central-1 + rerun_flaky: true + pg_version: ${{ matrix.pg_version }} + env: + TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} + CHECK_ONDISK_DATA_COMPATIBILITY: nonempty + BUILD_TAG: ${{ inputs.build-tag }} + PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring + PAGESERVER_GET_VECTORED_IMPL: vectored + PAGESERVER_GET_IMPL: vectored + PAGESERVER_VALIDATE_VEC_GET: true + + # Temporary disable this step until we figure out why it's so flaky + # Ref https://github.com/neondatabase/neon/issues/4540 + - name: Merge and upload coverage data + if: | + false && + inputs.build-type == 'debug' && matrix.pg_version == 'v14' + uses: ./.github/actions/save-coverage-data diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 899cae2b86..c132b5b513 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -56,15 +56,27 @@ concurrency: jobs: bench: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} + strategy: + fail-fast: false + matrix: + include: + - DEFAULT_PG_VERSION: 16 + PLATFORM: "neon-staging" + region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} + provisioner: 'k8s-pod' + - DEFAULT_PG_VERSION: 16 + PLATFORM: "azure-staging" + region_id: 'azure-eastus2' + provisioner: 'k8s-neonvm' env: TEST_PG_BENCH_DURATIONS_MATRIX: "300" TEST_PG_BENCH_SCALES_MATRIX: "10,100" POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 14 + DEFAULT_PG_VERSION: ${{ matrix.DEFAULT_PG_VERSION }} TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} - PLATFORM: "neon-staging" + PLATFORM: ${{ matrix.PLATFORM }} runs-on: [ self-hosted, us-east-2, x64 ] container: @@ -85,9 +97,10 @@ jobs: id: create-neon-project uses: ./.github/actions/neon-project-create with: - region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} + region_id: ${{ matrix.region_id }} postgres_version: ${{ env.DEFAULT_PG_VERSION }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} + provisioner: ${{ matrix.provisioner }} - name: Run benchmark uses: ./.github/actions/run-python-test-set @@ -96,13 +109,14 @@ jobs: test_selection: performance run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} + pg_version: ${{ env.DEFAULT_PG_VERSION }} # Set --sparse-ordering option of pytest-order plugin # to ensure tests are running in order of appears in the file. # It's important for test_perf_pgbench.py::test_pgbench_remote_* tests extra_params: -m remote_cluster --sparse-ordering - --timeout 5400 + --timeout 14400 --ignore test_runner/performance/test_perf_olap.py --ignore test_runner/performance/test_perf_pgvector_queries.py --ignore test_runner/performance/test_logical_replication.py @@ -133,6 +147,7 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} replication-tests: + if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install DEFAULT_PG_VERSION: 14 @@ -177,6 +192,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 5400 + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -202,11 +218,14 @@ jobs: # Available platforms: # - neon-captest-new: Freshly created project (1 CU) # - neon-captest-freetier: Use freetier-sized compute (0.25 CU) + # - neonvm-captest-azure-new: Freshly created project (1 CU) in azure region + # - neonvm-captest-azure-freetier: Use freetier-sized compute (0.25 CU) in azure region # - neon-captest-reuse: Reusing existing project # - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs # - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage env: RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }} + DEFAULT_REGION_ID: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} runs-on: ubuntu-22.04 outputs: pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }} @@ -217,23 +236,33 @@ jobs: - name: Generate matrix for pgbench benchmark id: pgbench-compare-matrix run: | + region_id_default=${{ env.DEFAULT_REGION_ID }} matrix='{ + "pg_version" : [ + 16 + ], + "region_id" : [ + "'"$region_id_default"'" + ], "platform": [ "neon-captest-new", "neon-captest-reuse", "neonvm-captest-new" ], "db_size": [ "10gb" ], - "include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" }, - { "platform": "neon-captest-new", "db_size": "50gb" }, - { "platform": "neonvm-captest-freetier", "db_size": "3gb" }, - { "platform": "neonvm-captest-new", "db_size": "50gb" }, - { "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }] + "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-freetier", "db_size": "3gb" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-new", "db_size": "50gb" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }] }' if [ "$(date +%A)" = "Saturday" ]; then - matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"}, - { "platform": "rds-aurora", "db_size": "50gb"}]') + matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb"}, + { "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-aurora", "db_size": "50gb"}]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -285,7 +314,7 @@ jobs: TEST_PG_BENCH_DURATIONS_MATRIX: "60m" TEST_PG_BENCH_SCALES_MATRIX: ${{ matrix.db_size }} POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 14 + DEFAULT_PG_VERSION: ${{ matrix.pg_version }} TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} @@ -310,14 +339,14 @@ jobs: prefix: latest - name: Create Neon Project - if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier"]'), matrix.platform) + if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform) id: create-neon-project uses: ./.github/actions/neon-project-create with: - region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} + region_id: ${{ matrix.region_id }} postgres_version: ${{ env.DEFAULT_PG_VERSION }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} - compute_units: ${{ (matrix.platform == 'neon-captest-freetier' && '[0.25, 0.25]') || '[1, 1]' }} + compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }} provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }} - name: Set up Connection String @@ -330,7 +359,7 @@ jobs: neonvm-captest-sharding-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }} ;; - neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier) + neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier) CONNSTR=${{ steps.create-neon-project.outputs.dsn }} ;; rds-aurora) @@ -355,6 +384,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -368,6 +398,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -381,6 +412,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -407,6 +439,13 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} pgbench-pgvector: + strategy: + fail-fast: false + matrix: + include: + - PLATFORM: "neon-captest-pgvector" + - PLATFORM: "azure-captest-pgvector" + env: TEST_PG_BENCH_DURATIONS_MATRIX: "15m" TEST_PG_BENCH_SCALES_MATRIX: "1" @@ -414,8 +453,9 @@ jobs: DEFAULT_PG_VERSION: 16 TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote + LD_LIBRARY_PATH: /home/nonroot/pg/usr/lib/x86_64-linux-gnu SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} - PLATFORM: "neon-captest-pgvector" + PLATFORM: ${{ matrix.PLATFORM }} runs-on: [ self-hosted, us-east-2, x64 ] container: @@ -425,17 +465,39 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Download Neon artifact - uses: ./.github/actions/download - with: - name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact - path: /tmp/neon/ - prefix: latest + # until https://github.com/neondatabase/neon/issues/8275 is fixed we temporarily install postgresql-16 + # instead of using Neon artifacts containing pgbench + - name: Install postgresql-16 where pytest expects it + run: | + cd /home/nonroot + wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.3-1.pgdg110%2B1_amd64.deb + wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.3-1.pgdg110%2B1_amd64.deb + wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.3-1.pgdg110%2B1_amd64.deb + dpkg -x libpq5_16.3-1.pgdg110+1_amd64.deb pg + dpkg -x postgresql-client-16_16.3-1.pgdg110+1_amd64.deb pg + dpkg -x postgresql-16_16.3-1.pgdg110+1_amd64.deb pg + mkdir -p /tmp/neon/pg_install/v16/bin + ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench + ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql + ln -s /home/nonroot/pg/usr/lib/x86_64-linux-gnu /tmp/neon/pg_install/v16/lib + /tmp/neon/pg_install/v16/bin/pgbench --version + /tmp/neon/pg_install/v16/bin/psql --version - name: Set up Connection String id: set-up-connstr run: | - CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }} + case "${PLATFORM}" in + neon-captest-pgvector) + CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }} + ;; + azure-captest-pgvector) + CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR_AZURE }} + ;; + *) + echo >&2 "Unknown PLATFORM=${PLATFORM}" + exit 1 + ;; + esac echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT @@ -447,6 +509,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -460,6 +523,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -474,11 +538,10 @@ jobs: uses: slackapi/slack-github-action@v1 with: channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Periodic perf testing neon-captest-pgvector: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + slack-message: "Periodic perf testing ${PLATFORM}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} - clickbench-compare: # ClichBench DB for rds-aurora and rds-Postgres deployed to the same clusters # we use for performance testing in pgbench-compare. @@ -722,6 +785,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_user_examples + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index cb7655e039..d4af174fc5 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -125,7 +125,11 @@ jobs: check-codestyle-rust: needs: [ check-permissions, build-build-tools-image ] - runs-on: [ self-hosted, gen3, small ] + strategy: + matrix: + arch: [ x64, arm64 ] + runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} + container: image: ${{ needs.build-build-tools-image.outputs.image }} credentials: @@ -193,291 +197,27 @@ jobs: if: ${{ !cancelled() }} run: cargo deny check --hide-inclusion-graph - build-neon: - needs: [ check-permissions, tag, build-build-tools-image ] - runs-on: [ self-hosted, gen3, large ] - container: - image: ${{ needs.build-build-tools-image.outputs.image }} - credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - # Raise locked memory limit for tokio-epoll-uring. - # On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12), - # io_uring will account the memory of the CQ and SQ as locked. - # More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391 - options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 + build-and-test-locally: + needs: [ tag, build-build-tools-image ] strategy: fail-fast: false matrix: - build_type: [ debug, release ] - env: - BUILD_TYPE: ${{ matrix.build_type }} - GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }} - BUILD_TAG: ${{ needs.tag.outputs.build-tag }} - - steps: - - name: Fix git ownership - run: | - # Workaround for `fatal: detected dubious ownership in repository at ...` - # - # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers - # Ref https://github.com/actions/checkout/issues/785 - # - git config --global --add safe.directory ${{ github.workspace }} - git config --global --add safe.directory ${GITHUB_WORKSPACE} - for r in 14 15 16; do - git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" - git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" - done - - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: true - fetch-depth: 1 - - - name: Set pg 14 revision for caching - id: pg_v14_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT - - - name: Set pg 15 revision for caching - id: pg_v15_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT - - - name: Set pg 16 revision for caching - id: pg_v16_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT - - # Set some environment variables used by all the steps. - # - # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc. - # It also includes --features, if any - # - # CARGO_FEATURES is passed to "cargo metadata". It is separate from CARGO_FLAGS, - # because "cargo metadata" doesn't accept --release or --debug options - # - # We run tests with addtional features, that are turned off by default (e.g. in release builds), see - # corresponding Cargo.toml files for their descriptions. - - name: Set env variables - run: | - CARGO_FEATURES="--features testing" - if [[ $BUILD_TYPE == "debug" ]]; then - cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run" - CARGO_FLAGS="--locked" - elif [[ $BUILD_TYPE == "release" ]]; then - cov_prefix="" - CARGO_FLAGS="--locked --release" - fi - { - echo "cov_prefix=${cov_prefix}" - echo "CARGO_FEATURES=${CARGO_FEATURES}" - echo "CARGO_FLAGS=${CARGO_FLAGS}" - echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" - } >> $GITHUB_ENV - - # Disabled for now - # Don't include the ~/.cargo/registry/src directory. It contains just - # uncompressed versions of the crates in ~/.cargo/registry/cache - # directory, and it's faster to let 'cargo' to rebuild it from the - # compressed crates. -# - name: Cache cargo deps -# id: cache_cargo -# uses: actions/cache@v4 -# with: -# path: | -# ~/.cargo/registry/ -# !~/.cargo/registry/src -# ~/.cargo/git/ -# target/ -# # Fall back to older versions of the key, if no cache for current Cargo.lock was found -# key: | -# v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} -# v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}- - - - name: Cache postgres v14 build - id: cache_pg_14 - uses: actions/cache@v4 - with: - path: pg_install/v14 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - - - name: Cache postgres v15 build - id: cache_pg_15 - uses: actions/cache@v4 - with: - path: pg_install/v15 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - - - name: Cache postgres v16 build - id: cache_pg_16 - uses: actions/cache@v4 - with: - path: pg_install/v16 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - - - name: Build postgres v14 - if: steps.cache_pg_14.outputs.cache-hit != 'true' - run: mold -run make postgres-v14 -j$(nproc) - - - name: Build postgres v15 - if: steps.cache_pg_15.outputs.cache-hit != 'true' - run: mold -run make postgres-v15 -j$(nproc) - - - name: Build postgres v16 - if: steps.cache_pg_16.outputs.cache-hit != 'true' - run: mold -run make postgres-v16 -j$(nproc) - - - name: Build neon extensions - run: mold -run make neon-pg-ext -j$(nproc) - - - name: Build walproposer-lib - run: mold -run make walproposer-lib -j$(nproc) - - - name: Run cargo build - run: | - PQ_LIB_DIR=$(pwd)/pg_install/v16/lib - export PQ_LIB_DIR - ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests - - # Do install *before* running rust tests because they might recompile the - # binaries with different features/flags. - - name: Install rust binaries - run: | - # Install target binaries - mkdir -p /tmp/neon/bin/ - binaries=$( - ${cov_prefix} cargo metadata $CARGO_FEATURES --format-version=1 --no-deps | - jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name' - ) - for bin in $binaries; do - SRC=target/$BUILD_TYPE/$bin - DST=/tmp/neon/bin/$bin - cp "$SRC" "$DST" - done - - # Install test executables and write list of all binaries (for code coverage) - if [[ $BUILD_TYPE == "debug" ]]; then - # Keep bloated coverage data files away from the rest of the artifact - mkdir -p /tmp/coverage/ - - mkdir -p /tmp/neon/test_bin/ - - test_exe_paths=$( - ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES --message-format=json --no-run | - jq -r '.executable | select(. != null)' - ) - for bin in $test_exe_paths; do - SRC=$bin - DST=/tmp/neon/test_bin/$(basename $bin) - - # We don't need debug symbols for code coverage, so strip them out to make - # the artifact smaller. - strip "$SRC" -o "$DST" - echo "$DST" >> /tmp/coverage/binaries.list - done - - for bin in $binaries; do - echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list - done - fi - - - name: Run rust tests - env: - NEXTEST_RETRIES: 3 - run: | - PQ_LIB_DIR=$(pwd)/pg_install/v16/lib - export PQ_LIB_DIR - LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib - export LD_LIBRARY_PATH - - #nextest does not yet support running doctests - cargo test --doc $CARGO_FLAGS $CARGO_FEATURES - - for io_engine in std-fs tokio-epoll-uring ; do - NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES - done - - # Run separate tests for real S3 - export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty - export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests - export REMOTE_STORAGE_S3_REGION=eu-central-1 - ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_s3)' - - # Run separate tests for real Azure Blob Storage - # XXX: replace region with `eu-central-1`-like region - export ENABLE_REAL_AZURE_REMOTE_STORAGE=y - export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}" - export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}" - export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" - export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" - ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)' - - - name: Install postgres binaries - run: cp -a pg_install /tmp/neon/pg_install - - - name: Upload Neon artifact - uses: ./.github/actions/upload - with: - name: neon-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-artifact - path: /tmp/neon - - # XXX: keep this after the binaries.list is formed, so the coverage can properly work later - - name: Merge and upload coverage data - if: matrix.build_type == 'debug' - uses: ./.github/actions/save-coverage-data - - regress-tests: - needs: [ check-permissions, build-neon, build-build-tools-image, tag ] - runs-on: [ self-hosted, gen3, large ] - container: - image: ${{ needs.build-build-tools-image.outputs.image }} - credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - # for changed limits, see comments on `options:` earlier in this file - options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 - strategy: - fail-fast: false - matrix: - build_type: [ debug, release ] - pg_version: [ v14, v15, v16 ] - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: true - fetch-depth: 1 - - - name: Pytest regression tests - uses: ./.github/actions/run-python-test-set - timeout-minutes: 60 - with: - build_type: ${{ matrix.build_type }} - test_selection: regress - needs_postgres_source: true - run_with_real_s3: true - real_s3_bucket: neon-github-ci-tests - real_s3_region: eu-central-1 - rerun_flaky: true - pg_version: ${{ matrix.pg_version }} - env: - TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} - CHECK_ONDISK_DATA_COMPATIBILITY: nonempty - BUILD_TAG: ${{ needs.tag.outputs.build-tag }} - PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring - PAGESERVER_GET_VECTORED_IMPL: vectored - PAGESERVER_GET_IMPL: vectored - PAGESERVER_VALIDATE_VEC_GET: true - - # Temporary disable this step until we figure out why it's so flaky - # Ref https://github.com/neondatabase/neon/issues/4540 - - name: Merge and upload coverage data - if: | - false && - matrix.build_type == 'debug' && matrix.pg_version == 'v14' - uses: ./.github/actions/save-coverage-data + arch: [ x64 ] + build-type: [ debug, release ] + include: + - build-type: release + arch: arm64 + uses: ./.github/workflows/_build-and-test-locally.yml + with: + arch: ${{ matrix.arch }} + build-tools-image: ${{ needs.build-build-tools-image.outputs.image }} + build-tag: ${{ needs.tag.outputs.build-tag }} + build-type: ${{ matrix.build-type }} + secrets: inherit + # Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking get-benchmarks-durations: + if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') outputs: json: ${{ steps.get-benchmark-durations.outputs.json }} needs: [ check-permissions, build-build-tools-image ] @@ -488,7 +228,6 @@ jobs: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init - if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') steps: - name: Checkout uses: actions/checkout@v4 @@ -513,7 +252,8 @@ jobs: echo "json=$(jq --compact-output '.' /tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT benchmarks: - needs: [ check-permissions, build-neon, build-build-tools-image, get-benchmarks-durations ] + if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') + needs: [ check-permissions, build-and-test-locally, build-build-tools-image, get-benchmarks-durations ] runs-on: [ self-hosted, gen3, small ] container: image: ${{ needs.build-build-tools-image.outputs.image }} @@ -522,7 +262,6 @@ jobs: password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} # for changed limits, see comments on `options:` earlier in this file options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 - if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') strategy: fail-fast: false matrix: @@ -570,7 +309,7 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} create-test-report: - needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-build-tools-image ] + needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image ] if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }} outputs: report-url: ${{ steps.create-allure-report.outputs.report-url }} @@ -621,7 +360,7 @@ jobs: }) coverage-report: - needs: [ check-permissions, regress-tests, build-build-tools-image ] + needs: [ check-permissions, build-build-tools-image, build-and-test-locally ] runs-on: [ self-hosted, gen3, small ] container: image: ${{ needs.build-build-tools-image.outputs.image }} @@ -772,7 +511,8 @@ jobs: pull: true file: Dockerfile cache-from: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon:cache-{0},mode=max', matrix.arch) || '' }} + # 23.07.2024 temporarily disable cache saving in the registry as it is very slow + # cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon:cache-{0},mode=max', matrix.arch) || '' }} tags: | neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} @@ -865,7 +605,8 @@ jobs: pull: true file: Dockerfile.compute-node cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/compute-node-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }} + # 23.07.2024 temporarily disable cache saving in the registry as it is very slow + # cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/compute-node-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }} tags: | neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} @@ -885,7 +626,8 @@ jobs: file: Dockerfile.compute-node target: neon-pg-ext-test cache-from: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }} + # 23.07.2024 temporarily disable cache saving in the registry as it is very slow + # cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }} tags: | neondatabase/neon-test-extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }} @@ -1223,7 +965,7 @@ jobs: exit 1 deploy: - needs: [ check-permissions, promote-images, tag, regress-tests, trigger-custom-extensions-build-and-wait ] + needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ] if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy' runs-on: [ self-hosted, gen3, small ] @@ -1324,7 +1066,7 @@ jobs: }) promote-compatibility-data: - needs: [ check-permissions, promote-images, tag, regress-tests ] + needs: [ check-permissions, promote-images, tag, build-and-test-locally ] if: github.ref_name == 'release' runs-on: [ self-hosted, gen3, small ] @@ -1363,7 +1105,7 @@ jobs: done pin-build-tools-image: - needs: [ build-build-tools-image, promote-images, regress-tests ] + needs: [ build-build-tools-image, promote-images, build-and-test-locally ] if: github.ref_name == 'main' uses: ./.github/workflows/pin-build-tools-image.yml with: @@ -1385,7 +1127,7 @@ jobs: needs: - check-codestyle-python - check-codestyle-rust - - regress-tests + - build-and-test-locally - test-images runs-on: ubuntu-22.04 steps: diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 11ff634b6c..d4870e16ad 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -133,221 +133,6 @@ jobs: - name: Check that no warnings are produced run: ./run_clippy.sh - check-linux-arm-build: - needs: [ check-permissions, build-build-tools-image ] - timeout-minutes: 90 - runs-on: [ self-hosted, small-arm64 ] - - env: - # Use release build only, to have less debug info around - # Hence keeping target/ (and general cache size) smaller - BUILD_TYPE: release - CARGO_FEATURES: --features testing - CARGO_FLAGS: --release - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} - - container: - image: ${{ needs.build-build-tools-image.outputs.image }} - credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - options: --init - - steps: - - name: Fix git ownership - run: | - # Workaround for `fatal: detected dubious ownership in repository at ...` - # - # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers - # Ref https://github.com/actions/checkout/issues/785 - # - git config --global --add safe.directory ${{ github.workspace }} - git config --global --add safe.directory ${GITHUB_WORKSPACE} - for r in 14 15 16; do - git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" - git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" - done - - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: true - fetch-depth: 1 - - - name: Set pg 14 revision for caching - id: pg_v14_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT - - - name: Set pg 15 revision for caching - id: pg_v15_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT - - - name: Set pg 16 revision for caching - id: pg_v16_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT - - - name: Set env variables - run: | - echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV - - - name: Cache postgres v14 build - id: cache_pg_14 - uses: actions/cache@v4 - with: - path: pg_install/v14 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - - name: Cache postgres v15 build - id: cache_pg_15 - uses: actions/cache@v4 - with: - path: pg_install/v15 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - - name: Cache postgres v16 build - id: cache_pg_16 - uses: actions/cache@v4 - with: - path: pg_install/v16 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - - name: Build postgres v14 - if: steps.cache_pg_14.outputs.cache-hit != 'true' - run: mold -run make postgres-v14 -j$(nproc) - - - name: Build postgres v15 - if: steps.cache_pg_15.outputs.cache-hit != 'true' - run: mold -run make postgres-v15 -j$(nproc) - - - name: Build postgres v16 - if: steps.cache_pg_16.outputs.cache-hit != 'true' - run: mold -run make postgres-v16 -j$(nproc) - - - name: Build neon extensions - run: mold -run make neon-pg-ext -j$(nproc) - - - name: Build walproposer-lib - run: mold -run make walproposer-lib -j$(nproc) - - - name: Run cargo build - run: | - PQ_LIB_DIR=$(pwd)/pg_install/v16/lib - export PQ_LIB_DIR - mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests -j$(nproc) - - - name: Run cargo test - env: - NEXTEST_RETRIES: 3 - run: | - PQ_LIB_DIR=$(pwd)/pg_install/v16/lib - export PQ_LIB_DIR - LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib - export LD_LIBRARY_PATH - - cargo nextest run $CARGO_FEATURES -j$(nproc) - - # Run separate tests for real S3 - export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty - export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests - export REMOTE_STORAGE_S3_REGION=eu-central-1 - # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - cargo nextest run --package remote_storage --test test_real_s3 -j$(nproc) - - # Run separate tests for real Azure Blob Storage - # XXX: replace region with `eu-central-1`-like region - export ENABLE_REAL_AZURE_REMOTE_STORAGE=y - export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}" - export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}" - export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" - export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" - # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - cargo nextest run --package remote_storage --test test_real_azure -j$(nproc) - - check-codestyle-rust-arm: - needs: [ check-permissions, build-build-tools-image ] - timeout-minutes: 90 - runs-on: [ self-hosted, small-arm64 ] - - container: - image: ${{ needs.build-build-tools-image.outputs.image }} - credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - options: --init - - strategy: - fail-fast: false - matrix: - build_type: [ debug, release ] - - steps: - - name: Fix git ownership - run: | - # Workaround for `fatal: detected dubious ownership in repository at ...` - # - # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers - # Ref https://github.com/actions/checkout/issues/785 - # - git config --global --add safe.directory ${{ github.workspace }} - git config --global --add safe.directory ${GITHUB_WORKSPACE} - for r in 14 15 16; do - git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" - git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" - done - - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: true - fetch-depth: 1 - - # Some of our rust modules use FFI and need those to be checked - - name: Get postgres headers - run: make postgres-headers -j$(nproc) - - # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations. - # This will catch compiler & clippy warnings in all feature combinations. - # TODO: use cargo hack for build and test as well, but, that's quite expensive. - # NB: keep clippy args in sync with ./run_clippy.sh - - run: | - CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")" - if [ "$CLIPPY_COMMON_ARGS" = "" ]; then - echo "No clippy args found in .neon_clippy_args" - exit 1 - fi - echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV - - - name: Run cargo clippy (debug) - if: matrix.build_type == 'debug' - run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS - - name: Run cargo clippy (release) - if: matrix.build_type == 'release' - run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS - - - name: Check documentation generation - if: matrix.build_type == 'release' - run: cargo doc --workspace --no-deps --document-private-items -j$(nproc) - env: - RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links" - - # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run - - name: Check formatting - if: ${{ !cancelled() && matrix.build_type == 'release' }} - run: cargo fmt --all -- --check - - # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci - - name: Check rust dependencies - if: ${{ !cancelled() && matrix.build_type == 'release' }} - run: | - cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date - cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack - - # https://github.com/EmbarkStudios/cargo-deny - - name: Check rust licenses/bans/advisories/sources - if: ${{ !cancelled() && matrix.build_type == 'release' }} - run: cargo deny check - gather-rust-build-stats: needs: [ check-permissions, build-build-tools-image ] if: | diff --git a/Cargo.lock b/Cargo.lock index 3541f7a928..04adccba7f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -261,15 +261,6 @@ version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c59bdb34bc650a32731b31bd8f0829cc15d24a708ee31559e0bb34f2bc320cba" -[[package]] -name = "atomic-polyfill" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c314e70d181aa6053b26e3f7fbf86d1dfff84f816a6175b967666b3506ef7289" -dependencies = [ - "critical-section", -] - [[package]] name = "atomic-take" version = "1.1.0" @@ -1236,6 +1227,7 @@ dependencies = [ "regex", "remote_storage", "reqwest 0.12.4", + "rlimit", "rust-ini", "serde", "serde_json", @@ -1367,6 +1359,7 @@ dependencies = [ "tracing", "url", "utils", + "whoami", "workspace_hack", ] @@ -1449,12 +1442,6 @@ dependencies = [ "itertools", ] -[[package]] -name = "critical-section" -version = "1.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6548a0ad5d2549e111e1f6a11a6c2e2d00ce6a3dafe22948d67c2b443f775e52" - [[package]] name = "crossbeam-channel" version = "0.5.8" @@ -2027,16 +2014,6 @@ dependencies = [ "tokio-util", ] -[[package]] -name = "fs2" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213" -dependencies = [ - "libc", - "winapi", -] - [[package]] name = "fsevent-sys" version = "4.1.0" @@ -2290,15 +2267,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "hash32" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47d60b12902ba28e2730cd37e95b8c9223af2808df9e902d4df49588d1470606" -dependencies = [ - "byteorder", -] - [[package]] name = "hashbrown" version = "0.12.3" @@ -2347,18 +2315,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "heapless" -version = "0.8.0" -source = "git+https://github.com/japaric/heapless.git?rev=644653bf3b831c6bb4963be2de24804acf5e5001#644653bf3b831c6bb4963be2de24804acf5e5001" -dependencies = [ - "atomic-polyfill", - "hash32", - "rustc_version", - "spin 0.9.8", - "stable_deref_trait", -] - [[package]] name = "heck" version = "0.4.1" @@ -2392,16 +2348,6 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6fe2267d4ed49bc07b63801559be28c718ea06c4738b7a03c94df7386d2cde46" -[[package]] -name = "histogram" -version = "0.7.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e673d137229619d5c2c8903b6ed5852b43636c0017ff2e66b1aafb8ccf04b80b" -dependencies = [ - "serde", - "thiserror", -] - [[package]] name = "hmac" version = "0.12.1" @@ -3242,16 +3188,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "nu-ansi-term" -version = "0.46.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" -dependencies = [ - "overload", - "winapi", -] - [[package]] name = "num" version = "0.4.1" @@ -3547,12 +3483,6 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a" -[[package]] -name = "overload" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" - [[package]] name = "p256" version = "0.11.1" @@ -4413,6 +4343,7 @@ dependencies = [ "tracing-opentelemetry", "tracing-subscriber", "tracing-utils", + "typed-json", "url", "urlencoding", "utils", @@ -4611,6 +4542,15 @@ dependencies = [ "bitflags 1.3.2", ] +[[package]] +name = "redox_syscall" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" +dependencies = [ + "bitflags 1.3.2", +] + [[package]] name = "regex" version = "1.10.2" @@ -4672,6 +4612,7 @@ name = "remote_storage" version = "0.1.0" dependencies = [ "anyhow", + "async-stream", "async-trait", "aws-config", "aws-credential-types", @@ -4901,6 +4842,15 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "rlimit" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3560f70f30a0f16d11d01ed078a07740fe6b489667abc7c7b029155d9f21c3d8" +dependencies = [ + "libc", +] + [[package]] name = "routerify" version = "3.0.0" @@ -5169,7 +5119,6 @@ dependencies = [ "crc32c", "desim", "fail", - "fs2", "futures", "git-version", "hex", @@ -5196,6 +5145,8 @@ dependencies = [ "sha2", "signal-hook", "storage_broker", + "strum", + "strum_macros", "thiserror", "tokio", "tokio-io-timeout", @@ -5704,9 +5655,6 @@ name = "spin" version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" -dependencies = [ - "lock_api", -] [[package]] name = "spki" @@ -5728,12 +5676,6 @@ dependencies = [ "der 0.7.8", ] -[[package]] -name = "stable_deref_trait" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" - [[package]] name = "static_assertions" version = "1.1.0" @@ -5810,6 +5752,28 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "storage_controller_client" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-trait", + "bytes", + "futures", + "pageserver_api", + "pageserver_client", + "postgres", + "reqwest 0.12.4", + "serde", + "thiserror", + "tokio", + "tokio-postgres", + "tokio-stream", + "tokio-util", + "utils", + "workspace_hack", +] + [[package]] name = "storage_scrubber" version = "0.1.0" @@ -5829,7 +5793,6 @@ dependencies = [ "futures", "futures-util", "hex", - "histogram", "humantime", "itertools", "once_cell", @@ -5844,6 +5807,7 @@ dependencies = [ "serde", "serde_json", "serde_with", + "storage_controller_client", "thiserror", "tokio", "tokio-postgres", @@ -5873,6 +5837,7 @@ dependencies = [ "reqwest 0.12.4", "serde", "serde_json", + "storage_controller_client", "thiserror", "tokio", "tracing", @@ -6500,17 +6465,6 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" -[[package]] -name = "trace" -version = "0.1.0" -dependencies = [ - "anyhow", - "clap", - "pageserver_api", - "utils", - "workspace_hack", -] - [[package]] name = "tracing" version = "0.1.37" @@ -6610,7 +6564,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77" dependencies = [ "matchers", - "nu-ansi-term", "once_cell", "regex", "serde", @@ -6675,6 +6628,16 @@ dependencies = [ "static_assertions", ] +[[package]] +name = "typed-json" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6024a8d0025400b3f6b189366e9aa92012cf9c4fe1cd2620848dd61425c49eed" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "typenum" version = "1.16.0" @@ -6809,7 +6772,6 @@ dependencies = [ "criterion", "fail", "futures", - "heapless", "hex", "hex-literal", "humantime", @@ -6971,6 +6933,12 @@ version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +[[package]] +name = "wasite" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b" + [[package]] name = "wasm-bindgen" version = "0.2.92" @@ -7123,6 +7091,17 @@ dependencies = [ "once_cell", ] +[[package]] +name = "whoami" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a44ab49fad634e88f55bf8f9bb3abd2f27d7204172a112c7c9987e01c1c94ea9" +dependencies = [ + "redox_syscall 0.4.1", + "wasite", + "web-sys", +] + [[package]] name = "winapi" version = "0.3.9" diff --git a/Cargo.toml b/Cargo.toml index fc3dd51809..7749378114 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,9 +13,9 @@ members = [ "safekeeper", "storage_broker", "storage_controller", + "storage_controller/client", "storage_scrubber", "workspace_hack", - "trace", "libs/compute_api", "libs/pageserver_api", "libs/postgres_ffi", @@ -84,7 +84,6 @@ enumset = "1.0.12" fail = "0.5.0" fallible-iterator = "0.2" framed-websockets = { version = "0.1.0", git = "https://github.com/neondatabase/framed-websockets" } -fs2 = "0.4.3" futures = "0.3" futures-core = "0.3" futures-util = "0.3" @@ -184,14 +183,16 @@ tower-service = "0.3.2" tracing = "0.1" tracing-error = "0.2.0" tracing-opentelemetry = "0.21.0" -tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] } +tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } twox-hash = { version = "1.6.3", default-features = false } +typed-json = "0.1" url = "2.2" urlencoding = "2.1" uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] } walkdir = "2.3.2" rustls-native-certs = "0.7" x509-parser = "0.15" +whoami = "1.5.1" ## TODO replace this with tracing env_logger = "0.10" @@ -203,9 +204,6 @@ postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } -## Other git libraries -heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending - ## Local libraries compute_api = { version = "0.1", path = "./libs/compute_api/" } consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" } @@ -221,6 +219,7 @@ remote_storage = { version = "0.1", path = "./libs/remote_storage/" } safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" } desim = { version = "0.1", path = "./libs/desim" } storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy. +storage_controller_client = { path = "./storage_controller/client" } tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" } tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" } utils = { version = "0.1", path = "./libs/utils/" } diff --git a/Dockerfile b/Dockerfile index a41598ef72..ace112cccf 100644 --- a/Dockerfile +++ b/Dockerfile @@ -93,13 +93,14 @@ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/ # By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config. # Now, when `docker run ... pageserver` is run, it can start without errors, yet will have some default dummy values. -RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \ - && /usr/local/bin/pageserver -D /data/.neon/ --init \ - -c "id=1234" \ - -c "broker_endpoint='http://storage_broker:50051'" \ - -c "pg_distrib_dir='/usr/local/'" \ - -c "listen_pg_addr='0.0.0.0:6400'" \ - -c "listen_http_addr='0.0.0.0:9898'" +RUN mkdir -p /data/.neon/ && \ + echo "id=1234" > "/data/.neon/identity.toml" && \ + echo "broker_endpoint='http://storage_broker:50051'\n" \ + "pg_distrib_dir='/usr/local/'\n" \ + "listen_pg_addr='0.0.0.0:6400'\n" \ + "listen_http_addr='0.0.0.0:9898'\n" \ + > /data/.neon/pageserver.toml && \ + chown -R neon:neon /data/.neon # When running a binary that links with libpq, default to using our most recent postgres version. Binaries # that want a particular postgres version will select it explicitly: this is just a default. @@ -110,3 +111,6 @@ VOLUME ["/data"] USER neon EXPOSE 6400 EXPOSE 9898 + +CMD /usr/local/bin/pageserver -D /data/.neon + diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 7ab685625a..48a52bfc6d 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -311,9 +311,12 @@ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz FROM build-deps AS rum-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY patches/rum.patch /rum.patch + RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \ echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \ mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . && \ + patch -p1 < /rum.patch && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control diff --git a/Makefile b/Makefile index 942867d81a..de298303e3 100644 --- a/Makefile +++ b/Makefile @@ -69,6 +69,8 @@ CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1 # Set PQ_LIB_DIR to make sure `storage_controller` get linked with bundled libpq (through diesel) CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib +CACHEDIR_TAG_CONTENTS := "Signature: 8a477f597d28d172789f06886806bc55" + # # Top level Makefile to build Neon and PostgreSQL # @@ -79,15 +81,24 @@ all: neon postgres neon-pg-ext # # The 'postgres_ffi' depends on the Postgres headers. .PHONY: neon -neon: postgres-headers walproposer-lib +neon: postgres-headers walproposer-lib cargo-target-dir +@echo "Compiling Neon" $(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS) +.PHONY: cargo-target-dir +cargo-target-dir: + # https://github.com/rust-lang/cargo/issues/14281 + mkdir -p target + test -e target/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > target/CACHEDIR.TAG ### PostgreSQL parts # Some rules are duplicated for Postgres v14 and 15. We may want to refactor # to avoid the duplication in the future, but it's tolerable for now. # $(POSTGRES_INSTALL_DIR)/build/%/config.status: + + mkdir -p $(POSTGRES_INSTALL_DIR) + test -e $(POSTGRES_INSTALL_DIR)/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > $(POSTGRES_INSTALL_DIR)/CACHEDIR.TAG + +@echo "Configuring Postgres $* build" @test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \ echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \ diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 8f96530a9d..8ceb8f2ad2 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -44,3 +44,4 @@ vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" } zstd = "0.13" bytes = "1.0" rust-ini = "0.20.0" +rlimit = "0.10.1" diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 7bf5db5a57..0ba2c1aeb4 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -6,7 +6,7 @@ //! - Every start is a fresh start, so the data directory is removed and //! initialized again on each run. //! - If remote_extension_config is provided, it will be used to fetch extensions list -//! and download `shared_preload_libraries` from the remote storage. +//! and download `shared_preload_libraries` from the remote storage. //! - Next it will put configuration files into the `PGDATA` directory. //! - Sync safekeepers and get commit LSN. //! - Get `basebackup` from pageserver using the returned on the previous step LSN. @@ -33,7 +33,6 @@ //! -b /usr/local/bin/postgres \ //! -r http://pg-ext-s3-gateway \ //! ``` -//! use std::collections::HashMap; use std::fs::File; use std::path::Path; @@ -64,6 +63,7 @@ use compute_tools::monitor::launch_monitor; use compute_tools::params::*; use compute_tools::spec::*; use compute_tools::swap::resize_swap; +use rlimit::{setrlimit, Resource}; // this is an arbitrary build tag. Fine as a default / for testing purposes // in-case of not-set environment var @@ -72,6 +72,9 @@ const BUILD_TAG_DEFAULT: &str = "latest"; fn main() -> Result<()> { let (build_tag, clap_args) = init()?; + // enable core dumping for all child processes + setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?; + let (pg_handle, start_pg_result) = { // Enter startup tracing context let _startup_context_guard = startup_context_from_env(); diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index eced6fc0b2..91855d954d 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -56,6 +56,7 @@ pub struct ComputeNode { /// - we push new spec and it does reconfiguration /// - but then something happens and compute pod / VM is destroyed, /// so k8s controller starts it again with the **old** spec + /// /// and the same for empty computes: /// - we started compute without any spec /// - we push spec and it does configuration @@ -1116,7 +1117,7 @@ impl ComputeNode { // EKS worker nodes have following core dump settings: // /proc/sys/kernel/core_pattern -> core // /proc/sys/kernel/core_uses_pid -> 1 - // ulimint -c -> unlimited + // ulimit -c -> unlimited // which results in core dumps being written to postgres data directory as core.. // // Use that as a default location and pattern, except macos where core dumps are written diff --git a/compute_tools/src/migration.rs b/compute_tools/src/migration.rs index 61dcf01c84..22ab145eda 100644 --- a/compute_tools/src/migration.rs +++ b/compute_tools/src/migration.rs @@ -9,6 +9,9 @@ pub(crate) struct MigrationRunner<'m> { impl<'m> MigrationRunner<'m> { pub fn new(client: &'m mut Client, migrations: &'m [&'m str]) -> Self { + // The neon_migration.migration_id::id column is a bigint, which is equivalent to an i64 + assert!(migrations.len() + 1 < i64::MAX as usize); + Self { client, migrations } } @@ -22,11 +25,8 @@ impl<'m> MigrationRunner<'m> { Ok(row.get::<&str, i64>("id")) } - fn update_migration_id(&mut self) -> Result<()> { - let setval = format!( - "UPDATE neon_migration.migration_id SET id={}", - self.migrations.len() - ); + fn update_migration_id(&mut self, migration_id: i64) -> Result<()> { + let setval = format!("UPDATE neon_migration.migration_id SET id={}", migration_id); self.client .simple_query(&setval) @@ -57,44 +57,49 @@ impl<'m> MigrationRunner<'m> { pub fn run_migrations(mut self) -> Result<()> { self.prepare_migrations()?; - let mut current_migration: usize = self.get_migration_id()? as usize; - let starting_migration_id = current_migration; - - let query = "BEGIN"; - self.client - .simple_query(query) - .context("run_migrations begin")?; - + let mut current_migration = self.get_migration_id()? as usize; while current_migration < self.migrations.len() { + macro_rules! migration_id { + ($cm:expr) => { + ($cm + 1) as i64 + }; + } + let migration = self.migrations[current_migration]; if migration.starts_with("-- SKIP") { - info!("Skipping migration id={}", current_migration); + info!("Skipping migration id={}", migration_id!(current_migration)); } else { info!( "Running migration id={}:\n{}\n", - current_migration, migration + migration_id!(current_migration), + migration ); + + self.client + .simple_query("BEGIN") + .context("begin migration")?; + self.client.simple_query(migration).with_context(|| { - format!("run_migration current_migration={}", current_migration) + format!( + "run_migrations migration id={}", + migration_id!(current_migration) + ) })?; + + // Migration IDs start at 1 + self.update_migration_id(migration_id!(current_migration))?; + + self.client + .simple_query("COMMIT") + .context("commit migration")?; + + info!("Finished migration id={}", migration_id!(current_migration)); } current_migration += 1; } - self.update_migration_id()?; - - let query = "COMMIT"; - self.client - .simple_query(query) - .context("run_migrations commit")?; - - info!( - "Ran {} migrations", - (self.migrations.len() - starting_migration_id) - ); - Ok(()) } } diff --git a/compute_tools/src/migrations/0000-neon_superuser_bypass_rls.sql b/compute_tools/src/migrations/0001-neon_superuser_bypass_rls.sql similarity index 100% rename from compute_tools/src/migrations/0000-neon_superuser_bypass_rls.sql rename to compute_tools/src/migrations/0001-neon_superuser_bypass_rls.sql diff --git a/compute_tools/src/migrations/0001-alter_roles.sql b/compute_tools/src/migrations/0002-alter_roles.sql similarity index 100% rename from compute_tools/src/migrations/0001-alter_roles.sql rename to compute_tools/src/migrations/0002-alter_roles.sql diff --git a/compute_tools/src/migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql b/compute_tools/src/migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql similarity index 100% rename from compute_tools/src/migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql rename to compute_tools/src/migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql diff --git a/compute_tools/src/migrations/0003-grant_pg_monitor_to_neon_superuser.sql b/compute_tools/src/migrations/0004-grant_pg_monitor_to_neon_superuser.sql similarity index 100% rename from compute_tools/src/migrations/0003-grant_pg_monitor_to_neon_superuser.sql rename to compute_tools/src/migrations/0004-grant_pg_monitor_to_neon_superuser.sql diff --git a/compute_tools/src/migrations/0004-grant_all_on_tables_to_neon_superuser.sql b/compute_tools/src/migrations/0005-grant_all_on_tables_to_neon_superuser.sql similarity index 100% rename from compute_tools/src/migrations/0004-grant_all_on_tables_to_neon_superuser.sql rename to compute_tools/src/migrations/0005-grant_all_on_tables_to_neon_superuser.sql diff --git a/compute_tools/src/migrations/0005-grant_all_on_sequences_to_neon_superuser.sql b/compute_tools/src/migrations/0006-grant_all_on_sequences_to_neon_superuser.sql similarity index 100% rename from compute_tools/src/migrations/0005-grant_all_on_sequences_to_neon_superuser.sql rename to compute_tools/src/migrations/0006-grant_all_on_sequences_to_neon_superuser.sql diff --git a/compute_tools/src/migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql b/compute_tools/src/migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql similarity index 100% rename from compute_tools/src/migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql rename to compute_tools/src/migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql diff --git a/compute_tools/src/migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql b/compute_tools/src/migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql similarity index 100% rename from compute_tools/src/migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql rename to compute_tools/src/migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql diff --git a/compute_tools/src/migrations/0008-revoke_replication_for_previously_allowed_roles.sql b/compute_tools/src/migrations/0009-revoke_replication_for_previously_allowed_roles.sql similarity index 100% rename from compute_tools/src/migrations/0008-revoke_replication_for_previously_allowed_roles.sql rename to compute_tools/src/migrations/0009-revoke_replication_for_previously_allowed_roles.sql diff --git a/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql b/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql new file mode 100644 index 0000000000..28750e00dd --- /dev/null +++ b/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql @@ -0,0 +1,7 @@ +DO $$ +BEGIN + IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN + EXECUTE 'GRANT EXECUTE ON FUNCTION pg_export_snapshot TO neon_superuser'; + EXECUTE 'GRANT EXECUTE ON FUNCTION pg_log_standby_snapshot TO neon_superuser'; + END IF; +END $$; diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 37090b08fd..6a87263821 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -777,19 +777,22 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> { // Add new migrations in numerical order. let migrations = [ - include_str!("./migrations/0000-neon_superuser_bypass_rls.sql"), - include_str!("./migrations/0001-alter_roles.sql"), - include_str!("./migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql"), - include_str!("./migrations/0003-grant_pg_monitor_to_neon_superuser.sql"), - include_str!("./migrations/0004-grant_all_on_tables_to_neon_superuser.sql"), - include_str!("./migrations/0005-grant_all_on_sequences_to_neon_superuser.sql"), + include_str!("./migrations/0001-neon_superuser_bypass_rls.sql"), + include_str!("./migrations/0002-alter_roles.sql"), + include_str!("./migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql"), + include_str!("./migrations/0004-grant_pg_monitor_to_neon_superuser.sql"), + include_str!("./migrations/0005-grant_all_on_tables_to_neon_superuser.sql"), + include_str!("./migrations/0006-grant_all_on_sequences_to_neon_superuser.sql"), include_str!( - "./migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql" + "./migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql" ), include_str!( - "./migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql" + "./migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql" + ), + include_str!("./migrations/0009-revoke_replication_for_previously_allowed_roles.sql"), + include_str!( + "./migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql" ), - include_str!("./migrations/0008-revoke_replication_for_previously_allowed_roles.sql"), ]; MigrationRunner::new(client, &migrations).run_migrations()?; diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index e62f3b8a47..487ac8f047 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -40,6 +40,7 @@ safekeeper_api.workspace = true postgres_connection.workspace = true storage_broker.workspace = true utils.workspace = true +whoami.workspace = true compute_api.workspace = true workspace_hack.workspace = true diff --git a/control_plane/src/broker.rs b/control_plane/src/broker.rs index c3cfc140da..c8ac5d8981 100644 --- a/control_plane/src/broker.rs +++ b/control_plane/src/broker.rs @@ -1,9 +1,9 @@ //! Code to manage the storage broker //! -//! In the local test environment, the data for each safekeeper is stored in +//! In the local test environment, the storage broker stores its data directly in //! //! ```text -//! .neon/safekeepers/ +//! .neon //! ``` use std::time::Duration; diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index f0403b1796..ba4f98d945 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -1,8 +1,10 @@ //! Code to manage pageservers //! -//! In the local test environment, the pageserver stores its data directly in +//! In the local test environment, the data for each pageserver is stored in //! -//! .neon/ +//! ```text +//! .neon/pageserver_ +//! ``` //! use std::collections::HashMap; @@ -23,6 +25,7 @@ use pageserver_client::mgmt_api; use postgres_backend::AuthType; use postgres_connection::{parse_host_port, PgConnectionConfig}; use utils::auth::{Claims, Scope}; +use utils::id::NodeId; use utils::{ id::{TenantId, TimelineId}, lsn::Lsn, @@ -72,6 +75,10 @@ impl PageServerNode { } } + fn pageserver_make_identity_toml(&self, node_id: NodeId) -> toml_edit::Document { + toml_edit::Document::from_str(&format!("id={node_id}")).unwrap() + } + fn pageserver_init_make_toml( &self, conf: NeonLocalInitPageserverConf, @@ -184,6 +191,19 @@ impl PageServerNode { .write_all(config.to_string().as_bytes()) .context("write pageserver toml")?; drop(config_file); + + let identity_file_path = datadir.join("identity.toml"); + let mut identity_file = std::fs::OpenOptions::new() + .create_new(true) + .write(true) + .open(identity_file_path) + .with_context(|| format!("open identity toml for write: {config_file_path:?}"))?; + let identity_toml = self.pageserver_make_identity_toml(node_id); + identity_file + .write_all(identity_toml.to_string().as_bytes()) + .context("write identity toml")?; + drop(identity_toml); + // TODO: invoke a TBD config-check command to validate that pageserver will start with the written config // Write metadata file, used by pageserver on startup to register itself with @@ -349,11 +369,6 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?, - trace_read_requests: settings - .remove("trace_read_requests") - .map(|x| x.parse::()) - .transpose() - .context("Failed to parse 'trace_read_requests' as bool")?, eviction_policy: settings .remove("eviction_policy") .map(serde_json::from_str) @@ -454,11 +469,6 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?, - trace_read_requests: settings - .remove("trace_read_requests") - .map(|x| x.parse::()) - .transpose() - .context("Failed to parse 'trace_read_requests' as bool")?, eviction_policy: settings .remove("eviction_policy") .map(serde_json::from_str) diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 47103a2e0a..d7aedd711a 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -29,7 +29,6 @@ use utils::{ pub struct StorageController { env: LocalEnv, listen: String, - path: Utf8PathBuf, private_key: Option>, public_key: Option, postgres_port: u16, @@ -41,6 +40,8 @@ const COMMAND: &str = "storage_controller"; const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16; +const DB_NAME: &str = "storage_controller"; + #[derive(Serialize, Deserialize)] pub struct AttachHookRequest { pub tenant_shard_id: TenantShardId, @@ -65,10 +66,6 @@ pub struct InspectResponse { impl StorageController { pub fn from_env(env: &LocalEnv) -> Self { - let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone()) - .unwrap() - .join("attachments.json"); - // Makes no sense to construct this if pageservers aren't going to use it: assume // pageservers have control plane API set let listen_url = env.control_plane_api.clone().unwrap(); @@ -128,7 +125,6 @@ impl StorageController { Self { env: env.clone(), - path, listen, private_key, public_key, @@ -203,7 +199,6 @@ impl StorageController { /// /// Returns the database url pub async fn setup_database(&self) -> anyhow::Result { - const DB_NAME: &str = "storage_controller"; let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port); let pg_bin_dir = self.get_pg_bin_dir().await?; @@ -232,6 +227,30 @@ impl StorageController { Ok(database_url) } + pub async fn connect_to_database( + &self, + ) -> anyhow::Result<( + tokio_postgres::Client, + tokio_postgres::Connection, + )> { + tokio_postgres::Config::new() + .host("localhost") + .port(self.postgres_port) + // The user is the ambient operating system user name. + // That is an impurity which we want to fix in => TODO https://github.com/neondatabase/neon/issues/8400 + // + // Until we get there, use the ambient operating system user name. + // Recent tokio-postgres versions default to this if the user isn't specified. + // But tokio-postgres fork doesn't have this upstream commit: + // https://github.com/sfackler/rust-postgres/commit/cb609be758f3fb5af537f04b584a2ee0cebd5e79 + // => we should rebase our fork => TODO https://github.com/neondatabase/neon/issues/8399 + .user(&whoami::username()) + .dbname(DB_NAME) + .connect(tokio_postgres::NoTls) + .await + .map_err(anyhow::Error::new) + } + pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> { // Start a vanilla Postgres process used by the storage controller for persistence. let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone()) @@ -256,18 +275,21 @@ impl StorageController { if !status.success() { anyhow::bail!("initdb failed with status {status}"); } - - // Write a minimal config file: - // - Specify the port, since this is chosen dynamically - // - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing - // the storage controller we don't want a slow local disk to interfere with that. - tokio::fs::write( - &pg_data_path.join("postgresql.conf"), - format!("port = {}\nfsync=off\n", self.postgres_port), - ) - .await?; }; + // Write a minimal config file: + // - Specify the port, since this is chosen dynamically + // - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing + // the storage controller we don't want a slow local disk to interfere with that. + // + // NB: it's important that we rewrite this file on each start command so we propagate changes + // from `LocalEnv`'s config file (`.neon/config`). + tokio::fs::write( + &pg_data_path.join("postgresql.conf"), + format!("port = {}\nfsync=off\n", self.postgres_port), + ) + .await?; + println!("Starting storage controller database..."); let db_start_args = [ "-w", @@ -296,11 +318,38 @@ impl StorageController { // Run migrations on every startup, in case something changed. let database_url = self.setup_database().await?; + // We support running a startup SQL script to fiddle with the database before we launch storcon. + // This is used by the test suite. + let startup_script_path = self + .env + .base_data_dir + .join("storage_controller_db.startup.sql"); + let startup_script = match tokio::fs::read_to_string(&startup_script_path).await { + Ok(script) => { + tokio::fs::remove_file(startup_script_path).await?; + script + } + Err(e) => { + if e.kind() == std::io::ErrorKind::NotFound { + // always run some startup script so that this code path doesn't bit rot + "BEGIN; COMMIT;".to_string() + } else { + anyhow::bail!("Failed to read startup script: {e}") + } + } + }; + let (mut client, conn) = self.connect_to_database().await?; + let conn = tokio::spawn(conn); + let tx = client.build_transaction(); + let tx = tx.start().await?; + tx.batch_execute(&startup_script).await?; + tx.commit().await?; + drop(client); + conn.await??; + let mut args = vec![ "-l", &self.listen, - "-p", - self.path.as_ref(), "--dev", "--database-url", &database_url, diff --git a/control_plane/storcon_cli/Cargo.toml b/control_plane/storcon_cli/Cargo.toml index f96f0084b2..be69208d0d 100644 --- a/control_plane/storcon_cli/Cargo.toml +++ b/control_plane/storcon_cli/Cargo.toml @@ -17,6 +17,7 @@ pageserver_client.workspace = true reqwest.workspace = true serde.workspace = true serde_json = { workspace = true, features = ["raw_value"] } +storage_controller_client.workspace = true thiserror.workspace = true tokio.workspace = true tracing.workspace = true diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index b2c5dfe58a..5c1add070a 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -14,15 +14,15 @@ use pageserver_api::{ }, shard::{ShardStripeSize, TenantShardId}, }; -use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt}; +use pageserver_client::mgmt_api::{self}; use reqwest::{Method, StatusCode, Url}; -use serde::{de::DeserializeOwned, Serialize}; use utils::id::{NodeId, TenantId}; use pageserver_api::controller_api::{ NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, TenantShardMigrateRequest, TenantShardMigrateResponse, }; +use storage_controller_client::control_api::Client; #[derive(Subcommand, Debug)] enum Command { @@ -56,6 +56,10 @@ enum Command { #[arg(long)] scheduling: Option, }, + NodeDelete { + #[arg(long)] + node_id: NodeId, + }, /// Modify a tenant's policies in the storage controller TenantPolicy { #[arg(long)] @@ -245,64 +249,6 @@ impl FromStr for NodeAvailabilityArg { } } -struct Client { - base_url: Url, - jwt_token: Option, - client: reqwest::Client, -} - -impl Client { - fn new(base_url: Url, jwt_token: Option) -> Self { - Self { - base_url, - jwt_token, - client: reqwest::ClientBuilder::new() - .build() - .expect("Failed to construct http client"), - } - } - - /// Simple HTTP request wrapper for calling into storage controller - async fn dispatch( - &self, - method: Method, - path: String, - body: Option, - ) -> mgmt_api::Result - where - RQ: Serialize + Sized, - RS: DeserializeOwned + Sized, - { - // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out - // for general purpose API access. - let url = Url::from_str(&format!( - "http://{}:{}/{path}", - self.base_url.host_str().unwrap(), - self.base_url.port().unwrap() - )) - .unwrap(); - - let mut builder = self.client.request(method, url); - if let Some(body) = body { - builder = builder.json(&body) - } - if let Some(jwt_token) = &self.jwt_token { - builder = builder.header( - reqwest::header::AUTHORIZATION, - format!("Bearer {jwt_token}"), - ); - } - - let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?; - let response = response.error_from_body().await?; - - response - .json() - .await - .map_err(pageserver_client::mgmt_api::Error::ReceiveBody) - } -} - #[tokio::main] async fn main() -> anyhow::Result<()> { let cli = Cli::parse(); @@ -337,7 +283,7 @@ async fn main() -> anyhow::Result<()> { } Command::TenantCreate { tenant_id } => { storcon_client - .dispatch( + .dispatch::<_, ()>( Method::POST, "v1/tenant".to_string(), Some(TenantCreateRequest { @@ -357,13 +303,16 @@ async fn main() -> anyhow::Result<()> { tracing::info!("Delete status: {}", status); } Command::Nodes {} => { - let resp = storcon_client + let mut resp = storcon_client .dispatch::<(), Vec>( Method::GET, "control/v1/node".to_string(), None, ) .await?; + + resp.sort_by(|a, b| a.listen_http_addr.cmp(&b.listen_http_addr)); + let mut table = comfy_table::Table::new(); table.set_header(["Id", "Hostname", "Scheduling", "Availability"]); for node in resp { @@ -395,13 +344,16 @@ async fn main() -> anyhow::Result<()> { .await?; } Command::Tenants {} => { - let resp = storcon_client + let mut resp = storcon_client .dispatch::<(), Vec>( Method::GET, "control/v1/tenant".to_string(), None, ) .await?; + + resp.sort_by(|a, b| a.tenant_id.cmp(&b.tenant_id)); + let mut table = comfy_table::Table::new(); table.set_header([ "TenantId", @@ -650,6 +602,11 @@ async fn main() -> anyhow::Result<()> { .dispatch::<(), ()>(Method::POST, format!("debug/v1/node/{node_id}/drop"), None) .await?; } + Command::NodeDelete { node_id } => { + storcon_client + .dispatch::<(), ()>(Method::DELETE, format!("control/v1/node/{node_id}"), None) + .await?; + } Command::TenantSetTimeBasedEviction { tenant_id, period, diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh index f646e36f59..33455e458a 100755 --- a/docker-compose/compute_wrapper/shell/compute.sh +++ b/docker-compose/compute_wrapper/shell/compute.sh @@ -33,7 +33,7 @@ echo $result | jq . generate_id timeline_id PARAMS=( - -sb + -sbf -X POST -H "Content-Type: application/json" -d "{\"new_timeline_id\": \"${timeline_id}\", \"pg_version\": ${PG_VERSION}}" diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index 5503b6611a..6e15fdbe0d 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -31,25 +31,14 @@ services: restart: always image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest} environment: - - BROKER_ENDPOINT='http://storage_broker:50051' - AWS_ACCESS_KEY_ID=minio - AWS_SECRET_ACCESS_KEY=password #- RUST_BACKTRACE=1 ports: #- 6400:6400 # pg protocol handler - 9898:9898 # http endpoints - entrypoint: - - "/bin/sh" - - "-c" - command: - - "/usr/local/bin/pageserver -D /data/.neon/ - -c \"broker_endpoint=$$BROKER_ENDPOINT\" - -c \"listen_pg_addr='0.0.0.0:6400'\" - -c \"listen_http_addr='0.0.0.0:9898'\" - -c \"remote_storage={endpoint='http://minio:9000', - bucket_name='neon', - bucket_region='eu-north-1', - prefix_in_bucket='/pageserver/'}\"" + volumes: + - ./pageserver_config:/data/.neon/ depends_on: - storage_broker - minio_create_buckets diff --git a/docker-compose/pageserver_config/identity.toml b/docker-compose/pageserver_config/identity.toml new file mode 100644 index 0000000000..20121327c7 --- /dev/null +++ b/docker-compose/pageserver_config/identity.toml @@ -0,0 +1 @@ +id=1234 diff --git a/docker-compose/pageserver_config/pageserver.toml b/docker-compose/pageserver_config/pageserver.toml new file mode 100644 index 0000000000..76935453b6 --- /dev/null +++ b/docker-compose/pageserver_config/pageserver.toml @@ -0,0 +1,5 @@ +broker_endpoint='http://storage_broker:50051' +pg_distrib_dir='/usr/local/' +listen_pg_addr='0.0.0.0:6400' +listen_http_addr='0.0.0.0:9898' +remote_storage={ endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/pageserver' } diff --git a/docs/rfcs/034-ancestor-deletion.md b/docs/rfcs/034-ancestor-deletion.md new file mode 100644 index 0000000000..7341d930e2 --- /dev/null +++ b/docs/rfcs/034-ancestor-deletion.md @@ -0,0 +1,252 @@ +# Ancestor Timeline Deletion + +Created on: 2024-02-23 + +Author: John Spray + +# Summary + +When a tenant creates a new timeline that they will treat as their 'main' history, +it is awkward to permanently retain an 'old main' timeline as its ancestor. Currently +this is necessary because it is forbidden to delete a timeline which has descendents. + +A new pageserver API is proposed to 'adopt' data from a parent timeline into +one of its children, such that the link between ancestor and child can be severed, +leaving the parent in a state where it may then be deleted. + +# Motivation + +Retaining parent timelines currently has two costs: + +- Cognitive load on users, who have to remember which is the "real" main timeline. +- Storage capacity cost, as the parent timeline will retain layers up to the + child's timeline point, even if the child fully covers its keyspace with image + layers and will never actually read from the parent. + +# Solution + +A new pageserver API `PUT /v1/tenant/:tenant_id/timeline/:timeline_id/detach_ancestor` +will be added. The `timeline_id` in this URL is that of the _child_ timeline that we +wish to detach from its parent. + +On success, this API will leave the following state: + +- The detached child timeline will no longer have an ancestor, and will contain all + the data needed to service reads without recursing into an ancestor. +- Any other children of the parent whose timeline points were at a lower LSN than + the detached child timeline will be modified to have the child timeline as their + new parent. +- The parent timeline will still exist, but the child will no longer have it as an + ancestor. If this was the last timeline that depended on the parent, then the + parent will become deletable. + +This API's implementation will consist of a series of retryable steps, such that +on failures/timeout it can safely be called again to reach the target state. + +## Example + +### Before + +The user has "rolled back" their project to LSN X, resulting in a "new main" +timeline. The parent "old main" timeline still exists, and they would like +to clean it up. + +They have two other timelines A and B. A is from before the rollback point, +and B is from after the rollback point. + +``` +----"old main" timeline-------X--------------------------------------------> + | | | + |-> child A | | + |-> "new main" timeline | + -> child B + +``` + +### After calling detach ancestor API + +The "new main" timeline is no longer dependent on old main, and neither +is child A, because it had a branch point before X. + +The user may now choose to delete child B and "old main" to get to +a pristine state. Child B is likely to be unwanted since the user +chose to roll back to X, and it branches from after X. However, we +don't assume this in the API; it is up to the user to delete it. + +``` +|----"old main" timeline----------------------------------------------------> + | + | + | + -> child B + +|----"new main" timeline---------> + | + |-> child A + + +``` + +### After removing timelines + +We end up with a totally clean state that leaves no trace that a rollback +ever happened: there is only one root timeline. + +``` +| ----"new main" timeline-----------> + | + |-> child A + + +``` + +## Caveats + +Important things for API users to bear in mind: + +- this API does not delete the parent timeline: you must still do that explicitly. +- if there are other child timelines ahead of the branch point of the detached + child, the parent won't be deletable: you must either delete or detach those + children. +- do _not_ simply loop over all children and detach them all: this can have an + extremely high storage cost. The detach ancestor API is intended for use on a single + timeline to make it the new "main". +- The detach ancestor API should also not be + exposed directly to the user as button/API, because they might decide + to click it for all the children and thereby generate many copies of the + parent's data -- the detach ancestor API should be used as part + of a high level "clean up after rollback" feature. + +## `detach_ancestor` API implementation + +Terms used in the following sections: + +- "the child": the timeline whose ID is specified in the detach ancestor API URL, also + called "new main" in the example. +- "the parent": the parent of "the child". Also called "old main" in the example. +- "the branch point" the ancestor_lsn of "the child" + +### Phase 1: write out adopted layers to S3 + +The child will "adopt" layers from the parent, such that its end state contains +all the parent's history as well as its own. + +For all layers in the parent's layer map whose high LSN is below the branch +point, issue S3 CopyObject requests to duplicate them into the child timeline's +prefix. Do not add them to the child's layer map yet. + +For delta layers in the parent's layer map which straddle the branch point, read them +and write out only content up to the branch point into new layer objects. + +This is a long running operation if the parent has many layers: it should be +implemented in a way that resumes rather than restarting from scratch, if the API +times out and is called again. + +As an optimization, if there are no other timelines that will be adopted into +the child, _and_ the child's image layers already full cover the branch LSN, +then we may skip adopting layers. + +### Phase 2: update the child's index + +Having written out all needed layers in phase 1, atomically link them all +into the child's IndexPart and upload to S3. This may be done while the +child Timeline is still running. + +### Phase 3: modify timelines ancestry + +Modify the child's ancestor to None, and upload its IndexPart to persist the change. + +For all timelines which have the same parent as the child, and have a branch +point lower than our branch point, switch their ancestor_timeline to the child, +and upload their IndexPart to persist the change. + +## Alternatives considered + +### Generate full image layer on child, rather than adopting parent deltas + +This would work for the case of a single child, but would prevent re-targeting +other timelines that depended on the parent. If we detached many children this +way, the storage cost would become prohibitive (consider a 1TB database with +100 child timelines: it would cost 100TiB if they all generated their own image layers). + +### Don't rewrite anything: just fake it in the API + +We could add a layer of indirection that let a child "pretend" that it had no +ancestor, when in reality it still had the parent. The pageserver API could +accept deletion of ancestor timelines, and just update child metadata to make +them look like they have no ancestor. + +This would not achieve the desired reduction in storage cost, and may well be more +complex to maintain than simply implementing the API described in this RFC. + +### Avoid copying objects: enable child index to use parent layers directly + +We could teach IndexPart to store a TimelineId for each layer, such that a child +timeline could reference a parent's layers directly, rather than copying them +into the child's prefix. + +This would impose a cost for the normal case of indices that only target the +timeline's own layers, add complexity, and break the useful simplifying +invariant that timelines "own" their own path. If child timelines were +referencing layers from the parent, we would have to ensure that the parent +never runs GC/compaction again, which would make the API less flexible (the +proposal in this RFC enables deletion of the parent but doesn't require it.) + +## Performance + +### Adopting layers + +- CopyObject is a relatively cheap operation, but we may need to issue tens of thousands + of such requests: this can take up to tens of seconds and will compete for RemoteStorage + semaphore units with other activity on the pageserver. +- If we are running on storage backend that doesn't implement CopyObject, then + this part will be much more expensive as we would stream all layer content + through the pageserver. This is no different to issuing a lot + of reads to a timeline that does not have a warm local cache: it will move + a lot of gigabytes, but that shouldn't break anything. +- Generating truncated layers for delta that straddle the branch point will + require streaming read/write of all the layers in question. + +### Updating timeline ancestry + +The simplest way to update timeline ancestry will probably be to stop and start +all the Timeline objects: this is preferable to the complexity of making their +ancestry mutable at runtime. + +There will be a corresponding "stutter" in the availability of the timelines, +of the order 10-100ms, which is the time taken to upload their IndexPart, and +restart the Timeline. + +# Interaction with other features + +## Concurrent timeline creation + +If new historic timelines are created using the parent as an ancestor while the +detach ancestor API is running, they will not be re-parented to the child. This +doesn't break anything, but it leaves the parent in a state where it might not +be possible to delete it. + +Since timeline creations are an explicit user action, this is not something we need to +worry about as the storage layer: a user who wants to delete their parent timeline will not create +new children, and if they do, they can choose to delete those children to +enable deleting the parent. + +For the least surprise to the user, before starting the detach ancestor branch +operation, the control plane should wait until all branches are created and not +allow any branches to be created before the branch point on the ancestor branch +while the operation is ongoing. + +## WAL based disaster recovery + +WAL based disaster recovery currently supports only restoring of the main +branch. Enabling WAL based disaster recovery in the future requires that we +keep a record which timeline generated the WAL and at which LSN was a parent +detached. Keep a list of timeline ids and the LSN in which they were detached in +the `index_part.json`. Limit the size of the list to 100 first entries, after +which the WAL disaster recovery will not be possible. + +## Sharded tenants + +For sharded tenants, calls to the detach ancestor API will pass through the storage +controller, which will handle them the same as timeline creations: invoke first +on shard zero, and then on all the other shards. diff --git a/docs/rfcs/035-timeline-archive.md b/docs/rfcs/035-timeline-archive.md new file mode 100644 index 0000000000..c834216962 --- /dev/null +++ b/docs/rfcs/035-timeline-archive.md @@ -0,0 +1,507 @@ +# Timeline Archival + +## Summary + +This RFC describes a mechanism for pageservers to eliminate local storage + compute work +for timelines which are not in use, in response to external API calls to "archive" a timeline. + +The archived state roughly corresponds to fully offloading a timeline to object storage, such +that its cost is purely the cost of that object storage. + +## Motivation + +Archived timelines serve multiple purposes: +- Act as a 'snapshot' for workloads that would like to retain restorable copies of their + database from longer ago than their PITR window. +- Enable users to create huge numbers of branches (e.g. one per github PR) without having + to diligently clean them up later to avoid overloading the pageserver (currently we support + up to ~500 branches per tenant). + +### Prior art + +Most storage and database systems have some form of snapshot, which can be implemented several ways: +1. full copies of data (e.g. an EBS snapshot to S3) +2. shallow snapshots which are CoW relative to the original version of the data, e.g. on a typical NFS appliance, or a filesystem like CephFS. +3. a series of snapshots which are CoW or de-duplicated relative to one another. + +Today's Neon branches are approximately like `2.`, although due to implementation details branches +often end up storing much more data than they really need, as parent branches assume that all data +at the branch point is needed. The layers pinned in the parent branch may have a much larger size +than the physical size of a compressed image layer representing the data at the branch point. + +## Requirements + +- Enter & exit the archived state in response to external admin API calls +- API calls to modify the archived state are atomic and durable +- An archived timeline should eventually (once out of PITR window) use an efficient compressed + representation, and avoid retaining arbitrarily large data in its parent branch. +- Remote object GETs during tenant start may be O(N) with the number of _active_ branches, + but must not scale with the number of _archived_ branches. +- Background I/O for archived branches should only be done a limited number of times to evolve them + to a long-term-efficient state (e.g. rewriting to image layers). There should be no ongoing "housekeeping" + overhead for archived branches, including operations related to calculating sizes for billing. +- The pageserver should put no load on the safekeeper for archived branches. +- Performance of un-archiving a branch must make good use of S3/disk bandwidth to restore the branch + to a performant state in a short time (linear with the branch's logical size) + +## Non Goals + +- Archived branches are not a literal `fullbackup` postgres snapshot: they are still stored + in Neon's internal format. +- Compute cold starts after activating an archived branch will not have comparable performance to + cold starts on an active branch. +- Archived branches will not use any new/additional compression or de-duplication beyond what + is already implemented for image layers (zstd per page). +- The pageserver will not "auto start" archived branches in response to page_service API requests: they + are only activated explicitly via the HTTP API. +- We will not implement a total offload of archived timelines from safekeepers: their control file (small) will + remain on local disk, although existing eviction mechanisms will remove any segments from local disk. +- We will not expose any prometheus metrics for archived timelines, or make them visible in any + detailed HTTP APIs other than the specific API for listing archived timelines. +- A parent branch may not be archived unless all its children are. + +## Impacted Components + +pageserver, storage controller + +## Terminology + +**Archived**: a branch is _archived_ when an HTTP API request to archive it has succeeded: the caller +may assume that this branch is now very cheap to store, although this may not be physically so until the +branch proceeds to the offloaded state. + +**Active** branches are branches which are available for use by page_service clients, and have a relatively +high cost due to consuming local storage. + +**Offloaded** branches are a subset of _archived_ branches, which have had their local state removed such +that they now consume minimal runtime resources and have a cost similar to the cost of object storage. + +**Activate** (verb): transition from Archived to Active + +**Archive** (verb): transition from Active to Archived + +**Offload** (verb): transition from Archived to Offloaded + +**Offload manifest**: an object stored in S3 that describes timelines which pageservers do not load. + +**Warm up** (verb): operation done on an active branch, by downloading its active layers. Once a branch is +warmed up, good performance will be available to page_service clients. + +## Implementation + +### High level flow + +We may think of a timeline which is archived and then activated as proceeding through a series of states: + +```mermaid +stateDiagram + [*] --> Active(warm) + Active(warm) --> Archived + Archived --> Offloaded + Archived --> Active(warm) + Offloaded --> Active(cold) + Active(cold) --> Active(warm) +``` + +Note that the transition from Archived to Active(warm) is expected to be fairly rare: the most common lifecycles +of branches will be: +- Very frequent: Short lived branches: Active -> Deleted +- Frequent: Long-lived branches: Active -> Archived -> Offloaded -> Deleted +- Rare: Branches used to restore old state: Active ->Archived -> Offloaded -> Active + +These states are _not_ all stored as a single physical state on the timeline, but rather represent the combination +of: +- the timeline's lifecycle state: active or archived, stored in the timeline's index +- its offload state: whether pageserver has chosen to drop local storage of the timeline and write it into the + manifest of offloaded timelines. +- cache state (whether it's warm or cold). + +### Storage format changes + +There are two storage format changes: +1. `index_part.json` gets a new attribute `state` that describes whether the timeline is to + be considered active or archived. +2. A new tenant-level _manifest_ object `tenant_manifest-v1.json` describes which timelines a tenant does not need to load + at startup (and is available for storing other small, rarely changing tenant-wide attributes in future) + +The manifest object will have a format like this: +``` +{ + "offload_timelines": [ + { + "timeline_id": ... + "last_record_lsn": ... + "last_record_lsn_time": ... + "pitr_interval": ... + "last_gc_lsn": ... # equal to last_record_lsn if this branch has no history (i.e. a snapshot) + "logical_size": ... # The size at last_record_lsn + "physical_size" ... + "parent": Option<{ + "timeline_id"... + "lsn"... # Branch point LSN on the parent + "requires_data": bool # True if this branch depends on layers in its parent, identify it here + + }> + } + ] +} +``` + +The information about a timeline in its offload state is intentionally minimal: just enough to decide: +- Whether it requires [archive optimization](#archive-branch-optimization) by rewriting as a set of image layers: we may infer this + by checking if now > last_record_lsn_time - pitr_interval, and pitr_lsn < last_record_lsn. +- Whether a parent branch should include this offloaded branch in its GC inputs to avoid removing + layers that the archived branch depends on +- Whether requests to delete this `timeline_id` should be executed (i.e. if a deletion request + is received for a timeline_id that isn't in the site of live `Timelines` or in the manifest, then + we don't need to go to S3 for the deletion. +- How much archived space to report in consumption metrics + +The contents of the manifest's offload list will also be stored as an attribute of `Tenant`, such that the total +set of timelines may be found by the union of `Tenant::timelines` (non-offloaded timelines) and `Tenant::offloaded` +(offloaded timelines). + +For split-brain protection, the manifest object will be written with a generation suffix, in the same way as +index_part objects are (see [generation numbers RFC](025-generation-numbers.md)). This will add some complexity, but +give us total safety against two pageservers with the same tenant attached fighting over the object. Existing code +for finding the latest generation and for cleaning up old generations (in the scrubber) will be generalized to cover +the manifest file. + +### API & Timeline state + +Timelines will store a lifecycle state (enum of Active or Archived) in their IndexPart. This will +be controlled by a new per-timeline `configure` endpoint. This is intentionally generic naming, which +may be used in future to control other per-timeline attributes (e.g. in future we may make PITR interval +a per-timeline configuration). + +`PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configure` +``` +{ + 'state': 'active|archive' +} +``` + +When archiving a timeline, this API will complete as soon as the timeline's state has been set in index_part, and that index has been uploaded. + +When activating a timeline, this API will complete as soon as the timeline's state has been set in index_part, +**and** the `Timeline` object has been instantiated and activated. This will require reading the timeline's +index, but not any data: it should be about as fast as a couple of small S3 requests. + +The API will be available with identical path via the storage controller: calling this on a sharded tenant +will simply map the API call to all the shards. + +Archived timelines may never have descendent timelines which are active. This will be enforced at the API level, +such that activating a timeline requires that all its ancestors are active, and archiving a timeline requires +that all its descendents are archived. It is the callers responsibility to walk the hierarchy of timelines +in the proper order if they would like to archive whole trees of branches. + +Because archive timelines will be excluded from the usual timeline listing APIs, a new API specifically +for archived timelines will be added: this is for use in support/debug: + +``` +GET /v1/tenants/{tenant_id}/archived_timelines + +{ + ...same per-timeline content as the tenant manifest... +} + +``` + +### Tenant attach changes + +Currently, during Tenant::spawn we list all the timelines in the S3 bucket, and then for each timeline +we load their index_part.json. To avoid the number of GETs scaling linearly with the number of archived +timelines, we must have a single object that tells us which timelines do not need to be loaded. The +number of ListObjects requests while listing timelines will still scale O(N), but this is less problematic +because each request covers 1000 timelines. + +This is **not** literally the same as the set of timelines who have state=archived. Rather, it is +the set of timelines which have been offloaded in the background after their state was set to archived. + +We may simply skip loading these timelines: there will be no special state of `Timeline`, they just won't +exist from the perspective of an active `Tenant` apart from in deletion: timeline deletion will need +to check for offloaded timelines as well as active timelines, to avoid wrongly returning 404 on trying +to delete an offloaded timeline. + +### Warm-up API + +`PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/download?wait_ms=1234` + +This API will be similar to the existing `download_remote_layers` API, but smarter: +- It will not download _all_ remote layers, just the visible set (i.e. layers needed for a read) +- It will download layers in the visible set until reaching `wait_ms`, then return a struct describing progress + of downloads, so that the caller can poll. + +The _visible set_ mentioned above will be calculated by the pageserver in the background, by taking the set +of readable LSNs (i.e. branch points and heads of branches), and walking the layer map to work out which layers +can possibly be read from these LSNs. This concept of layer visibility is more generally useful for cache +eviction and heatmaps, as well as in this specific case of warming up a timeline. + +The caller does not have to wait for the warm up API, or call it at all. But it is strongly advised +to call it, because otherwise populating local contents for a timeline can take a long time when waiting +for SQL queries to coincidentally hit all the layers, and during that time query latency remains quite +volatile. + +### Background work + +Archived branches are not subject to normal compaction. Instead, when the compaction loop encounters +an archived branch, it will consider rewriting the branch to just image layers if the branch has no history +([archive branch optimization](#archive-branch-optimization)), or offloading the timeline from local disk +if its state permits that. + +Additionally, the tenant compaction task will walk the state of already offloaded timelines to consider +optimizing their storage, e.g. if a timeline had some history when offloaded, but since then its PITR +has elapsed and it can now be rewritten to image layers. + +#### Archive branch offload + +Recall that when we archive a timeline via the HTTP API, this only sets a state: it doesn't do +any actual work. + +This work is done in the background compaction loop. It makes sense to tag this work on to the compaction +loop, because it is spiritually aligned: offloading data for archived branches improves storage efficiency. + +The condition for offload is simple: + - a `Timeline` object exists with state `Archived` + - the timeline does not have any non-offloaded children. + + Regarding the condition that children must be offloaded, this will always be eventually true, because + we enforce at the API level that children of archived timelines must themselves be archived, and all + archived timelines will eventually be offloaded. + +Offloading a timeline is simple: +- Read the timeline's attributes that we will store in its offloaded state (especially its logical size) +- Call `shutdown()` on the timeline and remove it from the `Tenant` (as if we were about to delete it) +- Erase all the timeline's content from local storage (`remove_dir_all` on its path) +- Write the tenant manifest to S3 to prevent this timeline being loaded on next start. + +#### Archive branch optimization (flattening) + +When we offloaded a branch, it might have had some history that prevented rewriting it to a single +point in time set of image layers. For example, a branch might have several days of writes and a 7 +day PITR: when we archive it, it still has those days of history. + +Once the PITR has expired, we have an opportunity to reduce the physical footprint of the branch by: +- Writing compressed image layers within the archived branch, as these are more efficient as a way of storing + a point in time compared with delta layers +- Updating the branch's offload metadata to indicate that this branch no longer depends on its ancestor + for data, i.e. the ancestor is free to GC layers files at+below the branch point + +Fully compacting an archived branch into image layers at a single LSN may be thought of as *flattening* the +branch, such that it is now a one-dimensional keyspace rather than a two-dimensional key/lsn space. It becomes +a true snapshot at that LSN. + +It is not always more efficient to flatten a branch than to keep some extra history on the parent: this +is described in more detail in [optimizations](#delaying-storage-optimization-if-retaining-parent-layers-is-cheaper) + +Archive branch optimization should be done _before_ background offloads during compaction, because there may +be timelines which are ready to be offloaded but also would benefit from the optimization step before +being offloaded. For example, a branch which has already fallen out of PITR window and has no history +of its own may be immediately re-written as a series of image layers before being offloaded. + +### Consumption metrics + +Archived timelines and offloaded timelines will be excluded from the synthetic size calculation, in anticipating +that billing structures based on consumption metrics are highly likely to apply different $/GB rates to archived +vs. ordinary content. + +Archived and offloaded timelines' logical size will be reported under the existing `timeline_logical_size` +variant of `MetricsKey`: receivers are then free to bill on this metric as they please. + +### Secondary locations + +Archived timelines (including offloaded timelines) will be excluded from heatmaps, and thereby +when a timeline is archived, after the next cycle of heatmap upload & secondary download, its contents +will be dropped from secondary locations. + +### Sharding + +Archiving or activating a timeline will be done symmetrically across all shards in a tenant, in +the same way that timeline creation and deletion is done. There are no special rules about ordering: +the storage controller may dispatch concurrent calls to all shards when archiving or activating a timeline. + +Since consumption metrics are only transmitted from shard zero, the state of archival on this shard +will be authoritative for consumption metrics. + +## Error cases + +### Errors in sharded tenants + +If one shard in a tenant fails an operation but others succeed, the tenant may end up in a mixed +state, where a timeline is archived on some shards but not on others. + +We will not bother implementing a rollback mechanism for this: errors in archiving/activating a timeline +are either transient (e.g. S3 unavailable, shutting down), or the fault of the caller (NotFound, BadRequest). +In the transient case callers are expected to retry until success, or to make appropriate API calls to clear +up their mistake. We rely on this good behavior of callers to eventually get timelines into a consistent +state across all shards. If callers do leave a timeline in an inconsistent state across shards, this doesn't +break anything, it's just "weird". + +This is similar to the status quo for timeline creation and deletion: callers are expected to retry +these operations until they succeed. + +### Archiving/activating + +Archiving/activating a timeline can fail in a limited number of ways: +1. I/O error storing/reading the timeline's updated index + - These errors are always retryable: a fundamental design assumption of the pageserver is that remote + storage errors are always transient. +2. NotFound if the timeline doesn't exist + - Callers of the API are expected to avoid calling deletion and archival APIs concurrently. + - The storage controller has runtime locking to prevent races such as deleting a timeline while + archiving it. +3. BadRequest if the rules around ancestors/descendents of archived timelines would be violated + - Callers are expected to do their own checks to avoid hitting this case. If they make + a mistake and encounter this error, they should give up. + +### Offloading + +Offloading can only fail if remote storage is unavailable, which would prevent us from writing the +tenant manifest. In such error cases, we give up in the expectation that offloading will be tried +again at the next iteration of the compaction loop. + +### Archive branch optimization + +Optimization is a special form of compaction, so can encounter all the same errors as regular compaction +can: it should return Result<(), CompactionError>, and as with compaction it will be retried on +the next iteration of the compaction loop. + +## Optimizations + +### Delaying storage optimization if retaining parent layers is cheaper + +Optimizing archived branches to image layers and thereby enabling parent branch GC to progress +is a safe default: archived branches cannot over-fill a pageserver's local disk, and once they +are offloaded to S3 they're totally safe, inert things. + +However, in some cases it can be advantageous to retain extra history on their parent branch rather +than flattening the archived branch. For example, if a 1TB parent branch is rather slow-changing (1GB +of data per day), and archive branches are being created nightly, then writing out full 1TB image layers +for each nightly branch is inefficient compared with just keeping more history on the main branch. + +Getting this right requires consideration of: +- Compaction: if keeping more history on the main branch is going to prompt the main branch's compaction to + write out extra image layers, then it might make more sense to just write out the image layers on + the archived branch. +- Metadata bloat: keeping extra history on a parent branch doesn't just cost GB of storage, it makes + the layer map (and index_part) bigger. There are practical limits beyond which writing an indefinitely + large layer map can cause problems elsewhere. + +This optimization can probably be implemented quite cheaply with some basic heuristics like: +- don't bother doing optimization on an archive branch if the LSN distance between + its branch point and the end of the PITR window is <5% of the logical size of the archive branch. +- ...but, Don't keep more history on the main branch than double the PITR + +### Creating a timeline in archived state (a snapshot) + +Sometimes, one might want to create a branch with no history, which will not be written to +before it is archived. This is a snapshot, although we do not require a special snapshot API, +since a snapshot can be represented as a timeline with no history. + +This can be accomplished by simply creating a timeline and then immediately archiving it, but +that is somewhat wasteful: this timeline it will spin up various tasks and open a connection to the storage +broker to try and ingest WAL, before being shutdown in the subsequent archival call. To explicitly +support this common special case, we may add a parameter to the timeline creation API which +creates a timeline directly into the archived state. + +Such a timeline creation will do exactly two I/Os at creation time: +- write the index_part object to record the timeline's existence +- when the timeline is offloaded in the next iteration of the compaction loop (~20s later), + write the tenant manifest. + +Later, when the timeline falls off the end of the PITR interval, the usual offload logic will wake +up the 'snapshot' branch and write out image layers. + +## Future Work + +### Enabling `fullbackup` dumps from archive branches + +It would be useful to be able to export an archive branch to another system, or for use in a local +postgres database. + +This could be implemented as a general capability for all branches, in which case it would "just work" +for archive branches by activating them. However, downloading all the layers in a branch just to generate +a fullbackup is a bit inefficient: we could implement a special case for flattened archived branches +which streams image layers from S3 and outputs the fullbackup stream without writing the layers out to disk. + +Implementing `fullbackup` is a bit more complicated than this because of sharding, but solving that problem +is unrelated to the topic of archived branches (it probably involves having each shard write out a fullbackup +stream to S3 in an intermediate format and, then having one node stitch them together). + +### Tagging layers from archived branches + +When we know a layer is an image layer written for an archived branch that has fallen off the PITR window, +we may add tags to the S3 objects to enable writing lifecycle policies that transition such layers to even +cheaper storage. + +This could be done for all archived layers, or it could be driven by the archival API, to give the pageserver +external hints on which branches are likely to be reactivated, and which branches are good candidates for +tagging for low performance storage. + +Tagging+lifecycles is just one mechanism: one might also directly use S3 storage classes. Other clouds' object +stores have similar mechanisms. + +### Storing sequences of archive branches as deltas + +When archived branches are used as scheduled snapshots, we could store them even more efficiently +by encoding them as deltas relative to each other (i.e. for nightly snapshots, when we do the +storage optimization for Tuesday's snapshot, we would read Monday's snapshot and store only the modified +pages). This is the kind of encoding that many backup storage systems use. + +The utility of this depends a lot on the churn rate of the data, and the cost of doing the delta encoding +vs. just writing out a simple stream of the entire database. For smaller databases, writing out a full +copy is pretty trivial (e.g. writing a compressed copy of a 10GiB database to S3 can take under 10 seconds, +so the complexity tradeoff of diff-encoding it is dubious). + +One does not necessarily have to read-back the previous snapshot in order to encoded the next one: if the +pageserver knows about the schedule, it can intentionally retain extra history on the main branch so that +we can say: "A branch exists from Monday night. I have Monday night's data still active in the main branch, +so now I can read at the Monday LSN and the Tuesday LSN, calculate the delta, and store it as Tuesday's +delta snapshot". + +Clearly this all requires careful housekeeping to retain the relationship between branches that depend on +each other: perhaps this would be done by making the archive branches have child/parent relationships with +each other, or perhaps we would permit them to remain children of their original parent, but additionally +have a relationship with the snapshot they're encoded relative to. + +Activating a branch that is diff-encoded may require activating several earlier branches too, so figuring +out how frequently to write a full copy is important. This is essentially a zoomed-out version of what +we do with delta layers and image layers within a timeline, except each "layer" is a whole timeline. + + +## FAQ/Alternatives + +### Store all timelines in the tenant manifest + +Rather than special-casing offloaded timelines in the offload manifest, we could store a total +manifest of all timelines, eliminating the need for the pageserver to list timelines in S3 on +startup. + +That would be a more invasive change (require hooking in to timeline creation), and would +generate much more I/O to this manifest for tenants that had many branches _and_ frequent +create/delete cycles for short lived branches. Restricting the manifest to offloaded timelines +means that we only have to cope with the rate at which long-lived timelines are archived, rather +than the rate at which sort lived timelines are created & destroyed. + +### Automatically archiving/activating timelines without external API calls + +We could implement TTL driven offload of timelines, waking them up when a page request +arrives. + +This has downsides: +- Opacity: if we do TTL-driven offload inside the pageserver, then the end user doesn't + know which of their branches are in this state, and might get a surprise when they try + to use such a branch. +- Price fluctuation: if the archival of a branch is used in end user pricing, then users + prefer clarity & consistency. Ideally a branch's storage should cost the same from the moment it + is created, rather than having a usage-dependency storage price. +- Complexity: enabling the page service to call up into the Tenant to activate a timeline + would be awkward, compared with an external entry point. + +### Make offloaded a state of Timeline + +To reduce the operator-facing complexity of having some timelines APIs that only return +non-offloaded timelines, we could build the offloaded state into the Timeline type. + +`timeline.rs` is already one of the most egregiously long source files in the tree, so +this is rejected on the basis that we need to avoid making that complexity worse. \ No newline at end of file diff --git a/docs/storage_controller.md b/docs/storage_controller.md index daf4d0c8b7..6d2ef929a4 100644 --- a/docs/storage_controller.md +++ b/docs/storage_controller.md @@ -44,7 +44,7 @@ If you need to modify the database schema, here’s how to create a migration: - Use `diesel migration generate ` to create a new migration - Populate the SQL files in the `migrations/` subdirectory - Use `DATABASE_URL=... diesel migration run` to apply the migration you just wrote: this will update the `[schema.rs](http://schema.rs)` file automatically. - - This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/attachment_service` + - This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/storage_controller` - Commit the migration files and the changes to schema.rs - If you need to iterate, you can rewind migrations with `diesel migration revert -a` and then `diesel migration run` again. - The migrations are build into the storage controller binary, and automatically run at startup after it is deployed, so once you’ve committed a migration no further steps are needed. diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index f05c1315ea..d0e1eb6b28 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -87,7 +87,7 @@ pub struct TenantLocateResponse { pub shard_params: ShardParameters, } -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Debug)] pub struct TenantDescribeResponse { pub tenant_id: TenantId, pub shards: Vec, @@ -110,7 +110,7 @@ pub struct NodeDescribeResponse { pub listen_pg_port: u16, } -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Debug)] pub struct TenantDescribeResponseShard { pub tenant_shard_id: TenantShardId, diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index d360cc6e87..591c45d908 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -5,7 +5,6 @@ pub mod utilization; pub use utilization::PageserverUtilization; use std::{ - borrow::Cow, collections::HashMap, io::{BufRead, Read}, num::{NonZeroU64, NonZeroUsize}, @@ -20,7 +19,6 @@ use serde::{Deserialize, Serialize}; use serde_with::serde_as; use utils::{ completion, - history_buffer::HistoryBufferWithDropCounter, id::{NodeId, TenantId, TimelineId}, lsn::Lsn, serde_system_time, @@ -294,7 +292,6 @@ pub struct TenantConfig { pub walreceiver_connect_timeout: Option, pub lagging_wal_timeout: Option, pub max_lsn_wal_lag: Option, - pub trace_read_requests: Option, pub eviction_policy: Option, pub min_resident_size_override: Option, pub evictions_low_residence_duration_metric_threshold: Option, @@ -652,6 +649,17 @@ pub struct TenantDetails { pub timelines: Vec, } +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Copy, Debug)] +pub enum TimelineArchivalState { + Archived, + Unarchived, +} + +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)] +pub struct TimelineArchivalConfigRequest { + pub state: TimelineArchivalState, +} + /// This represents the output of the "timeline_detail" and "timeline_list" API calls. #[derive(Debug, Serialize, Deserialize, Clone)] pub struct TimelineInfo { @@ -716,58 +724,7 @@ pub struct LayerMapInfo { pub historic_layers: Vec, } -#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy, Serialize, Deserialize, enum_map::Enum)] -#[repr(usize)] -pub enum LayerAccessKind { - GetValueReconstructData, - Iter, - KeyIter, - Dump, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct LayerAccessStatFullDetails { - pub when_millis_since_epoch: u64, - pub task_kind: Cow<'static, str>, - pub access_kind: LayerAccessKind, -} - -/// An event that impacts the layer's residence status. -#[serde_as] -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct LayerResidenceEvent { - /// The time when the event occurred. - /// NB: this timestamp is captured while the residence status changes. - /// So, it might be behind/ahead of the actual residence change by a short amount of time. - /// - #[serde(rename = "timestamp_millis_since_epoch")] - #[serde_as(as = "serde_with::TimestampMilliSeconds")] - pub timestamp: SystemTime, - /// The new residence status of the layer. - pub status: LayerResidenceStatus, - /// The reason why we had to record this event. - pub reason: LayerResidenceEventReason, -} - -/// The reason for recording a given [`LayerResidenceEvent`]. -#[derive(Debug, Clone, Copy, Serialize, Deserialize)] -pub enum LayerResidenceEventReason { - /// The layer map is being populated, e.g. during timeline load or attach. - /// This includes [`RemoteLayer`] objects created in [`reconcile_with_remote`]. - /// We need to record such events because there is no persistent storage for the events. - /// - // https://github.com/rust-lang/rust/issues/74481 - /// [`RemoteLayer`]: ../../tenant/storage_layer/struct.RemoteLayer.html - /// [`reconcile_with_remote`]: ../../tenant/struct.Timeline.html#method.reconcile_with_remote - LayerLoad, - /// We just created the layer (e.g., freeze_and_flush or compaction). - /// Such layers are always [`LayerResidenceStatus::Resident`]. - LayerCreate, - /// We on-demand downloaded or evicted the given layer. - ResidenceChange, -} - -/// The residence status of the layer, after the given [`LayerResidenceEvent`]. +/// The residence status of a layer #[derive(Debug, Clone, Copy, Serialize, Deserialize)] pub enum LayerResidenceStatus { /// Residence status for a layer file that exists locally. @@ -777,23 +734,16 @@ pub enum LayerResidenceStatus { Evicted, } -impl LayerResidenceEvent { - pub fn new(status: LayerResidenceStatus, reason: LayerResidenceEventReason) -> Self { - Self { - status, - reason, - timestamp: SystemTime::now(), - } - } -} - +#[serde_as] #[derive(Debug, Clone, Serialize, Deserialize)] pub struct LayerAccessStats { - pub access_count_by_access_kind: HashMap, - pub task_kind_access_flag: Vec>, - pub first: Option, - pub accesses_history: HistoryBufferWithDropCounter, - pub residence_events_history: HistoryBufferWithDropCounter, + #[serde_as(as = "serde_with::TimestampMilliSeconds")] + pub access_time: SystemTime, + + #[serde_as(as = "serde_with::TimestampMilliSeconds")] + pub residence_time: SystemTime, + + pub visible: bool, } #[derive(Debug, Clone, Serialize, Deserialize)] diff --git a/libs/pageserver_api/src/models/detach_ancestor.rs b/libs/pageserver_api/src/models/detach_ancestor.rs index fc1f10e734..ae5a21bab9 100644 --- a/libs/pageserver_api/src/models/detach_ancestor.rs +++ b/libs/pageserver_api/src/models/detach_ancestor.rs @@ -1,6 +1,6 @@ use utils::id::TimelineId; -#[derive(Default, serde::Serialize)] +#[derive(Debug, Default, PartialEq, serde::Serialize, serde::Deserialize)] pub struct AncestorDetached { pub reparented_timelines: Vec, } diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 23d82b90bd..414bce1b26 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -7,6 +7,7 @@ license.workspace = true [dependencies] anyhow.workspace = true async-trait.workspace = true +async-stream.workspace = true once_cell.workspace = true aws-smithy-async.workspace = true aws-smithy-types.workspace = true diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 8e590b17c4..acd95a5255 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -15,7 +15,7 @@ use std::time::SystemTime; use super::REMOTE_STORAGE_PREFIX_SEPARATOR; use anyhow::Result; use azure_core::request_options::{MaxResults, Metadata, Range}; -use azure_core::RetryOptions; +use azure_core::{Continuable, RetryOptions}; use azure_identity::DefaultAzureCredential; use azure_storage::StorageCredentials; use azure_storage_blobs::blob::CopyStatus; @@ -40,6 +40,7 @@ use crate::{ pub struct AzureBlobStorage { client: ContainerClient, + container_name: String, prefix_in_container: Option, max_keys_per_list_response: Option, concurrency_limiter: ConcurrencyLimiter, @@ -85,6 +86,7 @@ impl AzureBlobStorage { Ok(AzureBlobStorage { client, + container_name: azure_config.container_name.to_owned(), prefix_in_container: azure_config.prefix_in_container.to_owned(), max_keys_per_list_response, concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()), @@ -238,6 +240,10 @@ impl AzureBlobStorage { _ = cancel.cancelled() => Err(Cancelled), } } + + pub fn container_name(&self) -> &str { + &self.container_name + } } fn to_azure_metadata(metadata: StorageMetadata) -> Metadata { @@ -261,30 +267,30 @@ fn to_download_error(error: azure_core::Error) -> DownloadError { } impl RemoteStorage for AzureBlobStorage { - async fn list( + fn list_streaming( &self, prefix: Option<&RemotePath>, mode: ListingMode, max_keys: Option, cancel: &CancellationToken, - ) -> anyhow::Result { - let _permit = self.permit(RequestKind::List, cancel).await?; + ) -> impl Stream> { + // get the passed prefix or if it is not set use prefix_in_bucket value + let list_prefix = prefix + .map(|p| self.relative_path_to_name(p)) + .or_else(|| self.prefix_in_container.clone()) + .map(|mut p| { + // required to end with a separator + // otherwise request will return only the entry of a prefix + if matches!(mode, ListingMode::WithDelimiter) + && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) + { + p.push(REMOTE_STORAGE_PREFIX_SEPARATOR); + } + p + }); - let op = async { - // get the passed prefix or if it is not set use prefix_in_bucket value - let list_prefix = prefix - .map(|p| self.relative_path_to_name(p)) - .or_else(|| self.prefix_in_container.clone()) - .map(|mut p| { - // required to end with a separator - // otherwise request will return only the entry of a prefix - if matches!(mode, ListingMode::WithDelimiter) - && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) - { - p.push(REMOTE_STORAGE_PREFIX_SEPARATOR); - } - p - }); + async_stream::stream! { + let _permit = self.permit(RequestKind::List, cancel).await?; let mut builder = self.client.list_blobs(); @@ -300,21 +306,43 @@ impl RemoteStorage for AzureBlobStorage { builder = builder.max_results(MaxResults::new(limit)); } - let response = builder.into_stream(); - let response = response.into_stream().map_err(to_download_error); - let response = tokio_stream::StreamExt::timeout(response, self.timeout); - let response = response.map(|res| match res { - Ok(res) => res, - Err(_elapsed) => Err(DownloadError::Timeout), - }); + let mut next_marker = None; - let mut response = std::pin::pin!(response); + 'outer: loop { + let mut builder = builder.clone(); + if let Some(marker) = next_marker.clone() { + builder = builder.marker(marker); + } + let response = builder.into_stream(); + let response = response.into_stream().map_err(to_download_error); + let response = tokio_stream::StreamExt::timeout(response, self.timeout); + let response = response.map(|res| match res { + Ok(res) => res, + Err(_elapsed) => Err(DownloadError::Timeout), + }); - let mut res = Listing::default(); + let mut response = std::pin::pin!(response); - let mut max_keys = max_keys.map(|mk| mk.get()); - while let Some(entry) = response.next().await { - let entry = entry?; + let mut max_keys = max_keys.map(|mk| mk.get()); + let next_item = tokio::select! { + op = response.next() => Ok(op), + _ = cancel.cancelled() => Err(DownloadError::Cancelled), + }?; + let Some(entry) = next_item else { + // The list is complete, so yield it. + break; + }; + + let mut res = Listing::default(); + let entry = match entry { + Ok(entry) => entry, + Err(e) => { + // The error is potentially retryable, so we must rewind the loop after yielding. + yield Err(e); + continue; + } + }; + next_marker = entry.continuation(); let prefix_iter = entry .blobs .prefixes() @@ -333,19 +361,19 @@ impl RemoteStorage for AzureBlobStorage { assert!(mk > 0); mk -= 1; if mk == 0 { - return Ok(res); // limit reached + yield Ok(res); // limit reached + break 'outer; } max_keys = Some(mk); } } + yield Ok(res); + + // We are done here + if next_marker.is_none() { + break; + } } - - Ok(res) - }; - - tokio::select! { - res = op => res, - _ = cancel.cancelled() => Err(DownloadError::Cancelled), } } diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index d440c03a0e..0fed86f4b8 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -26,7 +26,7 @@ use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use bytes::Bytes; -use futures::stream::Stream; +use futures::{stream::Stream, StreamExt}; use serde::{Deserialize, Serialize}; use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; @@ -160,13 +160,18 @@ pub struct Listing { /// providing basic CRUD operations for storage files. #[allow(async_fn_in_trait)] pub trait RemoteStorage: Send + Sync + 'static { - /// List objects in remote storage, with semantics matching AWS S3's ListObjectsV2. - /// (see ``) + /// List objects in remote storage, with semantics matching AWS S3's [`ListObjectsV2`]. + /// + /// The stream is guaranteed to return at least one element, even in the case of errors + /// (in that case it's an `Err()`), or an empty `Listing`. + /// + /// The stream is not ending if it returns an error, as long as [`is_permanent`] returns false on the error. + /// The `next` function can be retried, and maybe in a future retry, there will be success. /// /// Note that the prefix is relative to any `prefix_in_bucket` configured for the client, not /// from the absolute root of the bucket. /// - /// `mode` configures whether to use a delimiter. Without a delimiter all keys + /// `mode` configures whether to use a delimiter. Without a delimiter, all keys /// within the prefix are listed in the `keys` of the result. With a delimiter, any "directories" at the top level of /// the prefix are returned in the `prefixes` of the result, and keys in the top level of the prefix are /// returned in `keys` (). @@ -175,13 +180,32 @@ pub trait RemoteStorage: Send + Sync + 'static { /// will iteratively call listobjects until it runs out of keys. Note that this is not safe to use on /// unlimted size buckets, as the full list of objects is allocated into a monolithic data structure. /// + /// [`ListObjectsV2`]: + /// [`is_permanent`]: DownloadError::is_permanent + fn list_streaming( + &self, + prefix: Option<&RemotePath>, + mode: ListingMode, + max_keys: Option, + cancel: &CancellationToken, + ) -> impl Stream>; + async fn list( &self, prefix: Option<&RemotePath>, - _mode: ListingMode, + mode: ListingMode, max_keys: Option, cancel: &CancellationToken, - ) -> Result; + ) -> Result { + let mut stream = std::pin::pin!(self.list_streaming(prefix, mode, max_keys, cancel)); + let mut combined = stream.next().await.expect("At least one item required")?; + while let Some(list) = stream.next().await { + let list = list?; + combined.keys.extend_from_slice(&list.keys); + combined.prefixes.extend_from_slice(&list.prefixes); + } + Ok(combined) + } /// Streams the local file contents into remote into the remote storage entry. /// @@ -288,8 +312,8 @@ impl Debug for Download { /// Every storage, currently supported. /// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics. -#[derive(Clone)] // Require Clone for `Other` due to https://github.com/rust-lang/rust/issues/26925 +#[derive(Clone)] pub enum GenericRemoteStorage> { LocalFs(LocalFs), AwsS3(Arc), @@ -298,13 +322,14 @@ pub enum GenericRemoteStorage> { } impl GenericRemoteStorage> { + // See [`RemoteStorage::list`]. pub async fn list( &self, prefix: Option<&RemotePath>, mode: ListingMode, max_keys: Option, cancel: &CancellationToken, - ) -> anyhow::Result { + ) -> Result { match self { Self::LocalFs(s) => s.list(prefix, mode, max_keys, cancel).await, Self::AwsS3(s) => s.list(prefix, mode, max_keys, cancel).await, @@ -313,6 +338,23 @@ impl GenericRemoteStorage> { } } + // See [`RemoteStorage::list_streaming`]. + pub fn list_streaming<'a>( + &'a self, + prefix: Option<&'a RemotePath>, + mode: ListingMode, + max_keys: Option, + cancel: &'a CancellationToken, + ) -> impl Stream> + 'a { + match self { + Self::LocalFs(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)) + as Pin>>>, + Self::AwsS3(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)), + Self::AzureBlob(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)), + Self::Unreliable(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)), + } + } + /// See [`RemoteStorage::upload`] pub async fn upload( &self, @@ -443,7 +485,7 @@ impl GenericRemoteStorage> { } impl GenericRemoteStorage { - pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result { + pub async fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result { let timeout = storage_config.timeout; Ok(match &storage_config.storage { RemoteStorageKind::LocalFs { local_path: path } => { @@ -458,7 +500,7 @@ impl GenericRemoteStorage { std::env::var("AWS_ACCESS_KEY_ID").unwrap_or_else(|_| "".into()); info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}", s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint); - Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout)?)) + Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout).await?)) } RemoteStorageKind::AzureContainer(azure_config) => { let storage_account = azure_config @@ -504,6 +546,16 @@ impl GenericRemoteStorage { None => self.download(from, cancel).await, } } + + /// The name of the bucket/container/etc. + pub fn bucket_name(&self) -> Option<&str> { + match self { + Self::LocalFs(_s) => None, + Self::AwsS3(s) => Some(s.bucket_name()), + Self::AzureBlob(s) => Some(s.container_name()), + Self::Unreliable(_s) => None, + } + } } /// Extra set of key-value pairs that contain arbitrary metadata about the storage entry. diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 1f7bcfc982..a4857b0bba 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -331,6 +331,17 @@ impl LocalFs { } impl RemoteStorage for LocalFs { + fn list_streaming( + &self, + prefix: Option<&RemotePath>, + mode: ListingMode, + max_keys: Option, + cancel: &CancellationToken, + ) -> impl Stream> { + let listing = self.list(prefix, mode, max_keys, cancel); + futures::stream::once(listing) + } + async fn list( &self, prefix: Option<&RemotePath>, diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index ef1bd2c047..90ed48e06c 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -16,16 +16,10 @@ use std::{ use anyhow::{anyhow, Context as _}; use aws_config::{ - environment::credentials::EnvironmentVariableCredentialsProvider, - imds::credentials::ImdsCredentialsProvider, - meta::credentials::CredentialsProviderChain, - profile::ProfileFileCredentialsProvider, - provider_config::ProviderConfig, + default_provider::credentials::DefaultCredentialsChain, retry::{RetryConfigBuilder, RetryMode}, - web_identity_token::WebIdentityTokenCredentialsProvider, BehaviorVersion, }; -use aws_credential_types::provider::SharedCredentialsProvider; use aws_sdk_s3::{ config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep}, error::SdkError, @@ -76,40 +70,27 @@ struct GetObjectRequest { } impl S3Bucket { /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided. - pub fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result { + pub async fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result { tracing::debug!( "Creating s3 remote storage for S3 bucket {}", remote_storage_config.bucket_name ); - let region = Some(Region::new(remote_storage_config.bucket_region.clone())); + let region = Region::new(remote_storage_config.bucket_region.clone()); + let region_opt = Some(region.clone()); - let provider_conf = ProviderConfig::without_region().with_region(region.clone()); - - let credentials_provider = { - // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" - CredentialsProviderChain::first_try( - "env", - EnvironmentVariableCredentialsProvider::new(), - ) - // uses "AWS_PROFILE" / `aws sso login --profile ` - .or_else( - "profile-sso", - ProfileFileCredentialsProvider::builder() - .configure(&provider_conf) - .build(), - ) - // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME" - // needed to access remote extensions bucket - .or_else( - "token", - WebIdentityTokenCredentialsProvider::builder() - .configure(&provider_conf) - .build(), - ) - // uses imds v2 - .or_else("imds", ImdsCredentialsProvider::builder().build()) - }; + // https://docs.aws.amazon.com/sdkref/latest/guide/standardized-credentials.html + // https://docs.rs/aws-config/latest/aws_config/default_provider/credentials/struct.DefaultCredentialsChain.html + // Incomplete list of auth methods used by this: + // * "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" + // * "AWS_PROFILE" / `aws sso login --profile ` + // * "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME" + // * http (ECS/EKS) container credentials + // * imds v2 + let credentials_provider = DefaultCredentialsChain::builder() + .region(region) + .build() + .await; // AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off let sleep_impl: Arc = Arc::new(TokioSleep::new()); @@ -118,9 +99,9 @@ impl S3Bucket { #[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */ BehaviorVersion::v2023_11_09(), ) - .region(region) + .region(region_opt) .identity_cache(IdentityCache::lazy().build()) - .credentials_provider(SharedCredentialsProvider::new(credentials_provider)) + .credentials_provider(credentials_provider) .sleep_impl(SharedAsyncSleep::from(sleep_impl)); let sdk_config: aws_config::SdkConfig = std::thread::scope(|s| { @@ -405,6 +386,10 @@ impl S3Bucket { } Ok(()) } + + pub fn bucket_name(&self) -> &str { + &self.bucket_name + } } pin_project_lite::pin_project! { @@ -482,17 +467,16 @@ impl>> Stream for TimedDownload { } impl RemoteStorage for S3Bucket { - async fn list( + fn list_streaming( &self, prefix: Option<&RemotePath>, mode: ListingMode, max_keys: Option, cancel: &CancellationToken, - ) -> Result { + ) -> impl Stream> { let kind = RequestKind::List; // s3 sdk wants i32 let mut max_keys = max_keys.map(|mk| mk.get() as i32); - let mut result = Listing::default(); // get the passed prefix or if it is not set use prefix_in_bucket value let list_prefix = prefix @@ -504,89 +488,99 @@ impl RemoteStorage for S3Bucket { }) }); - let _permit = self.permit(kind, cancel).await?; + async_stream::stream! { + let _permit = self.permit(kind, cancel).await?; - let mut continuation_token = None; + let mut continuation_token = None; + 'outer: loop { + let started_at = start_measuring_requests(kind); - loop { - let started_at = start_measuring_requests(kind); + // min of two Options, returning Some if one is value and another is + // None (None is smaller than anything, so plain min doesn't work). + let request_max_keys = self + .max_keys_per_list_response + .into_iter() + .chain(max_keys.into_iter()) + .min(); + let mut request = self + .client + .list_objects_v2() + .bucket(self.bucket_name.clone()) + .set_prefix(list_prefix.clone()) + .set_continuation_token(continuation_token.clone()) + .set_max_keys(request_max_keys); - // min of two Options, returning Some if one is value and another is - // None (None is smaller than anything, so plain min doesn't work). - let request_max_keys = self - .max_keys_per_list_response - .into_iter() - .chain(max_keys.into_iter()) - .min(); - let mut request = self - .client - .list_objects_v2() - .bucket(self.bucket_name.clone()) - .set_prefix(list_prefix.clone()) - .set_continuation_token(continuation_token) - .set_max_keys(request_max_keys); - - if let ListingMode::WithDelimiter = mode { - request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()); - } - - let request = request.send(); - - let response = tokio::select! { - res = request => res, - _ = tokio::time::sleep(self.timeout) => return Err(DownloadError::Timeout), - _ = cancel.cancelled() => return Err(DownloadError::Cancelled), - }; - - let response = response - .context("Failed to list S3 prefixes") - .map_err(DownloadError::Other); - - let started_at = ScopeGuard::into_inner(started_at); - - crate::metrics::BUCKET_METRICS - .req_seconds - .observe_elapsed(kind, &response, started_at); - - let response = response?; - - let keys = response.contents(); - let empty = Vec::new(); - let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty); - - tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len()); - - for object in keys { - let object_path = object.key().expect("response does not contain a key"); - let remote_path = self.s3_object_to_relative_path(object_path); - result.keys.push(remote_path); - if let Some(mut mk) = max_keys { - assert!(mk > 0); - mk -= 1; - if mk == 0 { - return Ok(result); // limit reached - } - max_keys = Some(mk); + if let ListingMode::WithDelimiter = mode { + request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()); } + + let request = request.send(); + + let response = tokio::select! { + res = request => Ok(res), + _ = tokio::time::sleep(self.timeout) => Err(DownloadError::Timeout), + _ = cancel.cancelled() => Err(DownloadError::Cancelled), + }?; + + let response = response + .context("Failed to list S3 prefixes") + .map_err(DownloadError::Other); + + let started_at = ScopeGuard::into_inner(started_at); + + crate::metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, &response, started_at); + + let response = match response { + Ok(response) => response, + Err(e) => { + // The error is potentially retryable, so we must rewind the loop after yielding. + yield Err(e); + continue 'outer; + }, + }; + + let keys = response.contents(); + let prefixes = response.common_prefixes.as_deref().unwrap_or_default(); + + tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len()); + let mut result = Listing::default(); + + for object in keys { + let object_path = object.key().expect("response does not contain a key"); + let remote_path = self.s3_object_to_relative_path(object_path); + result.keys.push(remote_path); + if let Some(mut mk) = max_keys { + assert!(mk > 0); + mk -= 1; + if mk == 0 { + // limit reached + yield Ok(result); + break 'outer; + } + max_keys = Some(mk); + } + } + + // S3 gives us prefixes like "foo/", we return them like "foo" + result.prefixes.extend(prefixes.iter().filter_map(|o| { + Some( + self.s3_object_to_relative_path( + o.prefix()? + .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR), + ), + ) + })); + + yield Ok(result); + + continuation_token = match response.next_continuation_token { + Some(new_token) => Some(new_token), + None => break, + }; } - - // S3 gives us prefixes like "foo/", we return them like "foo" - result.prefixes.extend(prefixes.iter().filter_map(|o| { - Some( - self.s3_object_to_relative_path( - o.prefix()? - .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR), - ), - ) - })); - - continuation_token = match response.next_continuation_token { - Some(new_token) => Some(new_token), - None => break, - }; } - - Ok(result) } async fn upload( @@ -1041,8 +1035,8 @@ mod tests { use crate::{RemotePath, S3Bucket, S3Config}; - #[test] - fn relative_path() { + #[tokio::test] + async fn relative_path() { let all_paths = ["", "some/path", "some/path/"]; let all_paths: Vec = all_paths .iter() @@ -1085,8 +1079,9 @@ mod tests { max_keys_per_list_response: Some(5), upload_storage_class: None, }; - let storage = - S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init"); + let storage = S3Bucket::new(&config, std::time::Duration::ZERO) + .await + .expect("remote storage init"); for (test_path_idx, test_path) in all_paths.iter().enumerate() { let result = storage.relative_path_to_s3_object(test_path); let expected = expected_outputs[prefix_idx][test_path_idx]; diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index c467a2d196..67e5be2955 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -3,6 +3,7 @@ //! testing purposes. use bytes::Bytes; use futures::stream::Stream; +use futures::StreamExt; use std::collections::HashMap; use std::num::NonZeroU32; use std::sync::Mutex; @@ -107,6 +108,23 @@ impl UnreliableWrapper { type VoidStorage = crate::LocalFs; impl RemoteStorage for UnreliableWrapper { + fn list_streaming( + &self, + prefix: Option<&RemotePath>, + mode: ListingMode, + max_keys: Option, + cancel: &CancellationToken, + ) -> impl Stream> { + async_stream::stream! { + self.attempt(RemoteOp::ListPrefixes(prefix.cloned())) + .map_err(DownloadError::Other)?; + let mut stream = self.inner + .list_streaming(prefix, mode, max_keys, cancel); + while let Some(item) = stream.next().await { + yield item; + } + } + } async fn list( &self, prefix: Option<&RemotePath>, diff --git a/libs/remote_storage/tests/common/mod.rs b/libs/remote_storage/tests/common/mod.rs index da9dc08d8d..daab05d91a 100644 --- a/libs/remote_storage/tests/common/mod.rs +++ b/libs/remote_storage/tests/common/mod.rs @@ -152,7 +152,7 @@ pub(crate) async fn upload_remote_data( let mut upload_tasks = JoinSet::new(); let cancel = CancellationToken::new(); - for i in 1..upload_tasks_count + 1 { + for i in 1..=upload_tasks_count { let task_client = Arc::clone(client); let cancel = cancel.clone(); diff --git a/libs/remote_storage/tests/common/tests.rs b/libs/remote_storage/tests/common/tests.rs index 673151c8ef..38c316397a 100644 --- a/libs/remote_storage/tests/common/tests.rs +++ b/libs/remote_storage/tests/common/tests.rs @@ -1,5 +1,6 @@ use anyhow::Context; use camino::Utf8Path; +use futures::StreamExt; use remote_storage::ListingMode; use remote_storage::RemotePath; use std::sync::Arc; @@ -29,10 +30,10 @@ use super::{ /// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only /// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}` /// -/// With the real S3 enabled and `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys. -/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3, -/// since current default AWS S3 pagination limit is 1000. -/// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax) +/// In the `MaybeEnabledStorageWithTestBlobs::setup`, we set the `max_keys_in_list_response` param to limit the keys in a single response. +/// This way, we are able to test the pagination, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3, +/// as the current default AWS S3 pagination limit is 1000. +/// (see ). /// /// Lastly, the test attempts to clean up and remove all uploaded S3 files. /// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished. @@ -87,6 +88,41 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}", ); + // list_streaming + + let prefix_with_slash = base_prefix.add_trailing_slash(); + let mut nested_remote_prefixes_st = test_client.list_streaming( + Some(&prefix_with_slash), + ListingMode::WithDelimiter, + None, + &cancel, + ); + let mut nested_remote_prefixes_combined = HashSet::new(); + let mut segments = 0; + let mut segment_max_size = 0; + while let Some(st) = nested_remote_prefixes_st.next().await { + let st = st?; + segment_max_size = segment_max_size.max(st.prefixes.len()); + nested_remote_prefixes_combined.extend(st.prefixes.into_iter()); + segments += 1; + } + assert!(segments > 1, "less than 2 segments: {segments}"); + assert!( + segment_max_size * 2 <= nested_remote_prefixes_combined.len(), + "double of segment_max_size={segment_max_size} larger number of remote prefixes of {}", + nested_remote_prefixes_combined.len() + ); + let remote_only_prefixes = nested_remote_prefixes_combined + .difference(&expected_remote_prefixes) + .collect::>(); + let missing_uploaded_prefixes = expected_remote_prefixes + .difference(&nested_remote_prefixes_combined) + .collect::>(); + assert_eq!( + remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0, + "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}", + ); + Ok(()) } diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 23628dfebe..3a20649490 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -31,6 +31,7 @@ struct EnabledAzure { impl EnabledAzure { async fn setup(max_keys_in_list_response: Option) -> Self { let client = create_azure_client(max_keys_in_list_response) + .await .context("Azure client creation") .expect("Azure client creation failed"); @@ -187,7 +188,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs { } } -fn create_azure_client( +async fn create_azure_client( max_keys_per_list_response: Option, ) -> anyhow::Result> { use rand::Rng; @@ -221,6 +222,8 @@ fn create_azure_client( timeout: Duration::from_secs(120), }; Ok(Arc::new( - GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?, + GenericRemoteStorage::from_config(&remote_storage_config) + .await + .context("remote storage init")?, )) } diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index a273abe867..342bc6da0b 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -197,6 +197,7 @@ struct EnabledS3 { impl EnabledS3 { async fn setup(max_keys_in_list_response: Option) -> Self { let client = create_s3_client(max_keys_in_list_response) + .await .context("S3 client creation") .expect("S3 client creation failed"); @@ -352,7 +353,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs { } } -fn create_s3_client( +async fn create_s3_client( max_keys_per_list_response: Option, ) -> anyhow::Result> { use rand::Rng; @@ -385,7 +386,9 @@ fn create_s3_client( timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }; Ok(Arc::new( - GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?, + GenericRemoteStorage::from_config(&remote_storage_config) + .await + .context("remote storage init")?, )) } diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 261ca2cc1a..ec05f849cf 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -20,7 +20,6 @@ bincode.workspace = true bytes.workspace = true camino.workspace = true chrono.workspace = true -heapless.workspace = true hex = { workspace = true, features = ["serde"] } humantime.workspace = true hyper = { workspace = true, features = ["full"] } diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index 03e65f74fe..a1170a460d 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -33,6 +33,10 @@ pub enum Scope { GenerationsApi, // Allows access to control plane managment API and some storage controller endpoints. Admin, + + /// Allows access to storage controller APIs used by the scrubber, to interrogate the state + /// of a tenant & post scrub results. + Scrubber, } /// JWT payload. See docs/authentication.md for the format diff --git a/libs/utils/src/circuit_breaker.rs b/libs/utils/src/circuit_breaker.rs new file mode 100644 index 0000000000..720ea39d4f --- /dev/null +++ b/libs/utils/src/circuit_breaker.rs @@ -0,0 +1,114 @@ +use std::{ + fmt::Display, + time::{Duration, Instant}, +}; + +use metrics::IntCounter; + +/// Circuit breakers are for operations that are expensive and fallible: if they fail repeatedly, +/// we will stop attempting them for some period of time, to avoid denial-of-service from retries, and +/// to mitigate the log spam from repeated failures. +pub struct CircuitBreaker { + /// An identifier that enables us to log useful errors when a circuit is broken + name: String, + + /// Consecutive failures since last success + fail_count: usize, + + /// How many consecutive failures before we break the circuit + fail_threshold: usize, + + /// If circuit is broken, when was it broken? + broken_at: Option, + + /// If set, we will auto-reset the circuit this long after it was broken. If None, broken + /// circuits stay broken forever, or until success() is called. + reset_period: Option, + + /// If this is true, no actual circuit-breaking happens. This is for overriding a circuit breaker + /// to permit something to keep running even if it would otherwise have tripped it. + short_circuit: bool, +} + +impl CircuitBreaker { + pub fn new(name: String, fail_threshold: usize, reset_period: Option) -> Self { + Self { + name, + fail_count: 0, + fail_threshold, + broken_at: None, + reset_period, + short_circuit: false, + } + } + + /// Construct an unbreakable circuit breaker, for use in unit tests etc. + pub fn short_circuit() -> Self { + Self { + name: String::new(), + fail_threshold: 0, + fail_count: 0, + broken_at: None, + reset_period: None, + short_circuit: true, + } + } + + pub fn fail(&mut self, metric: &IntCounter, error: E) + where + E: Display, + { + if self.short_circuit { + return; + } + + self.fail_count += 1; + if self.broken_at.is_none() && self.fail_count >= self.fail_threshold { + self.break_circuit(metric, error); + } + } + + /// Call this after successfully executing an operation + pub fn success(&mut self, metric: &IntCounter) { + self.fail_count = 0; + if let Some(broken_at) = &self.broken_at { + tracing::info!(breaker=%self.name, "Circuit breaker failure ended (was broken for {})", + humantime::format_duration(broken_at.elapsed())); + self.broken_at = None; + metric.inc(); + } + } + + /// Call this before attempting an operation, and skip the operation if we are currently broken. + pub fn is_broken(&mut self) -> bool { + if self.short_circuit { + return false; + } + + if let Some(broken_at) = self.broken_at { + match self.reset_period { + Some(reset_period) if broken_at.elapsed() > reset_period => { + self.reset_circuit(); + false + } + _ => true, + } + } else { + false + } + } + + fn break_circuit(&mut self, metric: &IntCounter, error: E) + where + E: Display, + { + self.broken_at = Some(Instant::now()); + tracing::error!(breaker=%self.name, "Circuit breaker broken! Last error: {error}"); + metric.inc(); + } + + fn reset_circuit(&mut self) { + self.broken_at = None; + self.fail_count = 0; + } +} diff --git a/libs/utils/src/history_buffer.rs b/libs/utils/src/history_buffer.rs deleted file mode 100644 index bd35e2bad6..0000000000 --- a/libs/utils/src/history_buffer.rs +++ /dev/null @@ -1,196 +0,0 @@ -//! A heapless buffer for events of sorts. - -use std::ops; - -use heapless::HistoryBuffer; - -#[derive(Debug, Clone)] -pub struct HistoryBufferWithDropCounter { - buffer: HistoryBuffer, - drop_count: u64, -} - -impl HistoryBufferWithDropCounter { - pub fn write(&mut self, data: T) { - let len_before = self.buffer.len(); - self.buffer.write(data); - let len_after = self.buffer.len(); - self.drop_count += u64::from(len_before == len_after); - } - pub fn drop_count(&self) -> u64 { - self.drop_count - } - pub fn map U>(&self, f: F) -> HistoryBufferWithDropCounter { - let mut buffer = HistoryBuffer::new(); - buffer.extend(self.buffer.oldest_ordered().map(f)); - HistoryBufferWithDropCounter:: { - buffer, - drop_count: self.drop_count, - } - } -} - -impl Default for HistoryBufferWithDropCounter { - fn default() -> Self { - Self { - buffer: HistoryBuffer::default(), - drop_count: 0, - } - } -} - -impl ops::Deref for HistoryBufferWithDropCounter { - type Target = HistoryBuffer; - - fn deref(&self) -> &Self::Target { - &self.buffer - } -} - -#[derive(serde::Serialize, serde::Deserialize)] -struct SerdeRepr { - buffer: Vec, - buffer_size: usize, - drop_count: u64, -} - -impl<'a, T, const L: usize> From<&'a HistoryBufferWithDropCounter> for SerdeRepr -where - T: Clone + serde::Serialize, -{ - fn from(value: &'a HistoryBufferWithDropCounter) -> Self { - let HistoryBufferWithDropCounter { buffer, drop_count } = value; - SerdeRepr { - buffer: buffer.iter().cloned().collect(), - buffer_size: L, - drop_count: *drop_count, - } - } -} - -impl serde::Serialize for HistoryBufferWithDropCounter -where - T: Clone + serde::Serialize, -{ - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - SerdeRepr::from(self).serialize(serializer) - } -} - -impl<'de, T, const L: usize> serde::de::Deserialize<'de> for HistoryBufferWithDropCounter -where - T: Clone + serde::Deserialize<'de>, -{ - fn deserialize(deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - let SerdeRepr { - buffer: des_buffer, - drop_count, - buffer_size, - } = SerdeRepr::::deserialize(deserializer)?; - if buffer_size != L { - use serde::de::Error; - return Err(D::Error::custom(format!( - "invalid buffer_size, expecting {L} got {buffer_size}" - ))); - } - let mut buffer = HistoryBuffer::new(); - buffer.extend(des_buffer); - Ok(HistoryBufferWithDropCounter { buffer, drop_count }) - } -} - -#[cfg(test)] -mod test { - use super::HistoryBufferWithDropCounter; - - #[test] - fn test_basics() { - let mut b = HistoryBufferWithDropCounter::::default(); - b.write(1); - b.write(2); - b.write(3); - assert!(b.iter().any(|e| *e == 2)); - assert!(b.iter().any(|e| *e == 3)); - assert!(!b.iter().any(|e| *e == 1)); - - // round-trip serde - let round_tripped: HistoryBufferWithDropCounter = - serde_json::from_str(&serde_json::to_string(&b).unwrap()).unwrap(); - assert_eq!( - round_tripped.iter().cloned().collect::>(), - b.iter().cloned().collect::>() - ); - } - - #[test] - fn test_drop_count_works() { - let mut b = HistoryBufferWithDropCounter::<_, 2>::default(); - b.write(1); - assert_eq!(b.drop_count(), 0); - b.write(2); - assert_eq!(b.drop_count(), 0); - b.write(3); - assert_eq!(b.drop_count(), 1); - b.write(4); - assert_eq!(b.drop_count(), 2); - } - - #[test] - fn test_clone_works() { - let mut b = HistoryBufferWithDropCounter::<_, 2>::default(); - b.write(1); - b.write(2); - b.write(3); - assert_eq!(b.drop_count(), 1); - let mut c = b.clone(); - assert_eq!(c.drop_count(), 1); - assert!(c.iter().any(|e| *e == 2)); - assert!(c.iter().any(|e| *e == 3)); - assert!(!c.iter().any(|e| *e == 1)); - - c.write(4); - assert!(c.iter().any(|e| *e == 4)); - assert!(!b.iter().any(|e| *e == 4)); - } - - #[test] - fn test_map() { - let mut b = HistoryBufferWithDropCounter::<_, 2>::default(); - - b.write(1); - assert_eq!(b.drop_count(), 0); - { - let c = b.map(|i| i + 10); - assert_eq!(c.oldest_ordered().cloned().collect::>(), vec![11]); - assert_eq!(c.drop_count(), 0); - } - - b.write(2); - assert_eq!(b.drop_count(), 0); - { - let c = b.map(|i| i + 10); - assert_eq!( - c.oldest_ordered().cloned().collect::>(), - vec![11, 12] - ); - assert_eq!(c.drop_count(), 0); - } - - b.write(3); - assert_eq!(b.drop_count(), 1); - { - let c = b.map(|i| i + 10); - assert_eq!( - c.oldest_ordered().cloned().collect::>(), - vec![12, 13] - ); - assert_eq!(c.drop_count(), 1); - } - } -} diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index f8a5f68131..8ee5abd434 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -52,17 +52,17 @@ struct RequestId(String); /// There could be other ways to implement similar functionality: /// /// * procmacros placed on top of all handler methods -/// With all the drawbacks of procmacros, brings no difference implementation-wise, -/// and little code reduction compared to the existing approach. +/// With all the drawbacks of procmacros, brings no difference implementation-wise, +/// and little code reduction compared to the existing approach. /// /// * Another `TraitExt` with e.g. the `get_with_span`, `post_with_span` methods to do similar logic, -/// implemented for [`RouterBuilder`]. -/// Could be simpler, but we don't want to depend on [`routerify`] more, targeting to use other library later. +/// implemented for [`RouterBuilder`]. +/// Could be simpler, but we don't want to depend on [`routerify`] more, targeting to use other library later. /// /// * In theory, a span guard could've been created in a pre-request middleware and placed into a global collection, to be dropped -/// later, in a post-response middleware. -/// Due to suspendable nature of the futures, would give contradictive results which is exactly the opposite of what `tracing-futures` -/// tries to achive with its `.instrument` used in the current approach. +/// later, in a post-response middleware. +/// Due to suspendable nature of the futures, would give contradictive results which is exactly the opposite of what `tracing-futures` +/// tries to achive with its `.instrument` used in the current approach. /// /// If needed, a declarative macro to substitute the |r| ... closure boilerplate could be introduced. pub async fn request_span(request: Request, handler: H) -> R::Output diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs index 0409001f4f..db468e3054 100644 --- a/libs/utils/src/id.rs +++ b/libs/utils/src/id.rs @@ -302,17 +302,6 @@ pub struct TenantId(Id); id_newtype!(TenantId); -/// Neon Connection Id identifies long-lived connections (for example a pagestream -/// connection with the page_service). Is used for better logging and tracing -/// -/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look -/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`. -/// See [`Id`] for alternative ways to serialize it. -#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)] -pub struct ConnectionId(Id); - -id_newtype!(ConnectionId); - // A pair uniquely identifying Neon instance. #[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct TenantTimelineId { diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 711e617801..a46d68ef33 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -59,8 +59,6 @@ pub mod signals; pub mod fs_ext; -pub mod history_buffer; - pub mod measured_stream; pub mod serde_percent; @@ -98,6 +96,8 @@ pub mod poison; pub mod toml_edit_ext; +pub mod circuit_breaker; + /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// /// we have several cases: diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs index 4f9ac6bdb4..f6b430657e 100644 --- a/libs/utils/src/shard.rs +++ b/libs/utils/src/shard.rs @@ -49,6 +49,7 @@ pub struct TenantShardId { impl ShardCount { pub const MAX: Self = Self(u8::MAX); + pub const MIN: Self = Self(0); /// The internal value of a ShardCount may be zero, which means "1 shard, but use /// legacy format for TenantShardId that excludes the shard suffix", also known diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index e3ddb446fa..ac3ff1bb89 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -1,6 +1,7 @@ use std::collections::HashMap; use bytes::Bytes; +use detach_ancestor::AncestorDetached; use pageserver_api::{models::*, shard::TenantShardId}; use reqwest::{IntoUrl, Method, StatusCode}; use utils::{ @@ -418,6 +419,23 @@ impl Client { } } + pub async fn timeline_detach_ancestor( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor", + self.mgmt_api_endpoint + ); + + self.request(Method::PUT, &uri, ()) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> { let uri = format!( "{}/v1/tenant/{}/reset", diff --git a/pageserver/compaction/src/interface.rs b/pageserver/compaction/src/interface.rs index 35519b5d0a..5bc9b5ca1d 100644 --- a/pageserver/compaction/src/interface.rs +++ b/pageserver/compaction/src/interface.rs @@ -131,7 +131,7 @@ impl CompactionKey for Key { pub type CompactionKeySpace = Vec>; /// Functions needed from all layers. -pub trait CompactionLayer { +pub trait CompactionLayer { fn key_range(&self) -> &Range; fn lsn_range(&self) -> &Range; diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index ea09a011e5..3fabf62987 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -179,7 +179,7 @@ async fn main() -> anyhow::Result<()> { .get("remote_storage") .expect("need remote_storage"); let config = RemoteStorageConfig::from_toml(toml_item)?; - let storage = remote_storage::GenericRemoteStorage::from_config(&config); + let storage = remote_storage::GenericRemoteStorage::from_config(&config).await; let cancel = CancellationToken::new(); storage .unwrap() diff --git a/pageserver/src/auth.rs b/pageserver/src/auth.rs index 4785c8c4c5..9e3dedb75a 100644 --- a/pageserver/src/auth.rs +++ b/pageserver/src/auth.rs @@ -14,12 +14,14 @@ pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result< } (Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope (Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope - (Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi, _) => Err(AuthError( - format!( - "JWT scope '{:?}' is ineligible for Pageserver auth", - claims.scope - ) - .into(), - )), + (Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi | Scope::Scrubber, _) => { + Err(AuthError( + format!( + "JWT scope '{:?}' is ineligible for Pageserver auth", + claims.scope + ) + .into(), + )) + } } } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 9f705f0bc9..7a96c86ded 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -2,30 +2,35 @@ //! Main entry point for the Page Server executable. +use std::env; use std::env::{var, VarError}; use std::io::Read; use std::sync::Arc; use std::time::Duration; -use std::{env, ops::ControlFlow, str::FromStr}; use anyhow::{anyhow, Context}; use camino::Utf8Path; use clap::{Arg, ArgAction, Command}; use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp}; +use pageserver::config::PageserverIdentity; use pageserver::control_plane_client::ControlPlaneClient; use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; use pageserver::task_mgr::WALRECEIVER_RUNTIME; use pageserver::tenant::{secondary, TenantSharedResources}; +use pageserver::{ + CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, LibpqEndpointListener, +}; use remote_storage::GenericRemoteStorage; use tokio::signal::unix::SignalKind; use tokio::time::Instant; +use tokio_util::sync::CancellationToken; use tracing::*; use metrics::set_build_info_metric; use pageserver::{ - config::{defaults::*, PageServerConf}, + config::PageServerConf, context::{DownloadBehavior, RequestContext}, deletion_queue::DeletionQueue, http, page_cache, page_service, task_mgr, @@ -84,18 +89,13 @@ fn main() -> anyhow::Result<()> { .with_context(|| format!("Error opening workdir '{workdir}'"))?; let cfg_file_path = workdir.join("pageserver.toml"); + let identity_file_path = workdir.join("identity.toml"); // Set CWD to workdir for non-daemon modes env::set_current_dir(&workdir) .with_context(|| format!("Failed to set application's current dir to '{workdir}'"))?; - let conf = match initialize_config(&cfg_file_path, arg_matches, &workdir)? { - ControlFlow::Continue(conf) => conf, - ControlFlow::Break(()) => { - info!("Pageserver config init successful"); - return Ok(()); - } - }; + let conf = initialize_config(&identity_file_path, &cfg_file_path, &workdir)?; // Initialize logging. // @@ -150,70 +150,55 @@ fn main() -> anyhow::Result<()> { } fn initialize_config( + identity_file_path: &Utf8Path, cfg_file_path: &Utf8Path, - arg_matches: clap::ArgMatches, workdir: &Utf8Path, -) -> anyhow::Result> { - let init = arg_matches.get_flag("init"); - - let file_contents: Option = match std::fs::File::open(cfg_file_path) { +) -> anyhow::Result<&'static PageServerConf> { + // The deployment orchestrator writes out an indentity file containing the node id + // for all pageservers. This file is the source of truth for the node id. In order + // to allow for rolling back pageserver releases, the node id is also included in + // the pageserver config that the deployment orchestrator writes to disk for the pageserver. + // A rolled back version of the pageserver will get the node id from the pageserver.toml + // config file. + let identity = match std::fs::File::open(identity_file_path) { Ok(mut f) => { - if init { - anyhow::bail!("config file already exists: {cfg_file_path}"); + let md = f.metadata().context("stat config file")?; + if !md.is_file() { + anyhow::bail!("Pageserver found identity file but it is a dir entry: {identity_file_path}. Aborting start up ..."); } + + let mut s = String::new(); + f.read_to_string(&mut s).context("read identity file")?; + toml_edit::de::from_str::(&s)? + } + Err(e) => { + anyhow::bail!("Pageserver could not read identity file: {identity_file_path}: {e}. Aborting start up ..."); + } + }; + + let config: toml_edit::Document = match std::fs::File::open(cfg_file_path) { + Ok(mut f) => { let md = f.metadata().context("stat config file")?; if md.is_file() { let mut s = String::new(); f.read_to_string(&mut s).context("read config file")?; - Some(s.parse().context("parse config file toml")?) + s.parse().context("parse config file toml")? } else { anyhow::bail!("directory entry exists but is not a file: {cfg_file_path}"); } } - Err(e) if e.kind() == std::io::ErrorKind::NotFound => None, Err(e) => { anyhow::bail!("open pageserver config: {e}: {cfg_file_path}"); } }; - let mut effective_config = file_contents.unwrap_or_else(|| { - DEFAULT_CONFIG_FILE - .parse() - .expect("unit tests ensure this works") - }); - - // Patch with overrides from the command line - if let Some(values) = arg_matches.get_many::("config-override") { - for option_line in values { - let doc = toml_edit::Document::from_str(option_line).with_context(|| { - format!("Option '{option_line}' could not be parsed as a toml document") - })?; - - for (key, item) in doc.iter() { - effective_config.insert(key, item.clone()); - } - } - } - - debug!("Resulting toml: {effective_config}"); + debug!("Using pageserver toml: {config}"); // Construct the runtime representation - let conf = PageServerConf::parse_and_validate(&effective_config, workdir) + let conf = PageServerConf::parse_and_validate(identity.id, &config, workdir) .context("Failed to parse pageserver configuration")?; - if init { - info!("Writing pageserver config to '{cfg_file_path}'"); - - std::fs::write(cfg_file_path, effective_config.to_string()) - .with_context(|| format!("Failed to write pageserver config to '{cfg_file_path}'"))?; - info!("Config successfully written to '{cfg_file_path}'") - } - - Ok(if init { - ControlFlow::Break(()) - } else { - ControlFlow::Continue(Box::leak(Box::new(conf))) - }) + Ok(Box::leak(Box::new(conf))) } struct WaitForPhaseResult { @@ -305,6 +290,7 @@ fn start_pageserver( // Create and lock PID file. This ensures that there cannot be more than one // pageserver process running at the same time. let lock_file_path = conf.workdir.join(PID_FILE_NAME); + info!("Claiming pid file at {lock_file_path:?}..."); let lock_file = utils::pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?; info!("Claimed pid file at {lock_file_path:?}"); @@ -385,7 +371,7 @@ fn start_pageserver( let shutdown_pageserver = tokio_util::sync::CancellationToken::new(); // Set up remote storage client - let remote_storage = create_remote_storage_client(conf)?; + let remote_storage = BACKGROUND_RUNTIME.block_on(create_remote_storage_client(conf))?; // Set up deletion queue let (deletion_queue, deletion_workers) = DeletionQueue::new( @@ -430,8 +416,10 @@ fn start_pageserver( // Scan the local 'tenants/' directory and start loading the tenants let deletion_queue_client = deletion_queue.new_client(); + let background_purges = mgr::BackgroundPurges::default(); let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr( conf, + background_purges.clone(), TenantSharedResources { broker_client: broker_client.clone(), remote_storage: remote_storage.clone(), @@ -523,7 +511,7 @@ fn start_pageserver( } }); - let secondary_controller = secondary::spawn_tasks( + let (secondary_controller, secondary_controller_tasks) = secondary::spawn_tasks( tenant_manager.clone(), remote_storage.clone(), background_jobs_barrier.clone(), @@ -536,18 +524,19 @@ fn start_pageserver( // been configured. let disk_usage_eviction_state: Arc = Arc::default(); - launch_disk_usage_global_eviction_task( + let disk_usage_eviction_task = launch_disk_usage_global_eviction_task( conf, remote_storage.clone(), disk_usage_eviction_state.clone(), tenant_manager.clone(), background_jobs_barrier.clone(), - )?; + ); // Start up the service to handle HTTP mgmt API request. We created the // listener earlier already. - { - let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); + let http_endpoint_listener = { + let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); // for hyper + let cancel = CancellationToken::new(); let router_state = Arc::new( http::routes::State::new( @@ -568,78 +557,44 @@ fn start_pageserver( let service = utils::http::RouterService::new(router).unwrap(); let server = hyper::Server::from_tcp(http_listener)? .serve(service) - .with_graceful_shutdown(task_mgr::shutdown_watcher()); + .with_graceful_shutdown({ + let cancel = cancel.clone(); + async move { cancel.clone().cancelled().await } + }); - task_mgr::spawn( - MGMT_REQUEST_RUNTIME.handle(), - TaskKind::HttpEndpointListener, - None, - None, + let task = MGMT_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( "http endpoint listener", - true, - async { - server.await?; - Ok(()) - }, - ); - } + server, + )); + HttpEndpointListener(CancellableTask { task, cancel }) + }; - if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint { - let metrics_ctx = RequestContext::todo_child( - TaskKind::MetricsCollection, - // This task itself shouldn't download anything. - // The actual size calculation does need downloads, and - // creates a child context with the right DownloadBehavior. - DownloadBehavior::Error, - ); + let consumption_metrics_tasks = { + let cancel = shutdown_pageserver.child_token(); + let task = crate::BACKGROUND_RUNTIME.spawn({ + let tenant_manager = tenant_manager.clone(); + let cancel = cancel.clone(); + async move { + // first wait until background jobs are cleared to launch. + // + // this is because we only process active tenants and timelines, and the + // Timeline::get_current_logical_size will spawn the logical size calculation, + // which will not be rate-limited. + tokio::select! { + _ = cancel.cancelled() => { return; }, + _ = background_jobs_barrier.wait() => {} + }; - let local_disk_storage = conf.workdir.join("last_consumption_metrics.json"); - - task_mgr::spawn( - crate::BACKGROUND_RUNTIME.handle(), - TaskKind::MetricsCollection, - None, - None, - "consumption metrics collection", - true, - { - let tenant_manager = tenant_manager.clone(); - async move { - // first wait until background jobs are cleared to launch. - // - // this is because we only process active tenants and timelines, and the - // Timeline::get_current_logical_size will spawn the logical size calculation, - // which will not be rate-limited. - let cancel = task_mgr::shutdown_token(); - - tokio::select! { - _ = cancel.cancelled() => { return Ok(()); }, - _ = background_jobs_barrier.wait() => {} - }; - - pageserver::consumption_metrics::collect_metrics( - tenant_manager, - metric_collection_endpoint, - &conf.metric_collection_bucket, - conf.metric_collection_interval, - conf.cached_metric_collection_interval, - conf.synthetic_size_calculation_interval, - conf.id, - local_disk_storage, - cancel, - metrics_ctx, - ) - .instrument(info_span!("metrics_collection")) - .await?; - Ok(()) - } - }, - ); - } + pageserver::consumption_metrics::run(conf, tenant_manager, cancel).await; + } + }); + ConsumptionMetricsTasks(CancellableTask { task, cancel }) + }; // Spawn a task to listen for libpq connections. It will spawn further tasks // for each connection. We created the listener earlier already. - { + let libpq_listener = { + let cancel = CancellationToken::new(); let libpq_ctx = RequestContext::todo_child( TaskKind::LibpqEndpointListener, // listener task shouldn't need to download anything. (We will @@ -648,29 +603,20 @@ fn start_pageserver( // accept connections.) DownloadBehavior::Error, ); - task_mgr::spawn( - COMPUTE_REQUEST_RUNTIME.handle(), - TaskKind::LibpqEndpointListener, - None, - None, - "libpq endpoint listener", - true, - { - let tenant_manager = tenant_manager.clone(); - async move { - page_service::libpq_listener_main( - tenant_manager, - pg_auth, - pageserver_listener, - conf.pg_auth_type, - libpq_ctx, - task_mgr::shutdown_token(), - ) - .await - } - }, - ); - } + + let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( + "libpq listener", + page_service::libpq_listener_main( + tenant_manager.clone(), + pg_auth, + pageserver_listener, + conf.pg_auth_type, + libpq_ctx, + cancel.clone(), + ), + )); + LibpqEndpointListener(CancellableTask { task, cancel }) + }; let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard()); @@ -696,13 +642,24 @@ fn start_pageserver( // Right now that tree doesn't reach very far, and `task_mgr` is used instead. // The plan is to change that over time. shutdown_pageserver.take(); - pageserver::shutdown_pageserver(&tenant_manager, deletion_queue.clone(), 0).await; + pageserver::shutdown_pageserver( + http_endpoint_listener, + libpq_listener, + consumption_metrics_tasks, + disk_usage_eviction_task, + &tenant_manager, + background_purges, + deletion_queue.clone(), + secondary_controller_tasks, + 0, + ) + .await; unreachable!() }) } } -fn create_remote_storage_client( +async fn create_remote_storage_client( conf: &'static PageServerConf, ) -> anyhow::Result { let config = if let Some(config) = &conf.remote_storage_config { @@ -712,7 +669,7 @@ fn create_remote_storage_client( }; // Create the client - let mut remote_storage = GenericRemoteStorage::from_config(config)?; + let mut remote_storage = GenericRemoteStorage::from_config(config).await?; // If `test_remote_failures` is non-zero, wrap the client with a // wrapper that simulates failures. @@ -735,28 +692,12 @@ fn cli() -> Command { Command::new("Neon page server") .about("Materializes WAL stream to pages and serves them to the postgres") .version(version()) - .arg( - Arg::new("init") - .long("init") - .action(ArgAction::SetTrue) - .help("Initialize pageserver with all given config overrides"), - ) .arg( Arg::new("workdir") .short('D') .long("workdir") .help("Working directory for the pageserver"), ) - // See `settings.md` for more details on the extra configuration patameters pageserver can process - .arg( - Arg::new("config-override") - .long("config-override") - .short('c') - .num_args(1) - .action(ArgAction::Append) - .help("Additional configuration overrides of the ones from the toml config file (or new ones to add there). \ - Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"), - ) .arg( Arg::new("enabled-features") .long("enabled-features") diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 17bc427b2c..20e78b1d85 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -7,12 +7,11 @@ use anyhow::{anyhow, bail, ensure, Context, Result}; use pageserver_api::{models::ImageCompressionAlgorithm, shard::TenantShardId}; use remote_storage::{RemotePath, RemoteStorageConfig}; -use serde; use serde::de::IntoDeserializer; +use serde::{self, Deserialize}; use std::env; use storage_broker::Uri; use utils::crashsafe::path_with_suffix_extension; -use utils::id::ConnectionId; use utils::logging::SecretString; use once_cell::sync::OnceCell; @@ -69,7 +68,6 @@ pub mod defaults { super::ConfigurableSemaphore::DEFAULT_INITIAL.get(); pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min"; - pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "0s"; pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option = None; pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min"; pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s"; @@ -124,7 +122,6 @@ pub mod defaults { #concurrent_tenant_warmup = '{DEFAULT_CONCURRENT_TENANT_WARMUP}' #metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}' -#cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}' #synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}' #disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}} @@ -239,7 +236,6 @@ pub struct PageServerConf { // How often to collect metrics and send them to the metrics endpoint. pub metric_collection_interval: Duration, // How often to send unchanged cached metrics to the metrics endpoint. - pub cached_metric_collection_interval: Duration, pub metric_collection_endpoint: Option, pub metric_collection_bucket: Option, pub synthetic_size_calculation_interval: Duration, @@ -371,7 +367,6 @@ struct PageServerConfigBuilder { concurrent_tenant_size_logical_size_queries: BuilderValue, metric_collection_interval: BuilderValue, - cached_metric_collection_interval: BuilderValue, metric_collection_endpoint: BuilderValue>, synthetic_size_calculation_interval: BuilderValue, metric_collection_bucket: BuilderValue>, @@ -411,6 +406,13 @@ struct PageServerConfigBuilder { } impl PageServerConfigBuilder { + fn new(node_id: NodeId) -> Self { + let mut this = Self::default(); + this.id(node_id); + + this + } + #[inline(always)] fn default_values() -> Self { use self::BuilderValue::*; @@ -455,10 +457,6 @@ impl PageServerConfigBuilder { DEFAULT_METRIC_COLLECTION_INTERVAL, ) .expect("cannot parse default metric collection interval")), - cached_metric_collection_interval: Set(humantime::parse_duration( - DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL, - ) - .expect("cannot parse default cached_metric_collection_interval")), synthetic_size_calculation_interval: Set(humantime::parse_duration( DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL, ) @@ -590,14 +588,6 @@ impl PageServerConfigBuilder { self.metric_collection_interval = BuilderValue::Set(metric_collection_interval) } - pub fn cached_metric_collection_interval( - &mut self, - cached_metric_collection_interval: Duration, - ) { - self.cached_metric_collection_interval = - BuilderValue::Set(cached_metric_collection_interval) - } - pub fn metric_collection_endpoint(&mut self, metric_collection_endpoint: Option) { self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint) } @@ -731,7 +721,6 @@ impl PageServerConfigBuilder { broker_keepalive_interval, log_format, metric_collection_interval, - cached_metric_collection_interval, metric_collection_endpoint, metric_collection_bucket, synthetic_size_calculation_interval, @@ -870,22 +859,6 @@ impl PageServerConf { ) } - pub fn traces_path(&self) -> Utf8PathBuf { - self.workdir.join("traces") - } - - pub fn trace_path( - &self, - tenant_shard_id: &TenantShardId, - timeline_id: &TimelineId, - connection_id: &ConnectionId, - ) -> Utf8PathBuf { - self.traces_path() - .join(tenant_shard_id.to_string()) - .join(timeline_id.to_string()) - .join(connection_id.to_string()) - } - /// Turns storage remote path of a file into its local path. pub fn local_path(&self, remote_path: &RemotePath) -> Utf8PathBuf { remote_path.with_base(&self.workdir) @@ -915,8 +888,12 @@ impl PageServerConf { /// validating the input and failing on errors. /// /// This leaves any options not present in the file in the built-in defaults. - pub fn parse_and_validate(toml: &Document, workdir: &Utf8Path) -> anyhow::Result { - let mut builder = PageServerConfigBuilder::default(); + pub fn parse_and_validate( + node_id: NodeId, + toml: &Document, + workdir: &Utf8Path, + ) -> anyhow::Result { + let mut builder = PageServerConfigBuilder::new(node_id); builder.workdir(workdir.to_owned()); let mut t_conf = TenantConfOpt::default(); @@ -947,7 +924,8 @@ impl PageServerConf { "tenant_config" => { t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?; } - "id" => builder.id(NodeId(parse_toml_u64(key, item)?)), + "id" => {}, // Ignoring `id` field in pageserver.toml - using identity.toml as the source of truth + // Logging is not set up yet, so we can't do it. "broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?), "broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?), "log_format" => builder.log_format( @@ -964,7 +942,6 @@ impl PageServerConf { NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")? }), "metric_collection_interval" => builder.metric_collection_interval(parse_toml_duration(key, item)?), - "cached_metric_collection_interval" => builder.cached_metric_collection_interval(parse_toml_duration(key, item)?), "metric_collection_endpoint" => { let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?; builder.metric_collection_endpoint(Some(endpoint)); @@ -1097,7 +1074,6 @@ impl PageServerConf { eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default( ), metric_collection_interval: Duration::from_secs(60), - cached_metric_collection_interval: Duration::from_secs(60 * 60), metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, metric_collection_bucket: None, synthetic_size_calculation_interval: Duration::from_secs(60), @@ -1126,6 +1102,12 @@ impl PageServerConf { } } +#[derive(Deserialize)] +#[serde(deny_unknown_fields)] +pub struct PageserverIdentity { + pub id: NodeId, +} + // Helper functions to parse a toml Item fn parse_toml_string(name: &str, item: &Item) -> Result { @@ -1276,7 +1258,6 @@ initial_superuser_name = 'zzzz' id = 10 metric_collection_interval = '222 s' -cached_metric_collection_interval = '22200 s' metric_collection_endpoint = 'http://localhost:80/metrics' synthetic_size_calculation_interval = '333 s' @@ -1296,7 +1277,7 @@ background_task_maximum_delay = '334 s' ); let toml = config_string.parse()?; - let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir) + let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}")); assert_eq!( @@ -1332,9 +1313,6 @@ background_task_maximum_delay = '334 s' metric_collection_interval: humantime::parse_duration( defaults::DEFAULT_METRIC_COLLECTION_INTERVAL )?, - cached_metric_collection_interval: humantime::parse_duration( - defaults::DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL - )?, metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, metric_collection_bucket: None, synthetic_size_calculation_interval: humantime::parse_duration( @@ -1381,7 +1359,7 @@ background_task_maximum_delay = '334 s' ); let toml = config_string.parse()?; - let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir) + let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}")); assert_eq!( @@ -1413,7 +1391,6 @@ background_task_maximum_delay = '334 s' eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default(), metric_collection_interval: Duration::from_secs(222), - cached_metric_collection_interval: Duration::from_secs(22200), metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?), metric_collection_bucket: None, synthetic_size_calculation_interval: Duration::from_secs(333), @@ -1472,12 +1449,13 @@ broker_endpoint = '{broker_endpoint}' let toml = config_string.parse()?; - let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir) - .unwrap_or_else(|e| { - panic!("Failed to parse config '{config_string}', reason: {e:?}") - }) - .remote_storage_config - .expect("Should have remote storage config for the local FS"); + let parsed_remote_storage_config = + PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) + .unwrap_or_else(|e| { + panic!("Failed to parse config '{config_string}', reason: {e:?}") + }) + .remote_storage_config + .expect("Should have remote storage config for the local FS"); assert_eq!( parsed_remote_storage_config, @@ -1533,12 +1511,13 @@ broker_endpoint = '{broker_endpoint}' let toml = config_string.parse()?; - let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir) - .unwrap_or_else(|e| { - panic!("Failed to parse config '{config_string}', reason: {e:?}") - }) - .remote_storage_config - .expect("Should have remote storage config for S3"); + let parsed_remote_storage_config = + PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) + .unwrap_or_else(|e| { + panic!("Failed to parse config '{config_string}', reason: {e:?}") + }) + .remote_storage_config + .expect("Should have remote storage config for S3"); assert_eq!( parsed_remote_storage_config, @@ -1560,34 +1539,6 @@ broker_endpoint = '{broker_endpoint}' Ok(()) } - #[test] - fn parse_tenant_config() -> anyhow::Result<()> { - let tempdir = tempdir()?; - let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; - - let broker_endpoint = "http://127.0.0.1:7777"; - let trace_read_requests = true; - - let config_string = format!( - r#"{ALL_BASE_VALUES_TOML} -pg_distrib_dir='{pg_distrib_dir}' -broker_endpoint = '{broker_endpoint}' - -[tenant_config] -trace_read_requests = {trace_read_requests}"#, - ); - - let toml = config_string.parse()?; - - let conf = PageServerConf::parse_and_validate(&toml, &workdir)?; - assert_eq!( - conf.default_tenant_conf.trace_read_requests, trace_read_requests, - "Tenant config from pageserver config file should be parsed and udpated values used as defaults for all tenants", - ); - - Ok(()) - } - #[test] fn parse_incorrect_tenant_config() -> anyhow::Result<()> { let config_string = r#" @@ -1645,7 +1596,7 @@ threshold = "20m" "#, ); let toml: Document = pageserver_conf_toml.parse()?; - let conf = PageServerConf::parse_and_validate(&toml, &workdir)?; + let conf = PageServerConf::parse_and_validate(NodeId(333), &toml, &workdir)?; assert_eq!(conf.pg_distrib_dir, pg_distrib_dir); assert_eq!( @@ -1661,7 +1612,11 @@ threshold = "20m" .evictions_low_residence_duration_metric_threshold, Duration::from_secs(20 * 60) ); - assert_eq!(conf.id, NodeId(222)); + + // Assert that the node id provided by the indentity file (threaded + // through the call to [`PageServerConf::parse_and_validate`] is + // used. + assert_eq!(conf.id, NodeId(333)); assert_eq!( conf.disk_usage_based_eviction, Some(DiskUsageEvictionTaskConfig { @@ -1670,7 +1625,7 @@ threshold = "20m" period: Duration::from_secs(10), #[cfg(feature = "testing")] mock_statvfs: None, - eviction_order: crate::disk_usage_eviction_task::EvictionOrder::AbsoluteAccessed, + eviction_order: Default::default(), }) ); @@ -1706,7 +1661,7 @@ threshold = "20m" "#, ); let toml: Document = pageserver_conf_toml.parse().unwrap(); - let conf = PageServerConf::parse_and_validate(&toml, &workdir).unwrap(); + let conf = PageServerConf::parse_and_validate(NodeId(222), &toml, &workdir).unwrap(); match &conf.default_tenant_conf.eviction_policy { EvictionPolicy::OnlyImitiate(t) => { @@ -1725,7 +1680,7 @@ threshold = "20m" remote_storage = {} "#; let doc = toml_edit::Document::from_str(input).unwrap(); - let err = PageServerConf::parse_and_validate(&doc, &workdir) + let err = PageServerConf::parse_and_validate(NodeId(222), &doc, &workdir) .expect_err("empty remote_storage field should fail, don't specify it if you want no remote_storage"); assert!(format!("{err}").contains("remote_storage"), "{err}"); } diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 18c1a6cd9b..f94d945d46 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -1,5 +1,6 @@ //! Periodically collect consumption metrics for all active tenants //! and push them to a HTTP endpoint. +use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::size::CalculateSyntheticSizeError; @@ -39,56 +40,74 @@ type RawMetric = (MetricsKey, (EventType, u64)); /// for deduplication, but that is no longer needed. type Cache = HashMap; +pub async fn run( + conf: &'static PageServerConf, + tenant_manager: Arc, + cancel: CancellationToken, +) { + let Some(metric_collection_endpoint) = conf.metric_collection_endpoint.as_ref() else { + return; + }; + + let local_disk_storage = conf.workdir.join("last_consumption_metrics.json"); + + let metrics_ctx = RequestContext::todo_child( + TaskKind::MetricsCollection, + // This task itself shouldn't download anything. + // The actual size calculation does need downloads, and + // creates a child context with the right DownloadBehavior. + DownloadBehavior::Error, + ); + let collect_metrics = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( + "consumption metrics collection", + collect_metrics( + tenant_manager.clone(), + metric_collection_endpoint, + &conf.metric_collection_bucket, + conf.metric_collection_interval, + conf.id, + local_disk_storage, + cancel.clone(), + metrics_ctx, + ) + .instrument(info_span!("metrics_collection")), + )); + + let worker_ctx = + RequestContext::todo_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download); + let synthetic_size_worker = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( + "synthetic size calculation", + calculate_synthetic_size_worker( + tenant_manager.clone(), + conf.synthetic_size_calculation_interval, + cancel.clone(), + worker_ctx, + ) + .instrument(info_span!("synthetic_size_worker")), + )); + + let (collect_metrics, synthetic_size_worker) = + futures::future::join(collect_metrics, synthetic_size_worker).await; + collect_metrics + .expect("unreachable: exit_on_panic_or_error would catch the panic and exit the process"); + synthetic_size_worker + .expect("unreachable: exit_on_panic_or_error would catch the panic and exit the process"); +} + /// Main thread that serves metrics collection #[allow(clippy::too_many_arguments)] -pub async fn collect_metrics( +async fn collect_metrics( tenant_manager: Arc, metric_collection_endpoint: &Url, metric_collection_bucket: &Option, metric_collection_interval: Duration, - _cached_metric_collection_interval: Duration, - synthetic_size_calculation_interval: Duration, node_id: NodeId, local_disk_storage: Utf8PathBuf, cancel: CancellationToken, ctx: RequestContext, ) -> anyhow::Result<()> { - if _cached_metric_collection_interval != Duration::ZERO { - tracing::warn!( - "cached_metric_collection_interval is no longer used, please set it to zero." - ) - } - - // spin up background worker that caclulates tenant sizes - let worker_ctx = - ctx.detached_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download); - task_mgr::spawn( - BACKGROUND_RUNTIME.handle(), - TaskKind::CalculateSyntheticSize, - None, - None, - "synthetic size calculation", - false, - { - let tenant_manager = tenant_manager.clone(); - async move { - calculate_synthetic_size_worker( - tenant_manager, - synthetic_size_calculation_interval, - &cancel, - &worker_ctx, - ) - .instrument(info_span!("synthetic_size_worker")) - .await?; - Ok(()) - } - }, - ); - let path: Arc = Arc::new(local_disk_storage); - let cancel = task_mgr::shutdown_token(); - let restore_and_reschedule = restore_and_reschedule(&path, metric_collection_interval); let mut cached_metrics = tokio::select! { @@ -103,7 +122,7 @@ pub async fn collect_metrics( .expect("Failed to create http client with timeout"); let bucket_client = if let Some(bucket_config) = metric_collection_bucket { - match GenericRemoteStorage::from_config(bucket_config) { + match GenericRemoteStorage::from_config(bucket_config).await { Ok(client) => Some(client), Err(e) => { // Non-fatal error: if we were given an invalid config, we will proceed @@ -175,11 +194,9 @@ pub async fn collect_metrics( BackgroundLoopKind::ConsumptionMetricsCollectMetrics, ); - let res = tokio::time::timeout_at( - started_at + metric_collection_interval, - task_mgr::shutdown_token().cancelled(), - ) - .await; + let res = + tokio::time::timeout_at(started_at + metric_collection_interval, cancel.cancelled()) + .await; if res.is_ok() { return Ok(()); } @@ -279,8 +296,8 @@ async fn reschedule( async fn calculate_synthetic_size_worker( tenant_manager: Arc, synthetic_size_calculation_interval: Duration, - cancel: &CancellationToken, - ctx: &RequestContext, + cancel: CancellationToken, + ctx: RequestContext, ) -> anyhow::Result<()> { info!("starting calculate_synthetic_size_worker"); scopeguard::defer! { @@ -320,7 +337,7 @@ async fn calculate_synthetic_size_worker( // there is never any reason to exit calculate_synthetic_size_worker following any // return value -- we don't need to care about shutdown because no tenant is found when // pageserver is shut down. - calculate_and_log(&tenant, cancel, ctx).await; + calculate_and_log(&tenant, &cancel, &ctx).await; } crate::tenant::tasks::warn_when_period_overrun( diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index 86d0390c30..0b07e07524 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -59,6 +59,7 @@ //! 1. It should be easy to forward the context to callees. //! 2. To propagate more data from high-level to low-level code, the functions in //! the middle should not need to be modified. +//! //! The solution is to have a container structure ([`RequestContext`]) that //! carries the information. Functions that don't care about what's in it //! pass it along to callees. diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 3e48552ace..22f7d5b824 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -828,9 +828,9 @@ mod test { } } - fn setup(test_name: &str) -> anyhow::Result { + async fn setup(test_name: &str) -> anyhow::Result { let test_name = Box::leak(Box::new(format!("deletion_queue__{test_name}"))); - let harness = TenantHarness::create(test_name)?; + let harness = TenantHarness::create(test_name).await?; // We do not load() the harness: we only need its config and remote_storage @@ -844,7 +844,9 @@ mod test { }, timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }; - let storage = GenericRemoteStorage::from_config(&storage_config).unwrap(); + let storage = GenericRemoteStorage::from_config(&storage_config) + .await + .unwrap(); let mock_control_plane = MockControlPlane::new(); @@ -922,7 +924,9 @@ mod test { #[tokio::test] async fn deletion_queue_smoke() -> anyhow::Result<()> { // Basic test that the deletion queue processes the deletions we pass into it - let ctx = setup("deletion_queue_smoke").expect("Failed test setup"); + let ctx = setup("deletion_queue_smoke") + .await + .expect("Failed test setup"); let client = ctx.deletion_queue.new_client(); client.recover(HashMap::new())?; @@ -992,7 +996,9 @@ mod test { #[tokio::test] async fn deletion_queue_validation() -> anyhow::Result<()> { - let ctx = setup("deletion_queue_validation").expect("Failed test setup"); + let ctx = setup("deletion_queue_validation") + .await + .expect("Failed test setup"); let client = ctx.deletion_queue.new_client(); client.recover(HashMap::new())?; @@ -1051,7 +1057,9 @@ mod test { #[tokio::test] async fn deletion_queue_recovery() -> anyhow::Result<()> { // Basic test that the deletion queue processes the deletions we pass into it - let mut ctx = setup("deletion_queue_recovery").expect("Failed test setup"); + let mut ctx = setup("deletion_queue_recovery") + .await + .expect("Failed test setup"); let client = ctx.deletion_queue.new_client(); client.recover(HashMap::new())?; diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 90bd4294bb..92dcf6ee61 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -59,13 +59,14 @@ use utils::{completion, id::TimelineId}; use crate::{ config::PageServerConf, metrics::disk_usage_based_eviction::METRICS, - task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, + task_mgr::{self, BACKGROUND_RUNTIME}, tenant::{ mgr::TenantManager, remote_timeline_client::LayerFileMetadata, secondary::SecondaryTenant, storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName}, }, + CancellableTask, DiskUsageEvictionTask, }; #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -83,17 +84,9 @@ pub struct DiskUsageEvictionTaskConfig { /// Selects the sort order for eviction candidates *after* per tenant `min_resident_size` /// partitioning. -#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] #[serde(tag = "type", content = "args")] pub enum EvictionOrder { - /// Order the layers to be evicted by how recently they have been accessed in absolute - /// time. - /// - /// This strategy is unfair when some tenants grow faster than others towards the slower - /// growing. - #[default] - AbsoluteAccessed, - /// Order the layers to be evicted by how recently they have been accessed relatively within /// the set of resident layers of a tenant. RelativeAccessed { @@ -108,6 +101,14 @@ pub enum EvictionOrder { }, } +impl Default for EvictionOrder { + fn default() -> Self { + Self::RelativeAccessed { + highest_layer_count_loses_first: true, + } + } +} + fn default_highest_layer_count_loses_first() -> bool { true } @@ -117,11 +118,6 @@ impl EvictionOrder { use EvictionOrder::*; match self { - AbsoluteAccessed => { - candidates.sort_unstable_by_key(|(partition, candidate)| { - (*partition, candidate.last_activity_ts) - }); - } RelativeAccessed { .. } => candidates.sort_unstable_by_key(|(partition, candidate)| { (*partition, candidate.relative_last_activity) }), @@ -134,7 +130,6 @@ impl EvictionOrder { use EvictionOrder::*; match self { - AbsoluteAccessed => finite_f32::FiniteF32::ZERO, RelativeAccessed { highest_layer_count_loses_first, } => { @@ -192,36 +187,34 @@ pub fn launch_disk_usage_global_eviction_task( state: Arc, tenant_manager: Arc, background_jobs_barrier: completion::Barrier, -) -> anyhow::Result<()> { +) -> Option { let Some(task_config) = &conf.disk_usage_based_eviction else { info!("disk usage based eviction task not configured"); - return Ok(()); + return None; }; info!("launching disk usage based eviction task"); - task_mgr::spawn( - BACKGROUND_RUNTIME.handle(), - TaskKind::DiskUsageEviction, - None, - None, + let cancel = CancellationToken::new(); + let task = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( "disk usage based eviction", - false, - async move { - let cancel = task_mgr::shutdown_token(); + { + let cancel = cancel.clone(); + async move { + // wait until initial load is complete, because we cannot evict from loading tenants. + tokio::select! { + _ = cancel.cancelled() => { return anyhow::Ok(()); }, + _ = background_jobs_barrier.wait() => { } + }; - // wait until initial load is complete, because we cannot evict from loading tenants. - tokio::select! { - _ = cancel.cancelled() => { return Ok(()); }, - _ = background_jobs_barrier.wait() => { } - }; - - disk_usage_eviction_task(&state, task_config, &storage, tenant_manager, cancel).await; - Ok(()) + disk_usage_eviction_task(&state, task_config, &storage, tenant_manager, cancel) + .await; + anyhow::Ok(()) + } }, - ); + )); - Ok(()) + Some(DiskUsageEvictionTask(CancellableTask { cancel, task })) } #[instrument(skip_all)] diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 5ba329f05e..59e646d0ca 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -377,7 +377,7 @@ paths: schema: $ref: "#/components/schemas/ConflictError" - /v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive: + /v1/tenant/{tenant_id}/timeline/{timeline_id}/preserve_initdb_archive: parameters: - name: tenant_id in: path @@ -397,6 +397,51 @@ paths: "202": description: Tenant scheduled to load successfully + /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/archival_config: + parameters: + - name: tenant_shard_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + put: + description: | + Either archives or unarchives the given timeline. + An archived timeline may not have any non-archived children. + requestBody: + required: true + content: + application/json: + schema: + $ref: "#/components/schemas/ArchivalConfigRequest" + responses: + "200": + description: Timeline (un)archived successfully + "409": + description: | + The tenant/timeline is already being modified, perhaps by a concurrent call to this API + content: + application/json: + schema: + $ref: "#/components/schemas/ConflictError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "503": + description: Temporarily unavailable, please retry. + content: + application/json: + schema: + $ref: "#/components/schemas/ServiceUnavailableError" + /v1/tenant/{tenant_id}/synthetic_size: parameters: - name: tenant_id @@ -429,7 +474,9 @@ paths: schema: $ref: "#/components/schemas/SyntheticSizeResponse" text/html: - description: SVG representation of the tenant and it's timelines. + schema: + type: string + description: SVG representation of the tenant and its timelines. "401": description: Unauthorized Error content: @@ -568,7 +615,7 @@ paths: type: string - name: timeline_id in: path - Å•equired: true + required: true schema: type: string @@ -774,15 +821,13 @@ components: TenantCreateRequest: allOf: - $ref: '#/components/schemas/TenantConfig' + - $ref: '#/components/schemas/TenantLoadRequest' - type: object required: - new_tenant_id properties: new_tenant_id: type: string - generation: - type: integer - description: Attachment generation number. TenantLoadRequest: type: object properties: @@ -846,6 +891,15 @@ components: warm: type: boolean description: Whether to poll remote storage for layers to download. If false, secondary locations don't download anything. + ArchivalConfigRequest: + type: object + required + - state + properties: + state: + description: The archival state of a timeline + type: string + enum: ["Archived", "Unarchived"] TenantConfig: type: object properties: @@ -873,8 +927,6 @@ components: type: string max_lsn_wal_lag: type: integer - trace_read_requests: - type: boolean heatmap_period: type: string TenantConfigResponse: @@ -1108,7 +1160,7 @@ components: reparented_timelines: type: array description: Set of reparented timeline ids - properties: + items: type: string format: hex description: TimelineId diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 6f8f3e6389..d63c240365 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -18,14 +18,17 @@ use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; use pageserver_api::models::AuxFilePolicy; +use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest; use pageserver_api::models::IngestAuxFilesRequest; use pageserver_api::models::ListAuxFilesRequest; use pageserver_api::models::LocationConfig; use pageserver_api::models::LocationConfigListResponse; +use pageserver_api::models::LocationConfigMode; use pageserver_api::models::LsnLease; use pageserver_api::models::LsnLeaseRequest; use pageserver_api::models::ShardParameters; use pageserver_api::models::TenantDetails; +use pageserver_api::models::TenantLocationConfigRequest; use pageserver_api::models::TenantLocationConfigResponse; use pageserver_api::models::TenantScanRemoteStorageResponse; use pageserver_api::models::TenantScanRemoteStorageShard; @@ -33,12 +36,10 @@ use pageserver_api::models::TenantShardLocation; use pageserver_api::models::TenantShardSplitRequest; use pageserver_api::models::TenantShardSplitResponse; use pageserver_api::models::TenantSorting; +use pageserver_api::models::TimelineArchivalConfigRequest; use pageserver_api::models::TopTenantShardItem; use pageserver_api::models::TopTenantShardsRequest; use pageserver_api::models::TopTenantShardsResponse; -use pageserver_api::models::{ - DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantLocationConfigRequest, -}; use pageserver_api::shard::ShardCount; use pageserver_api::shard::TenantShardId; use remote_storage::DownloadError; @@ -664,6 +665,39 @@ async fn timeline_preserve_initdb_handler( json_response(StatusCode::OK, ()) } +async fn timeline_archival_config_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + + let request_data: TimelineArchivalConfigRequest = json_request(&mut request).await?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); + + async { + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + tenant + .apply_timeline_archival_config(timeline_id, request_data.state) + .await + .context("applying archival config") + .map_err(ApiError::InternalServerError)?; + Ok::<_, ApiError>(()) + } + .instrument(info_span!("timeline_archival_config", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug(), + state = ?request_data.state, + %timeline_id)) + .await?; + + json_response(StatusCode::OK, ()) +} + async fn timeline_detail_handler( request: Request, _cancel: CancellationToken, @@ -1642,6 +1676,10 @@ async fn timeline_checkpoint_handler( if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? { flags |= CompactFlags::ForceImageLayerCreation; } + + // By default, checkpoints come with a compaction, but this may be optionally disabled by tests that just want to flush + upload. + let compact = parse_query_param::<_, bool>(&request, "compact")?.unwrap_or(true); + let wait_until_uploaded = parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false); @@ -1658,15 +1696,17 @@ async fn timeline_checkpoint_handler( } })?; - timeline - .compact(&cancel, flags, &ctx) - .await - .map_err(|e| - match e { - CompactionError::ShuttingDown => ApiError::ShuttingDown, - CompactionError::Other(e) => ApiError::InternalServerError(e) - } - )?; + if compact { + timeline + .compact(&cancel, flags, &ctx) + .await + .map_err(|e| + match e { + CompactionError::ShuttingDown => ApiError::ShuttingDown, + CompactionError::Other(e) => ApiError::InternalServerError(e) + } + )?; + } if wait_until_uploaded { timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?; @@ -1721,7 +1761,9 @@ async fn timeline_detach_ancestor_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - use crate::tenant::timeline::detach_ancestor::Options; + use crate::tenant::timeline::detach_ancestor; + use pageserver_api::models::detach_ancestor::AncestorDetached; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -1729,7 +1771,7 @@ async fn timeline_detach_ancestor_handler( let span = tracing::info_span!("detach_ancestor", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id); async move { - let mut options = Options::default(); + let mut options = detach_ancestor::Options::default(); let rewrite_concurrency = parse_query_param::<_, std::num::NonZeroUsize>(&request, "rewrite_concurrency")?; @@ -1757,27 +1799,36 @@ async fn timeline_detach_ancestor_handler( let timeline = tenant.get_timeline(timeline_id, true)?; - let (_guard, prepared) = timeline + let progress = timeline .prepare_to_detach_from_ancestor(&tenant, options, ctx) .await?; - let res = state - .tenant_manager - .complete_detaching_timeline_ancestor(tenant_shard_id, timeline_id, prepared, ctx) - .await; + // uncomment to allow early as possible Tenant::drop + // drop(tenant); - match res { - Ok(reparented_timelines) => { - let resp = pageserver_api::models::detach_ancestor::AncestorDetached { + let resp = match progress { + detach_ancestor::Progress::Prepared(_guard, prepared) => { + // it would be great to tag the guard on to the tenant activation future + let reparented_timelines = state + .tenant_manager + .complete_detaching_timeline_ancestor( + tenant_shard_id, + timeline_id, + prepared, + ctx, + ) + .await + .context("timeline detach ancestor completion") + .map_err(ApiError::InternalServerError)?; + + AncestorDetached { reparented_timelines, - }; - - json_response(StatusCode::OK, resp) + } } - Err(e) => Err(ApiError::InternalServerError( - e.context("timeline detach completion"), - )), - } + detach_ancestor::Progress::Done(resp) => resp, + }; + + json_response(StatusCode::OK, resp) } .instrument(span) .await @@ -2778,6 +2829,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive", |r| api_handler(r, timeline_preserve_initdb_handler), ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/archival_config", + |r| api_handler(r, timeline_archival_config_handler), + ) .get("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| { api_handler(r, timeline_detail_handler) }) diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index ac6b9b4f2a..d944019641 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -13,6 +13,7 @@ pub mod http; pub mod import_datadir; pub mod l0_flush; pub use pageserver_api::keyspace; +use tokio_util::sync::CancellationToken; pub mod aux_file; pub mod metrics; pub mod page_cache; @@ -23,7 +24,6 @@ pub mod span; pub(crate) mod statvfs; pub mod task_mgr; pub mod tenant; -pub mod trace; pub mod utilization; pub mod virtual_file; pub mod walingest; @@ -33,7 +33,10 @@ pub mod walredo; use crate::task_mgr::TaskKind; use camino::Utf8Path; use deletion_queue::DeletionQueue; -use tenant::mgr::TenantManager; +use tenant::{ + mgr::{BackgroundPurges, TenantManager}, + secondary, +}; use tracing::info; /// Current storage format version @@ -55,17 +58,39 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]); pub use crate::metrics::preinitialize_metrics; +pub struct CancellableTask { + pub task: tokio::task::JoinHandle<()>, + pub cancel: CancellationToken, +} +pub struct HttpEndpointListener(pub CancellableTask); +pub struct LibpqEndpointListener(pub CancellableTask); +pub struct ConsumptionMetricsTasks(pub CancellableTask); +pub struct DiskUsageEvictionTask(pub CancellableTask); +impl CancellableTask { + pub async fn shutdown(self) { + self.cancel.cancel(); + self.task.await.unwrap(); + } +} + #[tracing::instrument(skip_all, fields(%exit_code))] +#[allow(clippy::too_many_arguments)] pub async fn shutdown_pageserver( + http_listener: HttpEndpointListener, + libpq_listener: LibpqEndpointListener, + consumption_metrics_worker: ConsumptionMetricsTasks, + disk_usage_eviction_task: Option, tenant_manager: &TenantManager, + background_purges: BackgroundPurges, mut deletion_queue: DeletionQueue, + secondary_controller_tasks: secondary::GlobalTasks, exit_code: i32, ) { use std::time::Duration; // Shut down the libpq endpoint task. This prevents new connections from // being accepted. timed( - task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None), + libpq_listener.0.shutdown(), "shutdown LibpqEndpointListener", Duration::from_secs(1), ) @@ -92,16 +117,44 @@ pub async fn shutdown_pageserver( // Best effort to persist any outstanding deletions, to avoid leaking objects deletion_queue.shutdown(Duration::from_secs(5)).await; + timed( + consumption_metrics_worker.0.shutdown(), + "shutdown consumption metrics", + Duration::from_secs(1), + ) + .await; + + timed( + futures::future::OptionFuture::from(disk_usage_eviction_task.map(|t| t.0.shutdown())), + "shutdown disk usage eviction", + Duration::from_secs(1), + ) + .await; + + timed( + background_purges.shutdown(), + "shutdown background purges", + Duration::from_secs(1), + ) + .await; + // Shut down the HTTP endpoint last, so that you can still check the server's // status while it's shutting down. // FIXME: We should probably stop accepting commands like attach/detach earlier. timed( - task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None), + http_listener.0.shutdown(), "shutdown http", Duration::from_secs(1), ) .await; + timed( + secondary_controller_tasks.wait(), // cancellation happened in caller + "secondary controller wait", + Duration::from_secs(1), + ) + .await; + // There should be nothing left, but let's be sure timed( task_mgr::shutdown_tasks(None, None, None), diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index e67fa656d0..c03567f6ef 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -473,6 +473,31 @@ static PITR_HISTORY_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +#[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)] +#[strum(serialize_all = "kebab_case")] +pub(crate) enum MetricLayerKind { + Delta, + Image, +} + +static TIMELINE_LAYER_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_layer_bytes", + "Sum of layer physical sizes in bytes", + &["tenant_id", "shard_id", "timeline_id", "kind"] + ) + .expect("failed to define a metric") +}); + +static TIMELINE_LAYER_COUNT: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_layer_count", + "Number of layers that exist", + &["tenant_id", "shard_id", "timeline_id", "kind"] + ) + .expect("failed to define a metric") +}); + static TIMELINE_ARCHIVE_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_archive_size", @@ -569,6 +594,38 @@ static VALID_LSN_LEASE_COUNT: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +pub(crate) static CIRCUIT_BREAKERS_BROKEN: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_circuit_breaker_broken", + "How many times a circuit breaker has broken" + ) + .expect("failed to define a metric") +}); + +pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_circuit_breaker_unbroken", + "How many times a circuit breaker has been un-broken (recovered)" + ) + .expect("failed to define a metric") +}); + +pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_compression_image_in_bytes_total", + "Size of uncompressed data written into image layers" + ) + .expect("failed to define a metric") +}); + +pub(crate) static COMPRESSION_IMAGE_OUTPUT_BYTES: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_compression_image_out_bytes_total", + "Size of compressed image layer written" + ) + .expect("failed to define a metric") +}); + pub(crate) mod initial_logical_size { use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; use once_cell::sync::Lazy; @@ -1474,7 +1531,6 @@ pub(crate) enum ComputeCommandKind { Basebackup, Fullbackup, LeaseLsn, - Show, } pub(crate) struct ComputeCommandCounters { @@ -2126,6 +2182,10 @@ pub(crate) struct TimelineMetrics { pub last_record_gauge: IntGauge, pub pitr_history_size: UIntGauge, pub archival_size: UIntGauge, + pub(crate) layer_size_image: UIntGauge, + pub(crate) layer_count_image: UIntGauge, + pub(crate) layer_size_delta: UIntGauge, + pub(crate) layer_count_delta: UIntGauge, pub standby_horizon_gauge: IntGauge, pub resident_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size @@ -2208,6 +2268,42 @@ impl TimelineMetrics { .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + let layer_size_image = TIMELINE_LAYER_SIZE + .get_metric_with_label_values(&[ + &tenant_id, + &shard_id, + &timeline_id, + MetricLayerKind::Image.into(), + ]) + .unwrap(); + + let layer_count_image = TIMELINE_LAYER_COUNT + .get_metric_with_label_values(&[ + &tenant_id, + &shard_id, + &timeline_id, + MetricLayerKind::Image.into(), + ]) + .unwrap(); + + let layer_size_delta = TIMELINE_LAYER_SIZE + .get_metric_with_label_values(&[ + &tenant_id, + &shard_id, + &timeline_id, + MetricLayerKind::Delta.into(), + ]) + .unwrap(); + + let layer_count_delta = TIMELINE_LAYER_COUNT + .get_metric_with_label_values(&[ + &tenant_id, + &shard_id, + &timeline_id, + MetricLayerKind::Delta.into(), + ]) + .unwrap(); + let standby_horizon_gauge = STANDBY_HORIZON .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); @@ -2262,6 +2358,10 @@ impl TimelineMetrics { last_record_gauge, pitr_history_size, archival_size, + layer_size_image, + layer_count_image, + layer_size_delta, + layer_count_delta, standby_horizon_gauge, resident_physical_size_gauge, current_logical_size_gauge, @@ -2323,6 +2423,31 @@ impl TimelineMetrics { let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + MetricLayerKind::Image.into(), + ]); + let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + MetricLayerKind::Image.into(), + ]); + let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + MetricLayerKind::Delta.into(), + ]); + let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + MetricLayerKind::Delta.into(), + ]); + let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]); diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index c10c2f2a0f..6353f713e0 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -36,7 +36,6 @@ use tokio::io::AsyncWriteExt; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::id::ConnectionId; use utils::sync::gate::GateGuard; use utils::{ auth::{Claims, Scope, SwappableJwtAuth}, @@ -66,7 +65,6 @@ use crate::tenant::GetTimelineError; use crate::tenant::PageReconstructError; use crate::tenant::Tenant; use crate::tenant::Timeline; -use crate::trace::Tracer; use pageserver_api::key::rel_block_to_key; use pageserver_api::reltag::SlruKind; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; @@ -126,7 +124,6 @@ pub async fn libpq_listener_main( None, None, "serving compute connection task", - false, page_service_conn_main( tenant_manager.clone(), local_auth, @@ -430,18 +427,6 @@ impl PageServerHandler { .get_active_tenant_with_timeout(tenant_id, ShardSelector::First, ACTIVE_TENANT_TIMEOUT) .await?; - // Make request tracer if needed - let mut tracer = if tenant.get_trace_read_requests() { - let connection_id = ConnectionId::generate(); - let path = - tenant - .conf - .trace_path(&tenant.tenant_shard_id(), &timeline_id, &connection_id); - Some(Tracer::new(path)) - } else { - None - }; - // switch client to COPYBOTH pgb.write_message_noflush(&BeMessage::CopyBothResponse)?; self.flush_cancellable(pgb, &tenant.cancel).await?; @@ -473,11 +458,6 @@ impl PageServerHandler { trace!("query: {copy_data_bytes:?}"); fail::fail_point!("ps::handle-pagerequest-message"); - // Trace request if needed - if let Some(t) = tracer.as_mut() { - t.trace(©_data_bytes) - } - let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?; @@ -1498,66 +1478,6 @@ where ))? } }; - } else if let Some(params) = parts.strip_prefix(&["show"]) { - // show - if params.len() != 1 { - return Err(QueryError::Other(anyhow::anyhow!( - "invalid param number for config command" - ))); - } - let tenant_id = TenantId::from_str(params[0]) - .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; - - tracing::Span::current().record("tenant_id", field::display(tenant_id)); - - self.check_permission(Some(tenant_id))?; - - COMPUTE_COMMANDS_COUNTERS - .for_command(ComputeCommandKind::Show) - .inc(); - - let tenant = self - .get_active_tenant_with_timeout( - tenant_id, - ShardSelector::Zero, - ACTIVE_TENANT_TIMEOUT, - ) - .await?; - pgb.write_message_noflush(&BeMessage::RowDescription(&[ - RowDescriptor::int8_col(b"checkpoint_distance"), - RowDescriptor::int8_col(b"checkpoint_timeout"), - RowDescriptor::int8_col(b"compaction_target_size"), - RowDescriptor::int8_col(b"compaction_period"), - RowDescriptor::int8_col(b"compaction_threshold"), - RowDescriptor::int8_col(b"gc_horizon"), - RowDescriptor::int8_col(b"gc_period"), - RowDescriptor::int8_col(b"image_creation_threshold"), - RowDescriptor::int8_col(b"pitr_interval"), - ]))? - .write_message_noflush(&BeMessage::DataRow(&[ - Some(tenant.get_checkpoint_distance().to_string().as_bytes()), - Some( - tenant - .get_checkpoint_timeout() - .as_secs() - .to_string() - .as_bytes(), - ), - Some(tenant.get_compaction_target_size().to_string().as_bytes()), - Some( - tenant - .get_compaction_period() - .as_secs() - .to_string() - .as_bytes(), - ), - Some(tenant.get_compaction_threshold().to_string().as_bytes()), - Some(tenant.get_gc_horizon().to_string().as_bytes()), - Some(tenant.get_gc_period().as_secs().to_string().as_bytes()), - Some(tenant.get_image_creation_threshold().to_string().as_bytes()), - Some(tenant.get_pitr_interval().as_secs().to_string().as_bytes()), - ]))? - .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; } else { return Err(QueryError::Other(anyhow::anyhow!( "unknown command {query_string}" diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 8a6cfea92b..85f3a6e0fb 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -284,6 +284,16 @@ impl Timeline { if let Some(_nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) { return Ok(true); } + // then check if the database was already initialized. + // get_rel_exists can be called before dbdir is created. + let buf = version.get(self, DBDIR_KEY, ctx).await?; + let dbdirs = match DbDirectory::des(&buf).context("deserialization failure") { + Ok(dir) => Ok(dir.dbdirs), + Err(e) => Err(PageReconstructError::from(e)), + }?; + if !dbdirs.contains_key(&(tag.spcnode, tag.dbnode)) { + return Ok(false); + } // fetch directory listing let key = rel_dir_to_key(tag.spcnode, tag.dbnode); let buf = version.get(self, key, ctx).await?; @@ -522,7 +532,7 @@ impl Timeline { ctx: &RequestContext, ) -> Result, PageReconstructError> { let mut max: Option = None; - self.map_all_timestamps(probe_lsn, ctx, |timestamp| { + self.map_all_timestamps::<()>(probe_lsn, ctx, |timestamp| { if let Some(max_prev) = max { max = Some(max_prev.max(timestamp)); } else { @@ -2031,7 +2041,7 @@ mod tests { #[tokio::test] async fn aux_files_round_trip() -> anyhow::Result<()> { let name = "aux_files_round_trip"; - let harness = TenantHarness::create(name)?; + let harness = TenantHarness::create(name).await?; pub const TIMELINE_ID: TimelineId = TimelineId::from_array(hex!("11223344556677881122334455667788")); diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 5f46ce3d69..5cd78874c1 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -408,7 +408,6 @@ pub fn spawn( tenant_shard_id: Option, timeline_id: Option, name: &str, - shutdown_process_on_error: bool, future: F, ) -> PageserverTaskId where @@ -437,7 +436,6 @@ where task_id, task_cloned, cancel, - shutdown_process_on_error, future, )); task_mut.join_handle = Some(join_handle); @@ -454,82 +452,78 @@ async fn task_wrapper( task_id: u64, task: Arc, shutdown_token: CancellationToken, - shutdown_process_on_error: bool, future: F, ) where F: Future> + Send + 'static, { debug!("Starting task '{}'", task_name); - let result = SHUTDOWN_TOKEN - .scope( - shutdown_token, - CURRENT_TASK.scope(task, { - // We use AssertUnwindSafe here so that the payload function - // doesn't need to be UnwindSafe. We don't do anything after the - // unwinding that would expose us to unwind-unsafe behavior. - AssertUnwindSafe(future).catch_unwind() - }), - ) - .await; - task_finish(result, task_name, task_id, shutdown_process_on_error).await; -} - -async fn task_finish( - result: std::result::Result< - anyhow::Result<()>, - std::boxed::Box, - >, - task_name: String, - task_id: u64, - shutdown_process_on_error: bool, -) { - // Remove our entry from the global hashmap. - let task = TASKS - .lock() - .unwrap() - .remove(&task_id) - .expect("no task in registry"); - - let mut shutdown_process = false; - { + // wrap the future so we log panics and errors + let tenant_shard_id = task.tenant_shard_id; + let timeline_id = task.timeline_id; + let fut = async move { + // We use AssertUnwindSafe here so that the payload function + // doesn't need to be UnwindSafe. We don't do anything after the + // unwinding that would expose us to unwind-unsafe behavior. + let result = AssertUnwindSafe(future).catch_unwind().await; match result { Ok(Ok(())) => { debug!("Task '{}' exited normally", task_name); } Ok(Err(err)) => { - if shutdown_process_on_error { - error!( - "Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}", - task_name, task.tenant_shard_id, task.timeline_id, err - ); - shutdown_process = true; - } else { - error!( - "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}", - task_name, task.tenant_shard_id, task.timeline_id, err - ); - } + error!( + "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}", + task_name, tenant_shard_id, timeline_id, err + ); } Err(err) => { - if shutdown_process_on_error { - error!( - "Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}", - task_name, task.tenant_shard_id, task.timeline_id, err - ); - shutdown_process = true; - } else { - error!( - "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}", - task_name, task.tenant_shard_id, task.timeline_id, err - ); - } + error!( + "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}", + task_name, tenant_shard_id, timeline_id, err + ); } } - } + }; - if shutdown_process { - std::process::exit(1); + // add the task-locals + let fut = CURRENT_TASK.scope(task, fut); + let fut = SHUTDOWN_TOKEN.scope(shutdown_token, fut); + + // poll future to completion + fut.await; + + // Remove our entry from the global hashmap. + TASKS + .lock() + .unwrap() + .remove(&task_id) + .expect("no task in registry"); +} + +pub async fn exit_on_panic_or_error( + task_name: &'static str, + future: impl Future>, +) -> T +where + E: std::fmt::Debug, +{ + // We use AssertUnwindSafe here so that the payload function + // doesn't need to be UnwindSafe. We don't do anything after the + // unwinding that would expose us to unwind-unsafe behavior. + let result = AssertUnwindSafe(future).catch_unwind().await; + match result { + Ok(Ok(val)) => val, + Ok(Err(err)) => { + error!( + task_name, + "Task exited with error, exiting process: {err:?}" + ); + std::process::exit(1); + } + Err(panic_obj) => { + error!(task_name, "Task panicked, exiting process: {panic_obj:?}"); + std::process::exit(1); + } } } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index eef8dc104c..f83c7021e3 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -21,6 +21,7 @@ use futures::FutureExt; use futures::StreamExt; use pageserver_api::models; use pageserver_api::models::AuxFilePolicy; +use pageserver_api::models::TimelineArchivalState; use pageserver_api::models::TimelineState; use pageserver_api::models::TopTenantShardItem; use pageserver_api::models::WalRedoManagerStatus; @@ -30,6 +31,7 @@ use pageserver_api::shard::TenantShardId; use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; use remote_storage::TimeoutOrCancel; +use std::collections::BTreeMap; use std::fmt; use std::time::SystemTime; use storage_broker::BrokerClientChannel; @@ -39,6 +41,7 @@ use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::*; use utils::backoff; +use utils::circuit_breaker::CircuitBreaker; use utils::completion; use utils::crashsafe::path_with_suffix_extension; use utils::failpoint_support; @@ -76,7 +79,8 @@ use crate::is_uninit_mark; use crate::l0_flush::L0FlushGlobalState; use crate::metrics::TENANT; use crate::metrics::{ - remove_tenant_metrics, BROKEN_TENANTS_SET, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, + remove_tenant_metrics, BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, + TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, }; use crate::repository::GcResult; use crate::task_mgr; @@ -92,14 +96,12 @@ use crate::tenant::storage_layer::ImageLayer; use crate::walredo; use crate::InitializationOrder; use std::collections::hash_map::Entry; -use std::collections::BTreeSet; use std::collections::HashMap; use std::collections::HashSet; use std::fmt::Debug; use std::fmt::Display; use std::fs; use std::fs::File; -use std::ops::Bound::Included; use std::sync::atomic::AtomicU64; use std::sync::atomic::Ordering; use std::sync::Arc; @@ -276,6 +278,10 @@ pub struct Tenant { eviction_task_tenant_state: tokio::sync::Mutex, + /// Track repeated failures to compact, so that we can back off. + /// Overhead of mutex is acceptable because compaction is done with a multi-second period. + compaction_circuit_breaker: std::sync::Mutex, + /// If the tenant is in Activating state, notify this to encourage it /// to proceed to Active as soon as possible, rather than waiting for lazy /// background warmup. @@ -714,7 +720,6 @@ impl Tenant { Some(tenant_shard_id), None, "attach tenant", - false, async move { info!( @@ -1222,6 +1227,14 @@ impl Tenant { Ok(timeline_preloads) } + pub async fn apply_timeline_archival_config( + &self, + _timeline_id: TimelineId, + _config: TimelineArchivalState, + ) -> anyhow::Result<()> { + Ok(()) + } + pub(crate) fn tenant_shard_id(&self) -> TenantShardId { self.tenant_shard_id } @@ -1641,13 +1654,31 @@ impl Tenant { timelines_to_compact }; + // Before doing any I/O work, check our circuit breaker + if self.compaction_circuit_breaker.lock().unwrap().is_broken() { + info!("Skipping compaction due to previous failures"); + return Ok(()); + } + for (timeline_id, timeline) in &timelines_to_compact { timeline .compact(cancel, EnumSet::empty(), ctx) .instrument(info_span!("compact_timeline", %timeline_id)) - .await?; + .await + .map_err(|e| { + self.compaction_circuit_breaker + .lock() + .unwrap() + .fail(&CIRCUIT_BREAKERS_BROKEN, &e); + e + })?; } + self.compaction_circuit_breaker + .lock() + .unwrap() + .success(&CIRCUIT_BREAKERS_UNBROKEN); + Ok(()) } @@ -1733,6 +1764,9 @@ impl Tenant { .values() .filter(|timeline| !(timeline.is_broken() || timeline.is_stopping())); + // Before activation, populate each Timeline's GcInfo with information about its children + self.initialize_gc_info(&timelines_accessor); + // Spawn gc and compaction loops. The loops will shut themselves // down when they notice that the tenant is inactive. tasks::start_background_loops(self, background_jobs_can_start); @@ -2341,13 +2375,6 @@ impl Tenant { .unwrap_or(self.conf.default_tenant_conf.pitr_interval) } - pub fn get_trace_read_requests(&self) -> bool { - let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); - tenant_conf - .trace_read_requests - .unwrap_or(self.conf.default_tenant_conf.trace_read_requests) - } - pub fn get_min_resident_size_override(&self) -> Option { let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf @@ -2570,6 +2597,14 @@ impl Tenant { cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()), cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)), eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()), + compaction_circuit_breaker: std::sync::Mutex::new(CircuitBreaker::new( + format!("compaction-{tenant_shard_id}"), + 5, + // Compaction can be a very expensive operation, and might leak disk space. It also ought + // to be infallible, as long as remote storage is available. So if it repeatedly fails, + // use an extremely long backoff. + Some(Duration::from_secs(3600 * 24)), + )), activate_now_sem: tokio::sync::Semaphore::new(0), cancel: CancellationToken::default(), gate: Gate::default(), @@ -2765,6 +2800,55 @@ impl Tenant { .await } + /// Populate all Timelines' `GcInfo` with information about their children. We do not set the + /// PITR cutoffs here, because that requires I/O: this is done later, before GC, by [`Self::refresh_gc_info_internal`] + /// + /// Subsequently, parent-child relationships are updated incrementally during timeline creation/deletion. + fn initialize_gc_info( + &self, + timelines: &std::sync::MutexGuard>>, + ) { + // This function must be called before activation: after activation timeline create/delete operations + // might happen, and this function is not safe to run concurrently with those. + assert!(!self.is_active()); + + // Scan all timelines. For each timeline, remember the timeline ID and + // the branch point where it was created. + let mut all_branchpoints: BTreeMap> = BTreeMap::new(); + timelines.iter().for_each(|(timeline_id, timeline_entry)| { + if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() { + let ancestor_children = all_branchpoints.entry(*ancestor_timeline_id).or_default(); + ancestor_children.push((timeline_entry.get_ancestor_lsn(), *timeline_id)); + } + }); + + // The number of bytes we always keep, irrespective of PITR: this is a constant across timelines + let horizon = self.get_gc_horizon(); + + // Populate each timeline's GcInfo with information about its child branches + for timeline in timelines.values() { + let mut branchpoints: Vec<(Lsn, TimelineId)> = all_branchpoints + .remove(&timeline.timeline_id) + .unwrap_or_default(); + + branchpoints.sort_by_key(|b| b.0); + + let mut target = timeline.gc_info.write().unwrap(); + + target.retain_lsns = branchpoints; + + let space_cutoff = timeline + .get_last_record_lsn() + .checked_sub(horizon) + .unwrap_or(Lsn(0)); + + target.cutoffs = GcCutoffs { + space: space_cutoff, + time: Lsn::INVALID, + }; + } + } + async fn refresh_gc_info_internal( &self, target_timeline_id: Option, @@ -2787,6 +2871,11 @@ impl Tenant { .cloned() .collect::>(); + if target_timeline_id.is_some() && timelines.is_empty() { + // We were to act on a particular timeline and it wasn't found + return Err(GcError::TimelineNotFound); + } + let mut gc_cutoffs: HashMap = HashMap::with_capacity(timelines.len()); @@ -2809,68 +2898,63 @@ impl Tenant { // because that will stall branch creation. let gc_cs = self.gc_cs.lock().await; - // Scan all timelines. For each timeline, remember the timeline ID and - // the branch point where it was created. - let (all_branchpoints, timelines): (BTreeSet<(TimelineId, Lsn)>, _) = { - let timelines = self.timelines.lock().unwrap(); - let mut all_branchpoints = BTreeSet::new(); - let timelines = { - if let Some(target_timeline_id) = target_timeline_id.as_ref() { - if timelines.get(target_timeline_id).is_none() { - return Err(GcError::TimelineNotFound); + // Paranoia check: it is critical that GcInfo's list of child timelines is correct, to avoid incorrectly GC'ing data they + // depend on. So although GcInfo is updated continuously by Timeline::new and Timeline::drop, we also calculate it here + // and fail out if it's inaccurate. + // (this can be removed later, it's a risk mitigation for https://github.com/neondatabase/neon/pull/8427) + { + let mut all_branchpoints: BTreeMap> = + BTreeMap::new(); + timelines.iter().for_each(|timeline| { + if let Some(ancestor_timeline_id) = &timeline.get_ancestor_timeline_id() { + let ancestor_children = + all_branchpoints.entry(*ancestor_timeline_id).or_default(); + ancestor_children.push((timeline.get_ancestor_lsn(), timeline.timeline_id)); + } + }); + + for timeline in &timelines { + let mut branchpoints: Vec<(Lsn, TimelineId)> = all_branchpoints + .remove(&timeline.timeline_id) + .unwrap_or_default(); + + branchpoints.sort_by_key(|b| b.0); + + let target = timeline.gc_info.read().unwrap(); + + // We require that retain_lsns contains everything in `branchpoints`, but not that + // they are exactly equal: timeline deletions can race with us, so retain_lsns + // may contain some extra stuff. It is safe to have extra timelines in there, because it + // just means that we retain slightly more data than we otherwise might. + let have_branchpoints = target.retain_lsns.iter().copied().collect::>(); + for b in &branchpoints { + if !have_branchpoints.contains(b) { + tracing::error!( + "Bug: `retain_lsns` is set incorrectly. Expected be {:?}, but found {:?}", + branchpoints, + target.retain_lsns + ); + debug_assert!(false); + // Do not GC based on bad information! + // (ab-use an existing GcError type rather than adding a new one, since this is a + // "should never happen" check that will be removed soon). + return Err(GcError::Remote(anyhow::anyhow!( + "retain_lsns failed validation!" + ))); } - }; - - timelines - .iter() - .map(|(_timeline_id, timeline_entry)| { - if let Some(ancestor_timeline_id) = - &timeline_entry.get_ancestor_timeline_id() - { - // If target_timeline is specified, we only need to know branchpoints of its children - if let Some(timeline_id) = target_timeline_id { - if ancestor_timeline_id == &timeline_id { - all_branchpoints.insert(( - *ancestor_timeline_id, - timeline_entry.get_ancestor_lsn(), - )); - } - } - // Collect branchpoints for all timelines - else { - all_branchpoints.insert(( - *ancestor_timeline_id, - timeline_entry.get_ancestor_lsn(), - )); - } - } - - timeline_entry.clone() - }) - .collect::>() - }; - (all_branchpoints, timelines) - }; + } + } + } // Ok, we now know all the branch points. // Update the GC information for each timeline. let mut gc_timelines = Vec::with_capacity(timelines.len()); for timeline in timelines { - // If target_timeline is specified, ignore all other timelines + // We filtered the timeline list above if let Some(target_timeline_id) = target_timeline_id { - if timeline.timeline_id != target_timeline_id { - continue; - } + assert_eq!(target_timeline_id, timeline.timeline_id); } - let branchpoints: Vec = all_branchpoints - .range(( - Included((timeline.timeline_id, Lsn(0))), - Included((timeline.timeline_id, Lsn(u64::MAX))), - )) - .map(|&x| x.1) - .collect(); - { let mut target = timeline.gc_info.write().unwrap(); @@ -2887,7 +2971,7 @@ impl Tenant { if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() { if let Some(ancestor_gc_cutoffs) = gc_cutoffs.get(&ancestor_id) { target.within_ancestor_pitr = - timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.pitr; + timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.time; } } @@ -2903,25 +2987,17 @@ impl Tenant { timeline.metrics.pitr_history_size.set( timeline .get_last_record_lsn() - .checked_sub(target.cutoffs.pitr) + .checked_sub(target.cutoffs.time) .unwrap_or(Lsn(0)) .0, ); - match gc_cutoffs.remove(&timeline.timeline_id) { - Some(cutoffs) => { - target.retain_lsns = branchpoints; - target.cutoffs = cutoffs; - } - None => { - // reasons for this being unavailable: - // - this timeline was created while we were finding cutoffs - // - lsn for timestamp search fails for this timeline repeatedly - // - // in both cases, refreshing the branchpoints is correct. - target.retain_lsns = branchpoints; - } - }; + // Apply the cutoffs we found to the Timeline's GcInfo. Why might we _not_ have cutoffs for a timeline? + // - this timeline was created while we were finding cutoffs + // - lsn for timestamp search fails for this timeline repeatedly + if let Some(cutoffs) = gc_cutoffs.get(&timeline.timeline_id) { + target.cutoffs = cutoffs.clone(); + } } gc_timelines.push(timeline); @@ -3718,7 +3794,6 @@ pub(crate) mod harness { walreceiver_connect_timeout: Some(tenant_conf.walreceiver_connect_timeout), lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout), max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag), - trace_read_requests: Some(tenant_conf.trace_read_requests), eviction_policy: Some(tenant_conf.eviction_policy), min_resident_size_override: tenant_conf.min_resident_size_override, evictions_low_residence_duration_metric_threshold: Some( @@ -3764,7 +3839,7 @@ pub(crate) mod harness { } impl TenantHarness { - pub fn create_custom( + pub async fn create_custom( test_name: &'static str, tenant_conf: TenantConf, tenant_id: TenantId, @@ -3800,7 +3875,7 @@ pub(crate) mod harness { }, timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }; - let remote_storage = GenericRemoteStorage::from_config(&config).unwrap(); + let remote_storage = GenericRemoteStorage::from_config(&config).await.unwrap(); let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone())); Ok(Self { @@ -3815,7 +3890,7 @@ pub(crate) mod harness { }) } - pub fn create(test_name: &'static str) -> anyhow::Result { + pub async fn create(test_name: &'static str) -> anyhow::Result { // Disable automatic GC and compaction to make the unit tests more deterministic. // The tests perform them manually if needed. let tenant_conf = TenantConf { @@ -3832,6 +3907,7 @@ pub(crate) mod harness { shard, Generation::new(0xdeadbeef), ) + .await } pub fn span(&self) -> tracing::Span { @@ -3959,6 +4035,7 @@ mod tests { use storage_layer::PersistentLayerKey; use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; + use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn}; use timeline::{DeltaLayerTestDesc, GcInfo}; use utils::bin_ser::BeSer; use utils::id::TenantId; @@ -3968,7 +4045,7 @@ mod tests { #[tokio::test] async fn test_basic() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_basic")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_basic").await?.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; @@ -4015,7 +4092,8 @@ mod tests { #[tokio::test] async fn no_duplicate_timelines() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("no_duplicate_timelines")? + let (tenant, ctx) = TenantHarness::create("no_duplicate_timelines") + .await? .load() .await; let _ = tenant @@ -4047,7 +4125,7 @@ mod tests { async fn test_branch() -> anyhow::Result<()> { use std::str::from_utf8; - let (tenant, ctx) = TenantHarness::create("test_branch")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_branch").await?.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; @@ -4169,7 +4247,8 @@ mod tests { #[tokio::test] async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> { let (tenant, ctx) = - TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")? + TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data") + .await? .load() .await; let tline = tenant @@ -4216,7 +4295,8 @@ mod tests { #[tokio::test] async fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> anyhow::Result<()> { let (tenant, ctx) = - TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")? + TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn") + .await? .load() .await; @@ -4238,7 +4318,7 @@ mod tests { .source() .unwrap() .to_string() - .contains("is earlier than latest GC horizon")); + .contains("is earlier than latest GC cutoff")); } } @@ -4271,7 +4351,8 @@ mod tests { #[tokio::test] async fn test_get_branchpoints_from_an_inactive_timeline() -> anyhow::Result<()> { let (tenant, ctx) = - TenantHarness::create("test_get_branchpoints_from_an_inactive_timeline")? + TenantHarness::create("test_get_branchpoints_from_an_inactive_timeline") + .await? .load() .await; let tline = tenant @@ -4305,7 +4386,7 @@ mod tests { { let branchpoints = &tline.gc_info.read().unwrap().retain_lsns; assert_eq!(branchpoints.len(), 1); - assert_eq!(branchpoints[0], Lsn(0x40)); + assert_eq!(branchpoints[0], (Lsn(0x40), NEW_TIMELINE_ID)); } // You can read the key from the child branch even though the parent is @@ -4328,7 +4409,8 @@ mod tests { #[tokio::test] async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> { let (tenant, ctx) = - TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")? + TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child") + .await? .load() .await; let tline = tenant @@ -4358,10 +4440,10 @@ mod tests { } #[tokio::test] async fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> { - let (tenant, ctx) = - TenantHarness::create("test_parent_keeps_data_forever_after_branching")? - .load() - .await; + let (tenant, ctx) = TenantHarness::create("test_parent_keeps_data_forever_after_branching") + .await? + .load() + .await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; @@ -4399,7 +4481,7 @@ mod tests { #[tokio::test] async fn timeline_load() -> anyhow::Result<()> { const TEST_NAME: &str = "timeline_load"; - let harness = TenantHarness::create(TEST_NAME)?; + let harness = TenantHarness::create(TEST_NAME).await?; { let (tenant, ctx) = harness.load().await; let tline = tenant @@ -4426,7 +4508,7 @@ mod tests { #[tokio::test] async fn timeline_load_with_ancestor() -> anyhow::Result<()> { const TEST_NAME: &str = "timeline_load_with_ancestor"; - let harness = TenantHarness::create(TEST_NAME)?; + let harness = TenantHarness::create(TEST_NAME).await?; // create two timelines { let (tenant, ctx) = harness.load().await; @@ -4474,7 +4556,10 @@ mod tests { #[tokio::test] async fn delta_layer_dumping() -> anyhow::Result<()> { use storage_layer::AsLayerDesc; - let (tenant, ctx) = TenantHarness::create("test_layer_dumping")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_layer_dumping") + .await? + .load() + .await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; @@ -4501,7 +4586,7 @@ mod tests { #[tokio::test] async fn test_images() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_images")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_images").await?.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; @@ -4672,7 +4757,7 @@ mod tests { // #[tokio::test] async fn test_bulk_insert() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_bulk_insert")?; + let harness = TenantHarness::create("test_bulk_insert").await?; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) @@ -4703,7 +4788,7 @@ mod tests { // so the search can stop at the first delta layer and doesn't traverse any deeper. #[tokio::test] async fn test_get_vectored() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_get_vectored")?; + let harness = TenantHarness::create("test_get_vectored").await?; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) @@ -4781,7 +4866,7 @@ mod tests { #[tokio::test] async fn test_get_vectored_aux_files() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_get_vectored_aux_files")?; + let harness = TenantHarness::create("test_get_vectored_aux_files").await?; let (tenant, ctx) = harness.load().await; let tline = tenant @@ -4867,7 +4952,8 @@ mod tests { TenantId::generate(), ShardIdentity::unsharded(), Generation::new(0xdeadbeef), - )?; + ) + .await?; let (tenant, ctx) = harness.load().await; let mut current_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); @@ -5010,7 +5096,7 @@ mod tests { // ``` #[tokio::test] async fn test_get_vectored_ancestor_descent() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_get_vectored_on_lsn_axis")?; + let harness = TenantHarness::create("test_get_vectored_on_lsn_axis").await?; let (tenant, ctx) = harness.load().await; let start_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); @@ -5159,7 +5245,7 @@ mod tests { name: &'static str, compaction_algorithm: CompactionAlgorithm, ) -> anyhow::Result<()> { - let mut harness = TenantHarness::create(name)?; + let mut harness = TenantHarness::create(name).await?; harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings { kind: compaction_algorithm, }; @@ -5243,7 +5329,8 @@ mod tests { #[tokio::test] async fn test_traverse_branches() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_traverse_branches")? + let (tenant, ctx) = TenantHarness::create("test_traverse_branches") + .await? .load() .await; let mut tline = tenant @@ -5333,7 +5420,8 @@ mod tests { #[tokio::test] async fn test_traverse_ancestors() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_traverse_ancestors")? + let (tenant, ctx) = TenantHarness::create("test_traverse_ancestors") + .await? .load() .await; let mut tline = tenant @@ -5399,7 +5487,8 @@ mod tests { #[tokio::test] async fn test_write_at_initdb_lsn_takes_optimization_code_path() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_empty_test_timeline_is_usable")? + let (tenant, ctx) = TenantHarness::create("test_empty_test_timeline_is_usable") + .await? .load() .await; @@ -5468,7 +5557,7 @@ mod tests { #[tokio::test] async fn test_create_guard_crash() -> anyhow::Result<()> { let name = "test_create_guard_crash"; - let harness = TenantHarness::create(name)?; + let harness = TenantHarness::create(name).await?; { let (tenant, ctx) = harness.load().await; let tline = tenant @@ -5521,7 +5610,7 @@ mod tests { name: &'static str, compaction_algorithm: CompactionAlgorithm, ) -> anyhow::Result<()> { - let mut harness = TenantHarness::create(name)?; + let mut harness = TenantHarness::create(name).await?; harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings { kind: compaction_algorithm, }; @@ -5545,7 +5634,7 @@ mod tests { #[tokio::test] async fn test_metadata_scan() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_metadata_scan")?; + let harness = TenantHarness::create("test_metadata_scan").await?; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) @@ -5664,7 +5753,7 @@ mod tests { #[tokio::test] async fn test_metadata_compaction_trigger() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_metadata_compaction_trigger")?; + let harness = TenantHarness::create("test_metadata_compaction_trigger").await?; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) @@ -5723,7 +5812,9 @@ mod tests { #[tokio::test] async fn test_branch_copies_dirty_aux_file_flag() { - let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag").unwrap(); + let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag") + .await + .unwrap(); // the default aux file policy to switch is v1 if not set by the admins assert_eq!( @@ -5825,7 +5916,9 @@ mod tests { #[tokio::test] async fn aux_file_policy_switch() { - let mut harness = TenantHarness::create("aux_file_policy_switch").unwrap(); + let mut harness = TenantHarness::create("aux_file_policy_switch") + .await + .unwrap(); harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::CrossValidation; // set to cross-validation mode let (tenant, ctx) = harness.load().await; @@ -5999,7 +6092,9 @@ mod tests { #[tokio::test] async fn aux_file_policy_force_switch() { - let mut harness = TenantHarness::create("aux_file_policy_force_switch").unwrap(); + let mut harness = TenantHarness::create("aux_file_policy_force_switch") + .await + .unwrap(); harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V1; let (tenant, ctx) = harness.load().await; @@ -6060,7 +6155,9 @@ mod tests { #[tokio::test] async fn aux_file_policy_auto_detect() { - let mut harness = TenantHarness::create("aux_file_policy_auto_detect").unwrap(); + let mut harness = TenantHarness::create("aux_file_policy_auto_detect") + .await + .unwrap(); harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V2; // set to cross-validation mode let (tenant, ctx) = harness.load().await; @@ -6123,7 +6220,7 @@ mod tests { #[tokio::test] async fn test_metadata_image_creation() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_metadata_image_creation")?; + let harness = TenantHarness::create("test_metadata_image_creation").await?; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) @@ -6222,7 +6319,7 @@ mod tests { #[tokio::test] async fn test_vectored_missing_data_key_reads() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?; + let harness = TenantHarness::create("test_vectored_missing_data_key_reads").await?; let (tenant, ctx) = harness.load().await; let base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); @@ -6294,7 +6391,7 @@ mod tests { #[tokio::test] async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads")?; + let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads").await?; let (tenant, ctx) = harness.load().await; let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap(); @@ -6386,7 +6483,7 @@ mod tests { #[tokio::test] async fn test_metadata_tombstone_reads() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_metadata_tombstone_reads")?; + let harness = TenantHarness::create("test_metadata_tombstone_reads").await?; let (tenant, ctx) = harness.load().await; let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap(); let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap(); @@ -6466,7 +6563,9 @@ mod tests { #[tokio::test] async fn test_metadata_tombstone_image_creation() { - let harness = TenantHarness::create("test_metadata_tombstone_image_creation").unwrap(); + let harness = TenantHarness::create("test_metadata_tombstone_image_creation") + .await + .unwrap(); let (tenant, ctx) = harness.load().await; let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap(); @@ -6538,8 +6637,9 @@ mod tests { #[tokio::test] async fn test_metadata_tombstone_empty_image_creation() { - let harness = - TenantHarness::create("test_metadata_tombstone_empty_image_creation").unwrap(); + let harness = TenantHarness::create("test_metadata_tombstone_empty_image_creation") + .await + .unwrap(); let (tenant, ctx) = harness.load().await; let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap(); @@ -6602,7 +6702,7 @@ mod tests { #[tokio::test] async fn test_simple_bottom_most_compaction_images() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_simple_bottom_most_compaction_images")?; + let harness = TenantHarness::create("test_simple_bottom_most_compaction_images").await?; let (tenant, ctx) = harness.load().await; fn get_key(id: u32) -> Key { @@ -6694,8 +6794,8 @@ mod tests { { // Update GC info let mut guard = tline.gc_info.write().unwrap(); - guard.cutoffs.pitr = Lsn(0x30); - guard.cutoffs.horizon = Lsn(0x30); + guard.cutoffs.time = Lsn(0x30); + guard.cutoffs.space = Lsn(0x30); } let expected_result = [ @@ -6786,7 +6886,7 @@ mod tests { vec![ // Image layer at GC horizon PersistentLayerKey { - key_range: Key::MIN..get_key(10), + key_range: Key::MIN..Key::MAX, lsn_range: Lsn(0x30)..Lsn(0x31), is_delta: false }, @@ -6810,7 +6910,7 @@ mod tests { #[tokio::test] async fn test_neon_test_record() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_neon_test_record")?; + let harness = TenantHarness::create("test_neon_test_record").await?; let (tenant, ctx) = harness.load().await; fn get_key(id: u32) -> Key { @@ -6891,7 +6991,7 @@ mod tests { #[tokio::test] async fn test_lsn_lease() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_lsn_lease")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_lsn_lease").await?.load().await; let key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let end_lsn = Lsn(0x100); @@ -6980,7 +7080,7 @@ mod tests { #[tokio::test] async fn test_simple_bottom_most_compaction_deltas() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas")?; + let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas").await?; let (tenant, ctx) = harness.load().await; fn get_key(id: u32) -> Key { @@ -7085,8 +7185,8 @@ mod tests { *guard = GcInfo { retain_lsns: vec![], cutoffs: GcCutoffs { - pitr: Lsn(0x30), - horizon: Lsn(0x30), + time: Lsn(0x30), + space: Lsn(0x30), }, leases: Default::default(), within_ancestor_pitr: false, @@ -7158,4 +7258,323 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn test_generate_key_retention() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_generate_key_retention").await?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + tline.force_advance_lsn(Lsn(0x70)); + let key = Key::from_hex("010000000033333333444444445500000000").unwrap(); + let history = vec![ + ( + key, + Lsn(0x10), + Value::Image(Bytes::copy_from_slice(b"0x10")), + ), + ( + key, + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append(";0x20")), + ), + ( + key, + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append(";0x30")), + ), + ( + key, + Lsn(0x40), + Value::WalRecord(NeonWalRecord::wal_append(";0x40")), + ), + ( + key, + Lsn(0x50), + Value::WalRecord(NeonWalRecord::wal_append(";0x50")), + ), + ( + key, + Lsn(0x60), + Value::WalRecord(NeonWalRecord::wal_append(";0x60")), + ), + ( + key, + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + ), + ( + key, + Lsn(0x80), + Value::WalRecord(NeonWalRecord::wal_append(";0x80")), + ), + ( + key, + Lsn(0x90), + Value::WalRecord(NeonWalRecord::wal_append(";0x90")), + ), + ]; + let res = tline + .generate_key_retention( + key, + &history, + Lsn(0x60), + &[Lsn(0x20), Lsn(0x40), Lsn(0x50)], + 3, + ) + .await + .unwrap(); + let expected_res = KeyHistoryRetention { + below_horizon: vec![ + ( + Lsn(0x20), + KeyLogAtLsn(vec![( + Lsn(0x20), + Value::Image(Bytes::copy_from_slice(b"0x10;0x20")), + )]), + ), + ( + Lsn(0x40), + KeyLogAtLsn(vec![ + ( + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append(";0x30")), + ), + ( + Lsn(0x40), + Value::WalRecord(NeonWalRecord::wal_append(";0x40")), + ), + ]), + ), + ( + Lsn(0x50), + KeyLogAtLsn(vec![( + Lsn(0x50), + Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40;0x50")), + )]), + ), + ( + Lsn(0x60), + KeyLogAtLsn(vec![( + Lsn(0x60), + Value::WalRecord(NeonWalRecord::wal_append(";0x60")), + )]), + ), + ], + above_horizon: KeyLogAtLsn(vec![ + ( + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + ), + ( + Lsn(0x80), + Value::WalRecord(NeonWalRecord::wal_append(";0x80")), + ), + ( + Lsn(0x90), + Value::WalRecord(NeonWalRecord::wal_append(";0x90")), + ), + ]), + }; + assert_eq!(res, expected_res); + // TODO: more tests with mixed image + delta, adding with k-merge test cases; e2e compaction test + Ok(()) + } + + #[tokio::test] + async fn test_simple_bottom_most_compaction_with_retain_lsns() -> anyhow::Result<()> { + let harness = + TenantHarness::create("test_simple_bottom_most_compaction_with_retain_lsns").await?; + let (tenant, ctx) = harness.load().await; + + fn get_key(id: u32) -> Key { + // using aux key here b/c they are guaranteed to be inside `collect_keyspace`. + let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + let img_layer = (0..10) + .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) + .collect_vec(); + + let delta1 = vec![ + ( + get_key(1), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ( + get_key(2), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append("@0x30")), + ), + ( + get_key(3), + Lsn(0x28), + Value::WalRecord(NeonWalRecord::wal_append("@0x28")), + ), + ( + get_key(3), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append("@0x30")), + ), + ( + get_key(3), + Lsn(0x40), + Value::WalRecord(NeonWalRecord::wal_append("@0x40")), + ), + ]; + let delta2 = vec![ + ( + get_key(5), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ( + get_key(6), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ]; + let delta3 = vec![ + ( + get_key(8), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ( + get_key(9), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ]; + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![ + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3), + ], // delta layers + vec![(Lsn(0x10), img_layer)], // image layers + Lsn(0x50), + ) + .await?; + { + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + *guard = GcInfo { + retain_lsns: vec![ + (Lsn(0x10), tline.timeline_id), + (Lsn(0x20), tline.timeline_id), + ], + cutoffs: GcCutoffs { + time: Lsn(0x30), + space: Lsn(0x30), + }, + leases: Default::default(), + within_ancestor_pitr: false, + }; + } + + let expected_result = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20"), + Bytes::from_static(b"value 2@0x10@0x30"), + Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10@0x20"), + Bytes::from_static(b"value 6@0x10@0x20"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10@0x48"), + Bytes::from_static(b"value 9@0x10@0x48"), + ]; + + let expected_result_at_gc_horizon = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20"), + Bytes::from_static(b"value 2@0x10@0x30"), + Bytes::from_static(b"value 3@0x10@0x28@0x30"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10@0x20"), + Bytes::from_static(b"value 6@0x10@0x20"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let expected_result_at_lsn_20 = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10@0x20"), + Bytes::from_static(b"value 6@0x10@0x20"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let expected_result_at_lsn_10 = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10"), + Bytes::from_static(b"value 6@0x10"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let verify_result = || async { + for idx in 0..10 { + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x50), &ctx) + .await + .unwrap(), + &expected_result[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x30), &ctx) + .await + .unwrap(), + &expected_result_at_gc_horizon[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x20), &ctx) + .await + .unwrap(), + &expected_result_at_lsn_20[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x10), &ctx) + .await + .unwrap(), + &expected_result_at_lsn_10[idx] + ); + } + }; + + verify_result().await; + + let cancel = CancellationToken::new(); + tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + + verify_result().await; + + Ok(()) + } } diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index e98ed66ef9..791eefebe9 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -137,14 +137,14 @@ impl<'a> BlockCursor<'a> { } /// Reserved bits for length and compression -const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0; +pub(super) const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0; /// The maximum size of blobs we support. The highest few bits /// are reserved for compression and other further uses. const MAX_SUPPORTED_LEN: usize = 0x0fff_ffff; -const BYTE_UNCOMPRESSED: u8 = 0x80; -const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10; +pub(super) const BYTE_UNCOMPRESSED: u8 = 0x80; +pub(super) const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10; /// A wrapper of `VirtualFile` that allows users to write blobs. /// @@ -390,51 +390,63 @@ impl BlobWriter { } #[cfg(test)] -mod tests { +pub(crate) mod tests { use super::*; use crate::{context::DownloadBehavior, task_mgr::TaskKind, tenant::block_io::BlockReaderRef}; + use camino::Utf8PathBuf; + use camino_tempfile::Utf8TempDir; use rand::{Rng, SeedableRng}; async fn round_trip_test(blobs: &[Vec]) -> Result<(), Error> { round_trip_test_compressed::(blobs, false).await } - async fn round_trip_test_compressed( + pub(crate) async fn write_maybe_compressed( blobs: &[Vec], compression: bool, - ) -> Result<(), Error> { + ctx: &RequestContext, + ) -> Result<(Utf8TempDir, Utf8PathBuf, Vec), Error> { let temp_dir = camino_tempfile::tempdir()?; let pathbuf = temp_dir.path().join("file"); - let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); // Write part (in block to drop the file) let mut offsets = Vec::new(); { - let file = VirtualFile::create(pathbuf.as_path(), &ctx).await?; + let file = VirtualFile::create(pathbuf.as_path(), ctx).await?; let mut wtr = BlobWriter::::new(file, 0); for blob in blobs.iter() { let (_, res) = if compression { wtr.write_blob_maybe_compressed( blob.clone(), - &ctx, + ctx, ImageCompressionAlgorithm::Zstd { level: Some(1) }, ) .await } else { - wtr.write_blob(blob.clone(), &ctx).await + wtr.write_blob(blob.clone(), ctx).await }; let offs = res?; offsets.push(offs); } // Write out one page worth of zeros so that we can // read again with read_blk - let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], &ctx).await; + let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], ctx).await; let offs = res?; println!("Writing final blob at offs={offs}"); - wtr.flush_buffer(&ctx).await?; + wtr.flush_buffer(ctx).await?; } + Ok((temp_dir, pathbuf, offsets)) + } - let file = VirtualFile::open(pathbuf.as_path(), &ctx).await?; + async fn round_trip_test_compressed( + blobs: &[Vec], + compression: bool, + ) -> Result<(), Error> { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let (_temp_dir, pathbuf, offsets) = + write_maybe_compressed::(blobs, compression, &ctx).await?; + + let file = VirtualFile::open(pathbuf, &ctx).await?; let rdr = BlockReaderRef::VirtualFile(&file); let rdr = BlockCursor::new_with_compression(rdr, compression); for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() { @@ -447,7 +459,7 @@ mod tests { Ok(()) } - fn random_array(len: usize) -> Vec { + pub(crate) fn random_array(len: usize) -> Vec { let mut rng = rand::thread_rng(); (0..len).map(|_| rng.gen()).collect::<_>() } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 5b532e4830..48ff17db94 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -335,7 +335,6 @@ pub struct TenantConf { /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update, /// to avoid eager reconnects. pub max_lsn_wal_lag: NonZeroU64, - pub trace_read_requests: bool, pub eviction_policy: EvictionPolicy, pub min_resident_size_override: Option, // See the corresponding metric's help string. @@ -436,10 +435,6 @@ pub struct TenantConfOpt { #[serde(default)] pub max_lsn_wal_lag: Option, - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(default)] - pub trace_read_requests: Option, - #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub eviction_policy: Option, @@ -519,9 +514,6 @@ impl TenantConfOpt { .lagging_wal_timeout .unwrap_or(global_conf.lagging_wal_timeout), max_lsn_wal_lag: self.max_lsn_wal_lag.unwrap_or(global_conf.max_lsn_wal_lag), - trace_read_requests: self - .trace_read_requests - .unwrap_or(global_conf.trace_read_requests), eviction_policy: self.eviction_policy.unwrap_or(global_conf.eviction_policy), min_resident_size_override: self .min_resident_size_override @@ -581,7 +573,6 @@ impl Default for TenantConf { .expect("cannot parse default walreceiver lagging wal timeout"), max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG) .expect("cannot parse default max walreceiver Lsn wal lag"), - trace_read_requests: false, eviction_policy: EvictionPolicy::NoEviction, min_resident_size_override: None, evictions_low_residence_duration_metric_threshold: humantime::parse_duration( @@ -659,7 +650,6 @@ impl From for models::TenantConfig { walreceiver_connect_timeout: value.walreceiver_connect_timeout.map(humantime), lagging_wal_timeout: value.lagging_wal_timeout.map(humantime), max_lsn_wal_lag: value.max_lsn_wal_lag, - trace_read_requests: value.trace_read_requests, eviction_policy: value.eviction_policy, min_resident_size_override: value.min_resident_size_override, evictions_low_residence_duration_metric_threshold: value diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index b76498b608..1583a3826a 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -262,7 +262,7 @@ where pub fn iter<'a>(self, start_key: &'a [u8; L], ctx: &'a RequestContext) -> DiskBtreeIterator<'a> where - R: 'a, + R: 'a + Send, { DiskBtreeIterator { stream: Box::pin(self.into_stream(start_key, ctx)), @@ -521,7 +521,7 @@ where pub struct DiskBtreeIterator<'a> { #[allow(clippy::type_complexity)] stream: std::pin::Pin< - Box, u64), DiskBtreeError>> + 'a>, + Box, u64), DiskBtreeError>> + 'a + Send>, >, } @@ -550,10 +550,10 @@ where /// We maintain the length of the stack to be always greater than zero. /// Two exceptions are: /// 1. `Self::flush_node`. The method will push the new node if it extracted the last one. - /// So because other methods cannot see the intermediate state invariant still holds. + /// So because other methods cannot see the intermediate state invariant still holds. /// 2. `Self::finish`. It consumes self and does not return it back, - /// which means that this is where the structure is destroyed. - /// Thus stack of zero length cannot be observed by other methods. + /// which means that this is where the structure is destroyed. + /// Thus stack of zero length cannot be observed by other methods. stack: Vec>, /// Last key that was appended to the tree. Used to sanity check that append diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 2724a5cc07..72167d02ab 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -463,7 +463,7 @@ impl LayerMap { pub(self) fn insert_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) { // TODO: See #3869, resulting #4088, attempted fix and repro #4094 - if Self::is_l0(&layer_desc) { + if Self::is_l0(&layer_desc.key_range) { self.l0_delta_layers.push(layer_desc.clone().into()); } @@ -482,7 +482,7 @@ impl LayerMap { self.historic .remove(historic_layer_coverage::LayerKey::from(layer_desc)); let layer_key = layer_desc.key(); - if Self::is_l0(layer_desc) { + if Self::is_l0(&layer_desc.key_range) { let len_before = self.l0_delta_layers.len(); let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers); l0_delta_layers.retain(|other| other.key() != layer_key); @@ -598,8 +598,9 @@ impl LayerMap { coverage } - pub fn is_l0(layer: &PersistentLayerDesc) -> bool { - layer.get_key_range() == (Key::MIN..Key::MAX) + /// Check if the key range resembles that of an L0 layer. + pub fn is_l0(key_range: &Range) -> bool { + key_range == &(Key::MIN..Key::MAX) } /// This function determines which layers are counted in `count_deltas`: @@ -626,7 +627,7 @@ impl LayerMap { /// than just the current partition_range. pub fn is_reimage_worthy(layer: &PersistentLayerDesc, partition_range: &Range) -> bool { // Case 1 - if !Self::is_l0(layer) { + if !Self::is_l0(&layer.key_range) { return true; } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index b0159e22bf..f23e6ff9d6 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -36,7 +36,7 @@ use crate::control_plane_client::{ use crate::deletion_queue::DeletionQueueClient; use crate::http::routes::ACTIVE_TENANT_TIMEOUT; use crate::metrics::{TENANT, TENANT_MANAGER as METRICS}; -use crate::task_mgr::{self, TaskKind}; +use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::config::{ AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig, }; @@ -225,26 +225,98 @@ async fn safe_rename_tenant_dir(path: impl AsRef) -> std::io::Result>); +enum BackgroundPurgesInner { + Open(tokio::task::JoinSet<()>), + // we use the async mutex for coalescing + ShuttingDown(Arc>>), +} - task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), - TaskKind::MgmtRequest, - task_tenant_id, - None, - "tenant_files_delete", - false, - async move { - fs::remove_dir_all(tmp_path.as_path()) - .await - .with_context(|| format!("tenant directory {:?} deletion", tmp_path)) - }, - ); +impl Default for BackgroundPurges { + fn default() -> Self { + Self(Arc::new(std::sync::Mutex::new( + BackgroundPurgesInner::Open(JoinSet::new()), + ))) + } +} + +impl BackgroundPurges { + /// When we have moved a tenant's content to a temporary directory, we may delete it lazily in + /// the background, and thereby avoid blocking any API requests on this deletion completing. + /// + /// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory. + /// Thus the [`BackgroundPurges`] type to keep track of these tasks. + pub fn spawn(&self, tmp_path: Utf8PathBuf) { + let mut guard = self.0.lock().unwrap(); + let jset = match &mut *guard { + BackgroundPurgesInner::Open(ref mut jset) => jset, + BackgroundPurgesInner::ShuttingDown(_) => { + warn!("trying to spawn background purge during shutdown, ignoring"); + return; + } + }; + jset.spawn_on( + async move { + if let Err(error) = fs::remove_dir_all(tmp_path.as_path()).await { + // should we fatal_io_error here? + warn!(%error, path=%tmp_path, "failed to purge tenant directory"); + } + } + .instrument(info_span!(parent: None, "background_purge")), + BACKGROUND_RUNTIME.handle(), + ); + } + + /// When this future completes, all background purges have completed. + /// The first poll of the future will already lock out new background purges spawned via [`Self::spawn`]. + /// + /// Concurrent calls will coalesce. + /// + /// # Cancellation-Safety + /// + /// If this future is dropped before polled to completion, concurrent and subsequent + /// instances of this future will continue to be correct. + #[instrument(skip_all)] + pub async fn shutdown(&self) { + let jset = { + let mut guard = self.0.lock().unwrap(); + match &mut *guard { + BackgroundPurgesInner::Open(jset) => { + *guard = BackgroundPurgesInner::ShuttingDown(Arc::new(tokio::sync::Mutex::new( + std::mem::take(jset), + ))) + } + BackgroundPurgesInner::ShuttingDown(_) => { + // calling shutdown multiple times is most likely a bug in pageserver shutdown code + warn!("already shutting down"); + } + }; + match &mut *guard { + BackgroundPurgesInner::ShuttingDown(ref mut jset) => jset.clone(), + BackgroundPurgesInner::Open(_) => { + unreachable!("above code transitions into shut down state"); + } + } + }; + let mut jset = jset.lock().await; // concurrent callers coalesce here + while let Some(res) = jset.join_next().await { + match res { + Ok(()) => {} + Err(e) if e.is_panic() => { + // If it panicked, the error is already logged by the panic hook. + } + Err(e) if e.is_cancelled() => { + unreachable!("we don't cancel the joinset or runtime") + } + Err(e) => { + // No idea when this can happen, but let's log it. + warn!(%e, "background purge task failed or panicked"); + } + } + } + } } static TENANTS: Lazy> = @@ -270,6 +342,8 @@ pub struct TenantManager { // tenants have their own cancellation tokens, which we fire individually in [`Self::shutdown`], or // when the tenant detaches. cancel: CancellationToken, + + background_purges: BackgroundPurges, } fn emergency_generations( @@ -447,6 +521,7 @@ pub(crate) enum DeleteTenantError { #[instrument(skip_all)] pub async fn init_tenant_mgr( conf: &'static PageServerConf, + background_purges: BackgroundPurges, resources: TenantSharedResources, init_order: InitializationOrder, cancel: CancellationToken, @@ -512,7 +587,7 @@ pub async fn init_tenant_mgr( match safe_rename_tenant_dir(&tenant_dir_path).await { Ok(tmp_path) => { - spawn_background_purge(tmp_path); + background_purges.spawn(tmp_path); } Err(e) => { error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), @@ -634,6 +709,7 @@ pub async fn init_tenant_mgr( tenants: &TENANTS, resources, cancel: CancellationToken::new(), + background_purges, }) } @@ -1353,6 +1429,7 @@ impl TenantManager { async fn delete_local( conf: &PageServerConf, + background_purges: &BackgroundPurges, tenant_shard_id: &TenantShardId, ) -> anyhow::Result<()> { let local_tenant_directory = conf.tenant_path(tenant_shard_id); @@ -1361,7 +1438,7 @@ impl TenantManager { .with_context(|| { format!("local tenant directory {local_tenant_directory:?} rename") })?; - spawn_background_purge(tmp_dir); + background_purges.spawn(tmp_dir); Ok(()) } @@ -1379,12 +1456,12 @@ impl TenantManager { barrier.wait().await; } } - delete_local(self.conf, &tenant_shard_id).await?; + delete_local(self.conf, &self.background_purges, &tenant_shard_id).await?; } Some(TenantSlot::Secondary(secondary_tenant)) => { secondary_tenant.shutdown().await; - delete_local(self.conf, &tenant_shard_id).await?; + delete_local(self.conf, &self.background_purges, &tenant_shard_id).await?; } Some(TenantSlot::InProgress(_)) => unreachable!(), None => {} @@ -1655,7 +1732,7 @@ impl TenantManager { let tmp_path = safe_rename_tenant_dir(&local_tenant_directory) .await .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?; - spawn_background_purge(tmp_path); + self.background_purges.spawn(tmp_path); fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!( "failpoint" @@ -1831,7 +1908,7 @@ impl TenantManager { let tmp_path = self .detach_tenant0(conf, tenant_shard_id, deletion_queue_client) .await?; - spawn_background_purge(tmp_path); + self.background_purges.spawn(tmp_path); Ok(()) } @@ -2698,7 +2775,9 @@ mod tests { // Test that if an InProgress tenant is in the map during shutdown, the shutdown will gracefully // wait for it to complete before proceeding. - let h = TenantHarness::create("shutdown_awaits_in_progress_tenant").unwrap(); + let h = TenantHarness::create("shutdown_awaits_in_progress_tenant") + .await + .unwrap(); let (t, _ctx) = h.load().await; // harness loads it to active, which is forced and nothing is running on the tenant diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index bc9364de61..c75d1eaa5e 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -241,7 +241,7 @@ use self::index::IndexPart; use super::metadata::MetadataUpdate; use super::storage_layer::{Layer, LayerName, ResidentLayer}; -use super::upload_queue::SetDeletedFlagProgress; +use super::upload_queue::{NotInitialized, SetDeletedFlagProgress}; use super::Generation; pub(crate) use download::{ @@ -1525,7 +1525,6 @@ impl RemoteTimelineClient { Some(self.tenant_shard_id), Some(self.timeline_id), "remote upload", - false, async move { self_rc.perform_upload_task(task).await; Ok(()) @@ -1930,6 +1929,31 @@ impl RemoteTimelineClient { } } } + + /// Returns an accessor which will hold the UploadQueue mutex for accessing the upload queue + /// externally to RemoteTimelineClient. + pub(crate) fn initialized_upload_queue( + &self, + ) -> Result, NotInitialized> { + let mut inner = self.upload_queue.lock().unwrap(); + inner.initialized_mut()?; + Ok(UploadQueueAccessor { inner }) + } +} + +pub(crate) struct UploadQueueAccessor<'a> { + inner: std::sync::MutexGuard<'a, UploadQueue>, +} + +impl<'a> UploadQueueAccessor<'a> { + pub(crate) fn latest_uploaded_index_part(&self) -> &IndexPart { + match &*self.inner { + UploadQueue::Initialized(x) => &x.clean.0, + UploadQueue::Uninitialized | UploadQueue::Stopped(_) => { + unreachable!("checked before constructing") + } + } + } } pub fn remote_tenant_path(tenant_shard_id: &TenantShardId) -> RemotePath { @@ -2103,7 +2127,7 @@ mod tests { impl TestSetup { async fn new(test_name: &str) -> anyhow::Result { let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}"))); - let harness = TenantHarness::create(test_name)?; + let harness = TenantHarness::create(test_name).await?; let (tenant, ctx) = harness.load().await; let timeline = tenant diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 6233a3477e..b439df8edb 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -176,6 +176,24 @@ pub(crate) struct Lineage { /// /// If you are adding support for detaching from a hierarchy, consider changing the ancestry /// into a `Vec<(TimelineId, Lsn)>` to be a path instead. + // FIXME: this is insufficient even for path of two timelines for future wal recovery + // purposes: + // + // assuming a "old main" which has received most of the WAL, and has a branch "new main", + // starting a bit before "old main" last_record_lsn. the current version works fine, + // because we will know to replay wal and branch at the recorded Lsn to do wal recovery. + // + // then assuming "new main" would similarly receive a branch right before its last_record_lsn, + // "new new main". the current implementation would just store ("new main", ancestor_lsn, _) + // here. however, we cannot recover from WAL using only that information, we would need the + // whole ancestry here: + // + // ```json + // [ + // ["old main", ancestor_lsn("new main"), _], + // ["new main", ancestor_lsn("new new main"), _] + // ] + // ``` #[serde(skip_serializing_if = "Option::is_none", default)] original_ancestor: Option<(TimelineId, Lsn, NaiveDateTime)>, } @@ -217,6 +235,14 @@ impl Lineage { self.original_ancestor .is_some_and(|(_, ancestor_lsn, _)| ancestor_lsn == lsn) } + + pub(crate) fn is_detached_from_original_ancestor(&self) -> bool { + self.original_ancestor.is_some() + } + + pub(crate) fn is_reparented(&self) -> bool { + !self.reparenting_history.is_empty() + } } #[cfg(test)] diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index a233d11c4a..3132a28b12 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -31,6 +31,7 @@ use pageserver_api::{ }; use remote_storage::GenericRemoteStorage; +use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::instrument; use utils::{completion::Barrier, id::TimelineId, sync::gate::Gate}; @@ -293,15 +294,50 @@ impl SecondaryController { } } +pub struct GlobalTasks { + cancel: CancellationToken, + uploader: JoinHandle<()>, + downloader: JoinHandle<()>, +} + +impl GlobalTasks { + /// Caller is responsible for requesting shutdown via the cancellation token that was + /// passed to [`spawn_tasks`]. + /// + /// # Panics + /// + /// This method panics if that token is not cancelled. + /// This is low-risk because we're calling this during process shutdown, so, a panic + /// will be informative but not cause undue downtime. + pub async fn wait(self) { + let Self { + cancel, + uploader, + downloader, + } = self; + assert!( + cancel.is_cancelled(), + "must cancel cancellation token, otherwise the tasks will not shut down" + ); + + let (uploader, downloader) = futures::future::join(uploader, downloader).await; + uploader.expect( + "unreachable: exit_on_panic_or_error would catch the panic and exit the process", + ); + downloader.expect( + "unreachable: exit_on_panic_or_error would catch the panic and exit the process", + ); + } +} + pub fn spawn_tasks( tenant_manager: Arc, remote_storage: GenericRemoteStorage, background_jobs_can_start: Barrier, cancel: CancellationToken, -) -> SecondaryController { +) -> (SecondaryController, GlobalTasks) { let mgr_clone = tenant_manager.clone(); let storage_clone = remote_storage.clone(); - let cancel_clone = cancel.clone(); let bg_jobs_clone = background_jobs_can_start.clone(); let (download_req_tx, download_req_rx) = @@ -309,17 +345,9 @@ pub fn spawn_tasks( let (upload_req_tx, upload_req_rx) = tokio::sync::mpsc::channel::>(16); - let downloader_task_ctx = RequestContext::new( - TaskKind::SecondaryDownloads, - crate::context::DownloadBehavior::Download, - ); - task_mgr::spawn( - BACKGROUND_RUNTIME.handle(), - downloader_task_ctx.task_kind(), - None, - None, + let cancel_clone = cancel.clone(); + let downloader = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( "secondary tenant downloads", - false, async move { downloader_task( mgr_clone, @@ -327,49 +355,41 @@ pub fn spawn_tasks( download_req_rx, bg_jobs_clone, cancel_clone, - downloader_task_ctx, + RequestContext::new( + TaskKind::SecondaryDownloads, + crate::context::DownloadBehavior::Download, + ), ) .await; - - Ok(()) + anyhow::Ok(()) }, - ); + )); - task_mgr::spawn( - BACKGROUND_RUNTIME.handle(), - TaskKind::SecondaryUploads, - None, - None, + let cancel_clone = cancel.clone(); + let uploader = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( "heatmap uploads", - false, async move { heatmap_uploader_task( tenant_manager, remote_storage, upload_req_rx, background_jobs_can_start, - cancel, + cancel_clone, ) .await; - - Ok(()) + anyhow::Ok(()) }, - ); + )); - SecondaryController { - download_req_tx, - upload_req_tx, - } -} - -/// For running with remote storage disabled: a SecondaryController that is connected to nothing. -pub fn null_controller() -> SecondaryController { - let (download_req_tx, _download_req_rx) = - tokio::sync::mpsc::channel::>(16); - let (upload_req_tx, _upload_req_rx) = - tokio::sync::mpsc::channel::>(16); - SecondaryController { - upload_req_tx, - download_req_tx, - } + ( + SecondaryController { + upload_req_tx, + download_req_tx, + }, + GlobalTasks { + cancel, + uploader, + downloader, + }, + ) } diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 23354417e7..41d558d3f6 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -135,11 +135,9 @@ pub struct TimelineInputs { ancestor_lsn: Lsn, last_record: Lsn, latest_gc_cutoff: Lsn, - horizon_cutoff: Lsn, - pitr_cutoff: Lsn, /// Cutoff point based on GC settings - next_gc_cutoff: Lsn, + next_pitr_cutoff: Lsn, /// Cutoff point calculated from the user-supplied 'max_retention_period' retention_param_cutoff: Option, @@ -150,7 +148,7 @@ pub struct TimelineInputs { /// Gathers the inputs for the tenant sizing model. /// -/// Tenant size does not consider the latest state, but only the state until next_gc_cutoff, which +/// Tenant size does not consider the latest state, but only the state until next_pitr_cutoff, which /// is updated on-demand, during the start of this calculation and separate from the /// [`TimelineInputs::latest_gc_cutoff`]. /// @@ -158,11 +156,8 @@ pub struct TimelineInputs { /// /// ```text /// 0-----|---------|----|------------| · · · · · |·> lsn -/// initdb_lsn branchpoints* next_gc_cutoff latest +/// initdb_lsn branchpoints* next_pitr_cutoff latest /// ``` -/// -/// Until gc_horizon_cutoff > `Timeline::last_record_lsn` for any of the tenant's timelines, the -/// tenant size will be zero. pub(super) async fn gather_inputs( tenant: &Tenant, limit: &Arc, @@ -172,7 +167,7 @@ pub(super) async fn gather_inputs( cancel: &CancellationToken, ctx: &RequestContext, ) -> Result { - // refresh is needed to update gc related pitr_cutoff and horizon_cutoff + // refresh is needed to update [`timeline::GcCutoffs`] tenant.refresh_gc_info(cancel, ctx).await?; // Collect information about all the timelines @@ -236,20 +231,18 @@ pub(super) async fn gather_inputs( // we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not // actually removing files. // - // We only consider [`GcInfo::pitr_cutoff`], and not [`GcInfo::horizon_cutoff`], because from + // We only consider [`timeline::GcCutoffs::time`], and not [`timeline::GcCutoffs::space`], because from // a user's perspective they have only requested retention up to the time bound (pitr_cutoff), rather - // than a space bound (horizon cutoff). This means that if someone drops a database and waits for their + // than our internal space cutoff. This means that if someone drops a database and waits for their // PITR interval, they will see synthetic size decrease, even if we are still storing data inside - // horizon_cutoff. - let pitr_cutoff = gc_info.cutoffs.pitr; - let horizon_cutoff = gc_info.cutoffs.horizon; - let mut next_gc_cutoff = pitr_cutoff; + // the space cutoff. + let mut next_pitr_cutoff = gc_info.cutoffs.time; // If the caller provided a shorter retention period, use that instead of the GC cutoff. let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period { let param_cutoff = Lsn(last_record_lsn.0.saturating_sub(max_retention_period)); - if next_gc_cutoff < param_cutoff { - next_gc_cutoff = param_cutoff; + if next_pitr_cutoff < param_cutoff { + next_pitr_cutoff = param_cutoff; } Some(param_cutoff) } else { @@ -263,7 +256,7 @@ pub(super) async fn gather_inputs( .copied() .collect::>(); - // next_gc_cutoff in parent branch are not of interest (right now at least), nor do we + // next_pitr_cutoff in parent branch are not of interest (right now at least), nor do we // want to query any logical size before initdb_lsn. let branch_start_lsn = cmp::max(ancestor_lsn, timeline.initdb_lsn); @@ -271,10 +264,10 @@ pub(super) async fn gather_inputs( let mut lsns: Vec<(Lsn, LsnKind)> = gc_info .retain_lsns .iter() - .filter(|&&lsn| lsn > ancestor_lsn) + .filter(|(lsn, _child_id)| lsn > &ancestor_lsn) .copied() // this assumes there are no other retain_lsns than the branchpoints - .map(|lsn| (lsn, LsnKind::BranchPoint)) + .map(|(lsn, _child_id)| (lsn, LsnKind::BranchPoint)) .collect::>(); lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint))); @@ -291,10 +284,10 @@ pub(super) async fn gather_inputs( ) } - // Add a point for the GC cutoff - let branch_start_needed = next_gc_cutoff <= branch_start_lsn; + // Add a point for the PITR cutoff + let branch_start_needed = next_pitr_cutoff <= branch_start_lsn; if !branch_start_needed { - lsns.push((next_gc_cutoff, LsnKind::GcCutOff)); + lsns.push((next_pitr_cutoff, LsnKind::GcCutOff)); } lsns.sort_unstable(); @@ -333,7 +326,7 @@ pub(super) async fn gather_inputs( parent: Some(parent), lsn: lsn.0, size: None, - needed: lsn > next_gc_cutoff, + needed: lsn > next_pitr_cutoff, }, timeline_id: timeline.timeline_id, kind, @@ -357,8 +350,8 @@ pub(super) async fn gather_inputs( segment: Segment { parent: Some(lease_parent), lsn: lsn.0, - size: None, // Filled in later, if necessary - needed: lsn > next_gc_cutoff, // only needed if the point is within rentention. + size: None, // Filled in later, if necessary + needed: lsn > next_pitr_cutoff, // only needed if the point is within rentention. }, timeline_id: timeline.timeline_id, kind: LsnKind::LeaseStart, @@ -398,9 +391,7 @@ pub(super) async fn gather_inputs( last_record: last_record_lsn, // this is not used above, because it might not have updated recently enough latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(), - horizon_cutoff, - pitr_cutoff, - next_gc_cutoff, + next_pitr_cutoff, retention_param_cutoff, lease_points, }); @@ -742,9 +733,7 @@ fn verify_size_for_multiple_branches() { "ancestor_lsn": "0/18D3D98", "last_record": "0/2230CD0", "latest_gc_cutoff": "0/1698C48", - "horizon_cutoff": "0/2210CD0", - "pitr_cutoff": "0/2210CD0", - "next_gc_cutoff": "0/2210CD0", + "next_pitr_cutoff": "0/2210CD0", "retention_param_cutoff": null, "lease_points": [] }, @@ -753,9 +742,7 @@ fn verify_size_for_multiple_branches() { "ancestor_lsn": "0/176D998", "last_record": "0/1837770", "latest_gc_cutoff": "0/1698C48", - "horizon_cutoff": "0/1817770", - "pitr_cutoff": "0/1817770", - "next_gc_cutoff": "0/1817770", + "next_pitr_cutoff": "0/1817770", "retention_param_cutoff": null, "lease_points": [] }, @@ -764,9 +751,7 @@ fn verify_size_for_multiple_branches() { "ancestor_lsn": "0/0", "last_record": "0/18D3D98", "latest_gc_cutoff": "0/1698C48", - "horizon_cutoff": "0/18B3D98", - "pitr_cutoff": "0/18B3D98", - "next_gc_cutoff": "0/18B3D98", + "next_pitr_cutoff": "0/18B3D98", "retention_param_cutoff": null, "lease_points": [] } @@ -820,9 +805,7 @@ fn verify_size_for_one_branch() { "ancestor_lsn": "0/0", "last_record": "47/280A5860", "latest_gc_cutoff": "47/240A5860", - "horizon_cutoff": "47/240A5860", - "pitr_cutoff": "47/240A5860", - "next_gc_cutoff": "47/240A5860", + "next_pitr_cutoff": "47/240A5860", "retention_param_cutoff": "0/0", "lease_points": [] } diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 62730f88b2..f931341aca 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -6,35 +6,22 @@ pub(crate) mod inmemory_layer; pub(crate) mod layer; mod layer_desc; mod layer_name; - -#[cfg(test)] pub mod merge_iterator; use crate::context::{AccessStatsBehavior, RequestContext}; use crate::repository::Value; -use crate::task_mgr::TaskKind; use crate::walrecord::NeonWalRecord; use bytes::Bytes; -use enum_map::EnumMap; -use enumset::EnumSet; -use once_cell::sync::Lazy; use pageserver_api::key::Key; use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; -use pageserver_api::models::{ - LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus, -}; -use std::borrow::Cow; use std::cmp::{Ordering, Reverse}; use std::collections::hash_map::Entry; use std::collections::{BinaryHeap, HashMap}; use std::ops::Range; -use std::sync::{Arc, Mutex}; +use std::sync::Arc; use std::time::{Duration, SystemTime, UNIX_EPOCH}; -use tracing::warn; -use utils::history_buffer::HistoryBufferWithDropCounter; -use utils::rate_limit::RateLimit; -use utils::{id::TimelineId, lsn::Lsn}; +use utils::lsn::Lsn; pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef}; pub use image_layer::{ImageLayer, ImageLayerWriter}; @@ -77,9 +64,9 @@ where /// call, to collect more records. /// #[derive(Debug, Default)] -pub struct ValueReconstructState { - pub records: Vec<(Lsn, NeonWalRecord)>, - pub img: Option<(Lsn, Bytes)>, +pub(crate) struct ValueReconstructState { + pub(crate) records: Vec<(Lsn, NeonWalRecord)>, + pub(crate) img: Option<(Lsn, Bytes)>, } #[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] @@ -460,94 +447,92 @@ pub enum ValueReconstructResult { Missing, } -#[derive(Debug)] -pub struct LayerAccessStats(Mutex); - -/// This struct holds two instances of [`LayerAccessStatsInner`]. -/// Accesses are recorded to both instances. -/// The `for_scraping_api`instance can be reset from the management API via [`LayerAccessStatsReset`]. -/// The `for_eviction_policy` is never reset. -#[derive(Debug, Default, Clone)] -struct LayerAccessStatsLocked { - for_scraping_api: LayerAccessStatsInner, - for_eviction_policy: LayerAccessStatsInner, +/// Layers contain a hint indicating whether they are likely to be used for reads. This is a hint rather +/// than an authoritative value, so that we do not have to update it synchronously when changing the visibility +/// of layers (for example when creating a branch that makes some previously covered layers visible). It should +/// be used for cache management but not for correctness-critical checks. +#[derive(Default, Debug, Clone, PartialEq, Eq)] +pub(crate) enum LayerVisibilityHint { + /// A Visible layer might be read while serving a read, because there is not an image layer between it + /// and a readable LSN (the tip of the branch or a child's branch point) + Visible, + /// A Covered layer probably won't be read right now, but _can_ be read in future if someone creates + /// a branch or ephemeral endpoint at an LSN below the layer that covers this. + #[allow(unused)] + Covered, + /// Calculating layer visibilty requires I/O, so until this has happened layers are loaded + /// in this state. Note that newly written layers may be called Visible immediately, this uninitialized + /// state is for when existing layers are constructed while loading a timeline. + #[default] + Uninitialized, } -impl LayerAccessStatsLocked { - fn iter_mut(&mut self) -> impl Iterator { - [&mut self.for_scraping_api, &mut self.for_eviction_policy].into_iter() - } -} - -#[derive(Debug, Default, Clone)] -struct LayerAccessStatsInner { - first_access: Option, - count_by_access_kind: EnumMap, - task_kind_flag: EnumSet, - last_accesses: HistoryBufferWithDropCounter, - last_residence_changes: HistoryBufferWithDropCounter, -} - -#[derive(Debug, Clone, Copy)] -pub(crate) struct LayerAccessStatFullDetails { - pub(crate) when: SystemTime, - pub(crate) task_kind: TaskKind, - pub(crate) access_kind: LayerAccessKind, -} +pub(crate) struct LayerAccessStats(std::sync::atomic::AtomicU64); #[derive(Clone, Copy, strum_macros::EnumString)] -pub enum LayerAccessStatsReset { +pub(crate) enum LayerAccessStatsReset { NoReset, - JustTaskKindFlags, AllStats, } -fn system_time_to_millis_since_epoch(ts: &SystemTime) -> u64 { - ts.duration_since(UNIX_EPOCH) - .expect("better to die in this unlikely case than report false stats") - .as_millis() - .try_into() - .expect("64 bits is enough for few more years") -} +impl Default for LayerAccessStats { + fn default() -> Self { + // Default value is to assume resident since creation time, and visible. + let (_mask, mut value) = Self::to_low_res_timestamp(Self::RTIME_SHIFT, SystemTime::now()); + value |= 0x1 << Self::VISIBILITY_SHIFT; -impl LayerAccessStatFullDetails { - fn as_api_model(&self) -> pageserver_api::models::LayerAccessStatFullDetails { - let Self { - when, - task_kind, - access_kind, - } = self; - pageserver_api::models::LayerAccessStatFullDetails { - when_millis_since_epoch: system_time_to_millis_since_epoch(when), - task_kind: Cow::Borrowed(task_kind.into()), // into static str, powered by strum_macros - access_kind: *access_kind, - } + Self(std::sync::atomic::AtomicU64::new(value)) } } +// Efficient store of two very-low-resolution timestamps and some bits. Used for storing last access time and +// last residence change time. impl LayerAccessStats { - /// Create an empty stats object. - /// - /// The caller is responsible for recording a residence event - /// using [`record_residence_event`] before calling `latest_activity`. - /// If they don't, [`latest_activity`] will return `None`. - /// - /// [`record_residence_event`]: Self::record_residence_event - /// [`latest_activity`]: Self::latest_activity - pub(crate) fn empty_will_record_residence_event_later() -> Self { - LayerAccessStats(Mutex::default()) + // How many high bits to drop from a u32 timestamp? + // - Only storing up to a u32 timestamp will work fine until 2038 (if this code is still in use + // after that, this software has been very successful!) + // - Dropping the top bit is implicitly safe because unix timestamps are meant to be + // stored in an i32, so they never used it. + // - Dropping the next two bits is safe because this code is only running on systems in + // years >= 2024, and these bits have been 1 since 2021 + // + // Therefore we may store only 28 bits for a timestamp with one second resolution. We do + // this truncation to make space for some flags in the high bits of our u64. + const TS_DROP_HIGH_BITS: u32 = u32::count_ones(Self::TS_ONES) + 1; + const TS_MASK: u32 = 0x1f_ff_ff_ff; + const TS_ONES: u32 = 0x60_00_00_00; + + const ATIME_SHIFT: u32 = 0; + const RTIME_SHIFT: u32 = 32 - Self::TS_DROP_HIGH_BITS; + const VISIBILITY_SHIFT: u32 = 64 - 2 * Self::TS_DROP_HIGH_BITS; + + fn write_bits(&self, mask: u64, value: u64) -> u64 { + self.0 + .fetch_update( + // TODO: decide what orderings are correct + std::sync::atomic::Ordering::Relaxed, + std::sync::atomic::Ordering::Relaxed, + |v| Some((v & !mask) | (value & mask)), + ) + .expect("Inner function is infallible") } - /// Create an empty stats object and record a [`LayerLoad`] event with the given residence status. - /// - /// See [`record_residence_event`] for why you need to do this while holding the layer map lock. - /// - /// [`LayerLoad`]: LayerResidenceEventReason::LayerLoad - /// [`record_residence_event`]: Self::record_residence_event - pub(crate) fn for_loading_layer(status: LayerResidenceStatus) -> Self { - let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default())); - new.record_residence_event(status, LayerResidenceEventReason::LayerLoad); - new + fn to_low_res_timestamp(shift: u32, time: SystemTime) -> (u64, u64) { + // Drop the low three bits of the timestamp, for an ~8s accuracy + let timestamp = time.duration_since(UNIX_EPOCH).unwrap().as_secs() & (Self::TS_MASK as u64); + + ((Self::TS_MASK as u64) << shift, timestamp << shift) + } + + fn read_low_res_timestamp(&self, shift: u32) -> Option { + let read = self.0.load(std::sync::atomic::Ordering::Relaxed); + + let ts_bits = (read & ((Self::TS_MASK as u64) << shift)) >> shift; + if ts_bits == 0 { + None + } else { + Some(UNIX_EPOCH + Duration::from_secs(ts_bits | (Self::TS_ONES as u64))) + } } /// Record a change in layer residency. @@ -563,129 +548,112 @@ impl LayerAccessStats { /// - Eviction: imitate access logical size calculation. This accesses the L0 layers because the L1 layer is not yet in the layer map. /// - Compact: Grab layer map lock, add the new L1 to layer map and remove the L0s, release layer map lock. /// - Eviction: observes the new L1 layer whose only activity timestamp is the LayerCreate event. - /// - pub(crate) fn record_residence_event( - &self, - status: LayerResidenceStatus, - reason: LayerResidenceEventReason, - ) { - let mut locked = self.0.lock().unwrap(); - locked.iter_mut().for_each(|inner| { - inner - .last_residence_changes - .write(LayerResidenceEvent::new(status, reason)) - }); + pub(crate) fn record_residence_event_at(&self, now: SystemTime) { + let (mask, value) = Self::to_low_res_timestamp(Self::RTIME_SHIFT, now); + self.write_bits(mask, value); } - fn record_access(&self, access_kind: LayerAccessKind, ctx: &RequestContext) { + pub(crate) fn record_residence_event(&self) { + self.record_residence_event_at(SystemTime::now()) + } + + pub(crate) fn record_access_at(&self, now: SystemTime) { + let (mut mask, mut value) = Self::to_low_res_timestamp(Self::ATIME_SHIFT, now); + + // A layer which is accessed must be visible. + mask |= 0x1 << Self::VISIBILITY_SHIFT; + value |= 0x1 << Self::VISIBILITY_SHIFT; + + self.write_bits(mask, value); + } + + pub(crate) fn record_access(&self, ctx: &RequestContext) { if ctx.access_stats_behavior() == AccessStatsBehavior::Skip { return; } - let this_access = LayerAccessStatFullDetails { - when: SystemTime::now(), - task_kind: ctx.task_kind(), - access_kind, - }; - - let mut locked = self.0.lock().unwrap(); - locked.iter_mut().for_each(|inner| { - inner.first_access.get_or_insert(this_access); - inner.count_by_access_kind[access_kind] += 1; - inner.task_kind_flag |= ctx.task_kind(); - inner.last_accesses.write(this_access); - }) + self.record_access_at(SystemTime::now()) } fn as_api_model( &self, reset: LayerAccessStatsReset, ) -> pageserver_api::models::LayerAccessStats { - let mut locked = self.0.lock().unwrap(); - let inner = &mut locked.for_scraping_api; - let LayerAccessStatsInner { - first_access, - count_by_access_kind, - task_kind_flag, - last_accesses, - last_residence_changes, - } = inner; let ret = pageserver_api::models::LayerAccessStats { - access_count_by_access_kind: count_by_access_kind - .iter() - .map(|(kind, count)| (kind, *count)) - .collect(), - task_kind_access_flag: task_kind_flag - .iter() - .map(|task_kind| Cow::Borrowed(task_kind.into())) // into static str, powered by strum_macros - .collect(), - first: first_access.as_ref().map(|a| a.as_api_model()), - accesses_history: last_accesses.map(|m| m.as_api_model()), - residence_events_history: last_residence_changes.clone(), + access_time: self + .read_low_res_timestamp(Self::ATIME_SHIFT) + .unwrap_or(UNIX_EPOCH), + residence_time: self + .read_low_res_timestamp(Self::RTIME_SHIFT) + .unwrap_or(UNIX_EPOCH), + visible: matches!(self.visibility(), LayerVisibilityHint::Visible), }; match reset { - LayerAccessStatsReset::NoReset => (), - LayerAccessStatsReset::JustTaskKindFlags => { - inner.task_kind_flag.clear(); - } + LayerAccessStatsReset::NoReset => {} LayerAccessStatsReset::AllStats => { - *inner = LayerAccessStatsInner::default(); + self.write_bits((Self::TS_MASK as u64) << Self::ATIME_SHIFT, 0x0); + self.write_bits((Self::TS_MASK as u64) << Self::RTIME_SHIFT, 0x0); } } ret } - /// Get the latest access timestamp, falling back to latest residence event, further falling - /// back to `SystemTime::now` for a usable timestamp for eviction. - pub(crate) fn latest_activity_or_now(&self) -> SystemTime { - self.latest_activity().unwrap_or_else(SystemTime::now) + /// Get the latest access timestamp, falling back to latest residence event. The latest residence event + /// will be this Layer's construction time, if its residence hasn't changed since then. + pub(crate) fn latest_activity(&self) -> SystemTime { + if let Some(t) = self.read_low_res_timestamp(Self::ATIME_SHIFT) { + t + } else { + self.read_low_res_timestamp(Self::RTIME_SHIFT) + .expect("Residence time is set on construction") + } } - /// Get the latest access timestamp, falling back to latest residence event. + /// Whether this layer has been accessed (excluding in [`AccessStatsBehavior::Skip`]). /// - /// This function can only return `None` if there has not yet been a call to the - /// [`record_residence_event`] method. That would generally be considered an - /// implementation error. This function logs a rate-limited warning in that case. - /// - /// TODO: use type system to avoid the need for `fallback`. - /// The approach in - /// could be used to enforce that a residence event is recorded - /// before a layer is added to the layer map. We could also have - /// a layer wrapper type that holds the LayerAccessStats, and ensure - /// that that type can only be produced by inserting into the layer map. - /// - /// [`record_residence_event`]: Self::record_residence_event - fn latest_activity(&self) -> Option { - let locked = self.0.lock().unwrap(); - let inner = &locked.for_eviction_policy; - match inner.last_accesses.recent() { - Some(a) => Some(a.when), - None => match inner.last_residence_changes.recent() { - Some(e) => Some(e.timestamp), - None => { - static WARN_RATE_LIMIT: Lazy> = - Lazy::new(|| Mutex::new((0, RateLimit::new(Duration::from_secs(10))))); - let mut guard = WARN_RATE_LIMIT.lock().unwrap(); - guard.0 += 1; - let occurences = guard.0; - guard.1.call(move || { - warn!(parent: None, occurences, "latest_activity not available, this is an implementation bug, using fallback value"); - }); - None - } - }, + /// This indicates whether the layer has been used for some purpose that would motivate + /// us to keep it on disk, such as for serving a getpage request. + fn accessed(&self) -> bool { + // Consider it accessed if the most recent access is more recent than + // the most recent change in residence status. + match ( + self.read_low_res_timestamp(Self::ATIME_SHIFT), + self.read_low_res_timestamp(Self::RTIME_SHIFT), + ) { + (None, _) => false, + (Some(_), None) => true, + (Some(a), Some(r)) => a >= r, + } + } + + pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) { + let value = match visibility { + LayerVisibilityHint::Visible => 0x1 << Self::VISIBILITY_SHIFT, + LayerVisibilityHint::Covered | LayerVisibilityHint::Uninitialized => 0x0, + }; + + self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value); + } + + pub(crate) fn visibility(&self) -> LayerVisibilityHint { + let read = self.0.load(std::sync::atomic::Ordering::Relaxed); + match (read >> Self::VISIBILITY_SHIFT) & 0x1 { + 1 => LayerVisibilityHint::Visible, + 0 => LayerVisibilityHint::Covered, + _ => unreachable!(), } } } /// Get a layer descriptor from a layer. -pub trait AsLayerDesc { +pub(crate) trait AsLayerDesc { /// Get the layer descriptor. fn layer_desc(&self) -> &PersistentLayerDesc; } pub mod tests { use pageserver_api::shard::TenantShardId; + use utils::id::TimelineId; use super::*; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index dfd0196c87..586a7b7836 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -33,11 +33,14 @@ use crate::page_cache::{self, FileId, PAGE_SZ}; use crate::repository::{Key, Value, KEY_SIZE}; use crate::tenant::blob_io::BlobWriter; use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader}; -use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; +use crate::tenant::disk_btree::{ + DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection, +}; use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ - BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner, + BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, + VectoredReadPlanner, }; use crate::tenant::{PageReconstructError, Timeline}; use crate::virtual_file::{self, VirtualFile}; @@ -49,10 +52,11 @@ use camino::{Utf8Path, Utf8PathBuf}; use futures::StreamExt; use itertools::Itertools; use pageserver_api::keyspace::KeySpace; -use pageserver_api::models::{ImageCompressionAlgorithm, LayerAccessKind}; +use pageserver_api::models::ImageCompressionAlgorithm; use pageserver_api::shard::TenantShardId; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; +use std::collections::VecDeque; use std::fs::File; use std::io::SeekFrom; use std::ops::Range; @@ -261,7 +265,7 @@ impl DeltaLayer { return Ok(()); } - let inner = self.load(LayerAccessKind::Dump, ctx).await?; + let inner = self.load(ctx).await?; inner.dump(ctx).await } @@ -294,12 +298,8 @@ impl DeltaLayer { /// Open the underlying file and read the metadata into memory, if it's /// not loaded already. /// - async fn load( - &self, - access_kind: LayerAccessKind, - ctx: &RequestContext, - ) -> Result<&Arc> { - self.access_stats.record_access(access_kind, ctx); + async fn load(&self, ctx: &RequestContext) -> Result<&Arc> { + self.access_stats.record_access(ctx); // Quick exit if already loaded self.inner .get_or_try_init(|| self.load_inner(ctx)) @@ -352,7 +352,7 @@ impl DeltaLayer { summary.lsn_range, metadata.len(), ), - access_stats: LayerAccessStats::empty_will_record_residence_event_later(), + access_stats: Default::default(), inner: OnceCell::new(), }) } @@ -456,7 +456,12 @@ impl DeltaLayerWriterInner { will_init: bool, ctx: &RequestContext, ) -> (Vec, anyhow::Result<()>) { - assert!(self.lsn_range.start <= lsn); + assert!( + self.lsn_range.start <= lsn, + "lsn_start={}, lsn={}", + self.lsn_range.start, + lsn + ); // We don't want to use compression in delta layer creation let compression = ImageCompressionAlgorithm::Disabled; let (val, res) = self @@ -747,12 +752,10 @@ impl DeltaLayer { } impl DeltaLayerInner { - #[cfg(test)] pub(crate) fn key_range(&self) -> &Range { &self.layer_key_range } - #[cfg(test)] pub(crate) fn lsn_range(&self) -> &Range { &self.layer_lsn_range } @@ -1180,9 +1183,7 @@ impl DeltaLayerInner { let delta_key = DeltaKey::from_slice(key); let val_ref = ValueRef { blob_ref: BlobRef(value), - reader: BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter( - Adapter(self), - )), + layer: self, }; let pos = BlobRef(value).pos(); if let Some(last) = all_keys.last_mut() { @@ -1321,7 +1322,7 @@ impl DeltaLayerInner { offsets.start.pos(), offsets.end.pos(), meta, - Some(max_read_size), + max_read_size, )) } } else { @@ -1426,7 +1427,7 @@ impl DeltaLayerInner { let keys = self.load_keys(ctx).await?; async fn dump_blob(val: &ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result { - let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?; + let buf = val.load_raw(ctx).await?; let val = Value::des(&buf)?; let desc = match val { Value::Image(img) => { @@ -1461,8 +1462,7 @@ impl DeltaLayerInner { use pageserver_api::key::CHECKPOINT_KEY; use postgres_ffi::CheckPoint; if key == CHECKPOINT_KEY { - let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?; - let val = Value::des(&buf)?; + let val = val.load(ctx).await?; match val { Value::Image(img) => { let checkpoint = CheckPoint::decode(&img)?; @@ -1515,7 +1515,6 @@ impl DeltaLayerInner { offset } - #[cfg(test)] pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> DeltaLayerIterator<'a> { let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = @@ -1526,7 +1525,7 @@ impl DeltaLayerInner { index_iter: tree_reader.iter(&[0; DELTA_KEY_SIZE], ctx), key_values_batch: std::collections::VecDeque::new(), is_end: false, - planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner::new( + planner: StreamingVectoredReadPlanner::new( 1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer. 1024, // The default value. Unit tests might use a different value ), @@ -1547,17 +1546,24 @@ pub struct DeltaEntry<'a> { /// Reference to an on-disk value pub struct ValueRef<'a> { blob_ref: BlobRef, - reader: BlockCursor<'a>, + layer: &'a DeltaLayerInner, } impl<'a> ValueRef<'a> { /// Loads the value from disk pub async fn load(&self, ctx: &RequestContext) -> Result { - // theoretically we *could* record an access time for each, but it does not really matter - let buf = self.reader.read_blob(self.blob_ref.pos(), ctx).await?; + let buf = self.load_raw(ctx).await?; let val = Value::des(&buf)?; Ok(val) } + + async fn load_raw(&self, ctx: &RequestContext) -> Result> { + let reader = BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(Adapter( + self.layer, + ))); + let buf = reader.read_blob(self.blob_ref.pos(), ctx).await?; + Ok(buf) + } } pub(crate) struct Adapter(T); @@ -1591,17 +1597,15 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del } } -#[cfg(test)] pub struct DeltaLayerIterator<'a> { delta_layer: &'a DeltaLayerInner, ctx: &'a RequestContext, - planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner, - index_iter: crate::tenant::disk_btree::DiskBtreeIterator<'a>, - key_values_batch: std::collections::VecDeque<(Key, Lsn, Value)>, + planner: StreamingVectoredReadPlanner, + index_iter: DiskBtreeIterator<'a>, + key_values_batch: VecDeque<(Key, Lsn, Value)>, is_end: bool, } -#[cfg(test)] impl<'a> DeltaLayerIterator<'a> { /// Retrieve a batch of key-value pairs into the iterator buffer. async fn next_batch(&mut self) -> anyhow::Result<()> { @@ -1615,13 +1619,17 @@ impl<'a> DeltaLayerIterator<'a> { let lsn = DeltaKey::extract_lsn_from_buf(&raw_key); let blob_ref = BlobRef(value); let offset = blob_ref.pos(); - if let Some(batch_plan) = self.planner.handle(key, lsn, offset, BlobFlag::None) { + if let Some(batch_plan) = self.planner.handle(key, lsn, offset) { break batch_plan; } } else { self.is_end = true; let data_end_offset = self.delta_layer.index_start_offset(); - break self.planner.handle_range_end(data_end_offset); + if let Some(item) = self.planner.handle_range_end(data_end_offset) { + break item; + } else { + return Ok(()); // TODO: test empty iterator + } } }; let vectored_blob_reader = VectoredBlobReader::new(&self.delta_layer.file); @@ -1664,6 +1672,7 @@ pub(crate) mod test { use rand::RngCore; use super::*; + use crate::repository::Value; use crate::tenant::harness::TIMELINE_ID; use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; use crate::tenant::Tenant; @@ -1673,6 +1682,7 @@ pub(crate) mod test { tenant::{disk_btree::tests::TestDisk, harness::TenantHarness}, DEFAULT_PG_VERSION, }; + use bytes::Bytes; /// Construct an index for a fictional delta layer and and then /// traverse in order to plan vectored reads for a query. Finally, @@ -1925,7 +1935,7 @@ pub(crate) mod test { #[tokio::test] async fn test_delta_layer_vectored_read_end_to_end() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read")?; + let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read").await?; let (tenant, ctx) = harness.load().await; let timeline_id = TimelineId::generate(); @@ -2025,7 +2035,9 @@ pub(crate) mod test { use crate::walrecord::NeonWalRecord; use bytes::Bytes; - let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke").unwrap(); + let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke") + .await + .unwrap(); let (tenant, ctx) = h.load().await; let ctx = &ctx; let timeline = tenant @@ -2241,6 +2253,15 @@ pub(crate) mod test { (k1, l1).cmp(&(k2, l2)) } + pub(crate) fn sort_delta_value( + (k1, l1, v1): &(Key, Lsn, Value), + (k2, l2, v2): &(Key, Lsn, Value), + ) -> std::cmp::Ordering { + let order_1 = if v1.is_image() { 0 } else { 1 }; + let order_2 = if v2.is_image() { 0 } else { 1 }; + (k1, l1, order_1).cmp(&(k2, l2, order_2)) + } + pub(crate) async fn produce_delta_layer( tenant: &Tenant, tline: &Arc, @@ -2249,7 +2270,7 @@ pub(crate) mod test { ) -> anyhow::Result { deltas.sort_by(sort_delta); let (key_start, _, _) = deltas.first().unwrap(); - let (key_max, _, _) = deltas.first().unwrap(); + let (key_max, _, _) = deltas.last().unwrap(); let lsn_min = deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap(); let lsn_max = deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap(); let lsn_end = Lsn(lsn_max.0 + 1); @@ -2294,10 +2315,7 @@ pub(crate) mod test { #[tokio::test] async fn delta_layer_iterator() { - use crate::repository::Value; - use bytes::Bytes; - - let harness = TenantHarness::create("delta_layer_iterator").unwrap(); + let harness = TenantHarness::create("delta_layer_iterator").await.unwrap(); let (tenant, ctx) = harness.load().await; let tline = tenant diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 1e03e1a58c..e5e7f71928 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -29,13 +29,16 @@ use crate::page_cache::{self, FileId, PAGE_SZ}; use crate::repository::{Key, Value, KEY_SIZE}; use crate::tenant::blob_io::BlobWriter; use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader}; -use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; +use crate::tenant::disk_btree::{ + DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection, +}; use crate::tenant::storage_layer::{ LayerAccessStats, ValueReconstructResult, ValueReconstructState, }; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ - BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner, + BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, + VectoredReadPlanner, }; use crate::tenant::{PageReconstructError, Timeline}; use crate::virtual_file::{self, VirtualFile}; @@ -46,10 +49,10 @@ use camino::{Utf8Path, Utf8PathBuf}; use hex; use itertools::Itertools; use pageserver_api::keyspace::KeySpace; -use pageserver_api::models::LayerAccessKind; use pageserver_api::shard::{ShardIdentity, TenantShardId}; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; +use std::collections::VecDeque; use std::fs::File; use std::io::SeekFrom; use std::ops::Range; @@ -224,7 +227,7 @@ impl ImageLayer { return Ok(()); } - let inner = self.load(LayerAccessKind::Dump, ctx).await?; + let inner = self.load(ctx).await?; inner.dump(ctx).await?; @@ -251,12 +254,8 @@ impl ImageLayer { /// Open the underlying file and read the metadata into memory, if it's /// not loaded already. /// - async fn load( - &self, - access_kind: LayerAccessKind, - ctx: &RequestContext, - ) -> Result<&ImageLayerInner> { - self.access_stats.record_access(access_kind, ctx); + async fn load(&self, ctx: &RequestContext) -> Result<&ImageLayerInner> { + self.access_stats.record_access(ctx); self.inner .get_or_try_init(|| self.load_inner(ctx)) .await @@ -308,7 +307,7 @@ impl ImageLayer { metadata.len(), ), // Now we assume image layer ALWAYS covers the full range. This may change in the future. lsn: summary.lsn, - access_stats: LayerAccessStats::empty_will_record_residence_event_later(), + access_stats: Default::default(), inner: OnceCell::new(), }) } @@ -369,12 +368,10 @@ impl ImageLayer { } impl ImageLayerInner { - #[cfg(test)] pub(crate) fn key_range(&self) -> &Range { &self.key_range } - #[cfg(test)] pub(crate) fn lsn(&self) -> Lsn { self.lsn } @@ -699,7 +696,6 @@ impl ImageLayerInner { } } - #[cfg(test)] pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> ImageLayerIterator<'a> { let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = @@ -708,9 +704,9 @@ impl ImageLayerInner { image_layer: self, ctx, index_iter: tree_reader.iter(&[0; KEY_SIZE], ctx), - key_values_batch: std::collections::VecDeque::new(), + key_values_batch: VecDeque::new(), is_end: false, - planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner::new( + planner: StreamingVectoredReadPlanner::new( 1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer. 1024, // The default value. Unit tests might use a different value ), @@ -737,6 +733,9 @@ struct ImageLayerWriterInner { key_range: Range, lsn: Lsn, + // Total uncompressed bytes passed into put_image + uncompressed_bytes: u64, + blob_writer: BlobWriter, tree: DiskBtreeBuilder, } @@ -792,6 +791,7 @@ impl ImageLayerWriterInner { lsn, tree: tree_builder, blob_writer, + uncompressed_bytes: 0, }; Ok(writer) @@ -809,7 +809,12 @@ impl ImageLayerWriterInner { ctx: &RequestContext, ) -> anyhow::Result<()> { ensure!(self.key_range.contains(&key)); - let (_img, res) = self.blob_writer.write_blob(img, ctx).await; + let compression = self.conf.image_compression; + self.uncompressed_bytes += img.len() as u64; + let (_img, res) = self + .blob_writer + .write_blob_maybe_compressed(img, ctx, compression) + .await; // TODO: re-use the buffer for `img` further upstack let off = res?; @@ -831,6 +836,11 @@ impl ImageLayerWriterInner { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; + // Calculate compression ratio + let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header + crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES.inc_by(self.uncompressed_bytes); + crate::metrics::COMPRESSION_IMAGE_OUTPUT_BYTES.inc_by(compressed_size); + let mut file = self.blob_writer.into_inner(); // Write out the index @@ -970,17 +980,15 @@ impl Drop for ImageLayerWriter { } } -#[cfg(test)] pub struct ImageLayerIterator<'a> { image_layer: &'a ImageLayerInner, ctx: &'a RequestContext, - planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner, - index_iter: crate::tenant::disk_btree::DiskBtreeIterator<'a>, - key_values_batch: std::collections::VecDeque<(Key, Lsn, Value)>, + planner: StreamingVectoredReadPlanner, + index_iter: DiskBtreeIterator<'a>, + key_values_batch: VecDeque<(Key, Lsn, Value)>, is_end: bool, } -#[cfg(test)] impl<'a> ImageLayerIterator<'a> { /// Retrieve a batch of key-value pairs into the iterator buffer. async fn next_batch(&mut self) -> anyhow::Result<()> { @@ -994,14 +1002,17 @@ impl<'a> ImageLayerIterator<'a> { Key::from_slice(&raw_key[..KEY_SIZE]), self.image_layer.lsn, offset, - BlobFlag::None, ) { break batch_plan; } } else { self.is_end = true; let payload_end = self.image_layer.index_start_blk as u64 * PAGE_SZ as u64; - break self.planner.handle_range_end(payload_end); + if let Some(item) = self.planner.handle_range_end(payload_end) { + break item; + } else { + return Ok(()); // TODO: a test case on empty iterator + } } }; let vectored_blob_reader = VectoredBlobReader::new(&self.image_layer.file); @@ -1095,6 +1106,7 @@ mod test { ShardIdentity::unsharded(), get_next_gen(), ) + .await .unwrap(); let (tenant, ctx) = harness.load().await; let timeline = tenant @@ -1161,6 +1173,7 @@ mod test { // But here, all we care about is that the gen number is unique. get_next_gen(), ) + .await .unwrap(); let (tenant, ctx) = harness.load().await; let timeline = tenant @@ -1292,7 +1305,7 @@ mod test { #[tokio::test] async fn image_layer_iterator() { - let harness = TenantHarness::create("image_layer_iterator").unwrap(); + let harness = TenantHarness::create("image_layer_iterator").await.unwrap(); let (tenant, ctx) = harness.load().await; let tline = tenant diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 5941a52e98..f9010ae8a6 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -18,7 +18,7 @@ use anyhow::{anyhow, ensure, Result}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; -use std::collections::{BTreeMap, BinaryHeap, HashSet}; +use std::collections::BTreeMap; use std::sync::{Arc, OnceLock}; use std::time::Instant; use tracing::*; @@ -375,15 +375,6 @@ impl InMemoryLayer { let inner = self.inner.read().await; let reader = inner.file.block_cursor(); - #[derive(Eq, PartialEq, Ord, PartialOrd)] - struct BlockRead { - key: Key, - lsn: Lsn, - block_offset: u64, - } - - let mut planned_block_reads = BinaryHeap::new(); - for range in keyspace.ranges.iter() { for (key, vec_map) in inner.index.range(range.start..range.end) { let lsn_range = match reconstruct_state.get_cached_lsn(key) { @@ -392,49 +383,32 @@ impl InMemoryLayer { }; let slice = vec_map.slice_range(lsn_range); + for (entry_lsn, pos) in slice.iter().rev() { - planned_block_reads.push(BlockRead { - key: *key, - lsn: *entry_lsn, - block_offset: *pos, - }); + // TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183 + let buf = reader.read_blob(*pos, &ctx).await; + if let Err(e) = buf { + reconstruct_state + .on_key_error(*key, PageReconstructError::from(anyhow!(e))); + break; + } + + let value = Value::des(&buf.unwrap()); + if let Err(e) = value { + reconstruct_state + .on_key_error(*key, PageReconstructError::from(anyhow!(e))); + break; + } + + let key_situation = + reconstruct_state.update_key(key, *entry_lsn, value.unwrap()); + if key_situation == ValueReconstructSituation::Complete { + break; + } } } } - let keyspace_size = keyspace.total_raw_size(); - - let mut completed_keys = HashSet::new(); - while completed_keys.len() < keyspace_size && !planned_block_reads.is_empty() { - let block_read = planned_block_reads.pop().unwrap(); - if completed_keys.contains(&block_read.key) { - continue; - } - - // TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183 - let buf = reader.read_blob(block_read.block_offset, &ctx).await; - if let Err(e) = buf { - reconstruct_state - .on_key_error(block_read.key, PageReconstructError::from(anyhow!(e))); - completed_keys.insert(block_read.key); - continue; - } - - let value = Value::des(&buf.unwrap()); - if let Err(e) = value { - reconstruct_state - .on_key_error(block_read.key, PageReconstructError::from(anyhow!(e))); - completed_keys.insert(block_read.key); - continue; - } - - let key_situation = - reconstruct_state.update_key(&block_read.key, block_read.lsn, value.unwrap()); - if key_situation == ValueReconstructSituation::Complete { - completed_keys.insert(block_read.key); - } - } - reconstruct_state.on_lsn_advanced(&keyspace, self.start_lsn); Ok(()) diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 02069c29d2..1db3e7c675 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1,9 +1,7 @@ use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::keyspace::KeySpace; -use pageserver_api::models::{ - HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus, -}; +use pageserver_api::models::HistoricLayerInfo; use pageserver_api::shard::{ShardIdentity, ShardIndex, TenantShardId}; use std::ops::Range; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; @@ -160,13 +158,10 @@ impl Layer { metadata.file_size, ); - let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted); - let owner = Layer(Arc::new(LayerInner::new( conf, timeline, local_path, - access_stats, desc, None, metadata.generation, @@ -193,8 +188,6 @@ impl Layer { metadata.file_size, ); - let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident); - let mut resident = None; let owner = Layer(Arc::new_cyclic(|owner| { @@ -209,7 +202,6 @@ impl Layer { conf, timeline, local_path, - access_stats, desc, Some(inner), metadata.generation, @@ -245,11 +237,6 @@ impl Layer { version: 0, }); resident = Some(inner.clone()); - let access_stats = LayerAccessStats::empty_will_record_residence_event_later(); - access_stats.record_residence_event( - LayerResidenceStatus::Resident, - LayerResidenceEventReason::LayerCreate, - ); let local_path = local_layer_path( conf, @@ -259,16 +246,22 @@ impl Layer { &timeline.generation, ); - LayerInner::new( + let layer = LayerInner::new( conf, timeline, local_path, - access_stats, desc, Some(inner), timeline.generation, timeline.get_shard_index(), - ) + ); + + // Newly created layers are marked visible by default: the usual case is that they were created to be read. + layer + .access_stats + .set_visibility(super::LayerVisibilityHint::Visible); + + layer })); let downloaded = resident.expect("just initialized"); @@ -332,9 +325,7 @@ impl Layer { use anyhow::ensure; let layer = self.0.get_or_maybe_download(true, Some(ctx)).await?; - self.0 - .access_stats - .record_access(LayerAccessKind::GetValueReconstructData, ctx); + self.0.access_stats.record_access(ctx); if self.layer_desc().is_delta { ensure!(lsn_range.start >= self.layer_desc().lsn_range.start); @@ -368,9 +359,7 @@ impl Layer { other => GetVectoredError::Other(anyhow::anyhow!(other)), })?; - self.0 - .access_stats - .record_access(LayerAccessKind::GetValueReconstructData, ctx); + self.0.access_stats.record_access(ctx); layer .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx) @@ -385,6 +374,7 @@ impl Layer { } /// Get all key/values in the layer. Should be replaced with an iterator-based API in the future. + #[allow(dead_code)] pub(crate) async fn load_key_values( &self, ctx: &RequestContext, @@ -693,6 +683,18 @@ impl Drop for LayerInner { // and we could be delaying shutdown for nothing. } + if let Some(timeline) = self.timeline.upgrade() { + // Only need to decrement metrics if the timeline still exists: otherwise + // it will have already de-registered these metrics via TimelineMetrics::shutdown + if self.desc.is_delta() { + timeline.metrics.layer_count_delta.dec(); + timeline.metrics.layer_size_delta.sub(self.desc.file_size); + } else { + timeline.metrics.layer_count_image.dec(); + timeline.metrics.layer_size_image.sub(self.desc.file_size); + } + } + if !*self.wanted_deleted.get_mut() { return; } @@ -773,7 +775,6 @@ impl LayerInner { conf: &'static PageServerConf, timeline: &Arc, local_path: Utf8PathBuf, - access_stats: LayerAccessStats, desc: PersistentLayerDesc, downloaded: Option>, generation: Generation, @@ -791,6 +792,15 @@ impl LayerInner { (heavier_once_cell::OnceCell::default(), 0, Status::Evicted) }; + // This object acts as a RAII guard on these metrics: increment on construction + if desc.is_delta() { + timeline.metrics.layer_count_delta.inc(); + timeline.metrics.layer_size_delta.add(desc.file_size); + } else { + timeline.metrics.layer_count_image.inc(); + timeline.metrics.layer_size_image.add(desc.file_size); + } + LayerInner { conf, debug_str: { @@ -799,7 +809,7 @@ impl LayerInner { path: local_path, desc, timeline: Arc::downgrade(timeline), - access_stats, + access_stats: Default::default(), wanted_deleted: AtomicBool::new(false), inner, version: AtomicUsize::new(version), @@ -1154,10 +1164,7 @@ impl LayerInner { LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction); } - self.access_stats.record_residence_event( - LayerResidenceStatus::Resident, - LayerResidenceEventReason::ResidenceChange, - ); + self.access_stats.record_residence_event(); Ok(self.initialize_after_layer_is_on_disk(permit)) } @@ -1276,7 +1283,7 @@ impl LayerInner { lsn_end: lsn_range.end, remote: !resident, access_stats, - l0: crate::tenant::layer_map::LayerMap::is_l0(self.layer_desc()), + l0: crate::tenant::layer_map::LayerMap::is_l0(&self.layer_desc().key_range), } } else { let lsn = self.desc.image_layer_lsn(); @@ -1469,14 +1476,22 @@ impl LayerInner { let duration = SystemTime::now().duration_since(local_layer_mtime); match duration { Ok(elapsed) => { - timeline - .metrics - .evictions_with_low_residence_duration - .read() - .unwrap() - .observe(elapsed); + let accessed = self.access_stats.accessed(); + if accessed { + // Only layers used for reads contribute to our "low residence" metric that is used + // to detect thrashing. Layers promoted for other reasons (e.g. compaction) are allowed + // to be rapidly evicted without contributing to this metric. + timeline + .metrics + .evictions_with_low_residence_duration + .read() + .unwrap() + .observe(elapsed); + } + tracing::info!( residence_millis = elapsed.as_millis(), + accessed, "evicted layer after known residence period" ); } @@ -1503,10 +1518,7 @@ impl LayerInner { } } - self.access_stats.record_residence_event( - LayerResidenceStatus::Evicted, - LayerResidenceEventReason::ResidenceChange, - ); + self.access_stats.record_residence_event(); self.status.as_ref().unwrap().send_replace(Status::Evicted); @@ -1832,9 +1844,7 @@ impl ResidentLayer { // this is valid because the DownloadedLayer::kind is a OnceCell, not a // Mutex, so we cannot go and deinitialize the value with OnceCell::take // while it's being held. - owner - .access_stats - .record_access(LayerAccessKind::KeyIter, ctx); + owner.access_stats.record_access(ctx); delta_layer::DeltaLayerInner::load_keys(d, ctx) .await @@ -1889,7 +1899,7 @@ impl ResidentLayer { self.owner.metadata() } - #[cfg(test)] + /// Cast the layer to a delta, return an error if it is an image layer. pub(crate) async fn get_as_delta( &self, ctx: &RequestContext, @@ -1901,7 +1911,7 @@ impl ResidentLayer { } } - #[cfg(test)] + /// Cast the layer to an image, return an error if it is a delta layer. pub(crate) async fn get_as_image( &self, ctx: &RequestContext, diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 3a7aca7a6c..d5d2f748a9 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -1,3 +1,5 @@ +use std::time::UNIX_EPOCH; + use pageserver_api::key::CONTROLFILE_KEY; use tokio::task::JoinSet; use utils::{ @@ -7,7 +9,7 @@ use utils::{ use super::failpoints::{Failpoint, FailpointKind}; use super::*; -use crate::context::DownloadBehavior; +use crate::{context::DownloadBehavior, tenant::storage_layer::LayerVisibilityHint}; use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness}; /// Used in tests to advance a future to wanted await point, and not futher. @@ -22,7 +24,7 @@ const FOREVER: std::time::Duration = std::time::Duration::from_secs(ADVANCE.as_s async fn smoke_test() { let handle = tokio::runtime::Handle::current(); - let h = TenantHarness::create("smoke_test").unwrap(); + let h = TenantHarness::create("smoke_test").await.unwrap(); let span = h.span(); let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); let (tenant, _) = h.load().await; @@ -176,7 +178,9 @@ async fn evict_and_wait_on_wanted_deleted() { // this is the runtime on which Layer spawns the blocking tasks on let handle = tokio::runtime::Handle::current(); - let h = TenantHarness::create("evict_and_wait_on_wanted_deleted").unwrap(); + let h = TenantHarness::create("evict_and_wait_on_wanted_deleted") + .await + .unwrap(); utils::logging::replace_panic_hook_with_tracing_panic_hook().forget(); let (tenant, ctx) = h.load().await; @@ -258,7 +262,9 @@ fn read_wins_pending_eviction() { rt.block_on(async move { // this is the runtime on which Layer spawns the blocking tasks on let handle = tokio::runtime::Handle::current(); - let h = TenantHarness::create("read_wins_pending_eviction").unwrap(); + let h = TenantHarness::create("read_wins_pending_eviction") + .await + .unwrap(); let (tenant, ctx) = h.load().await; let span = h.span(); let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); @@ -390,7 +396,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) { rt.block_on(async move { // this is the runtime on which Layer spawns the blocking tasks on let handle = tokio::runtime::Handle::current(); - let h = TenantHarness::create(name).unwrap(); + let h = TenantHarness::create(name).await.unwrap(); let (tenant, ctx) = h.load().await; let span = h.span(); let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); @@ -559,8 +565,9 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) { #[tokio::test(start_paused = true)] async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() { let handle = tokio::runtime::Handle::current(); - let h = - TenantHarness::create("cancelled_get_or_maybe_download_does_not_cancel_eviction").unwrap(); + let h = TenantHarness::create("cancelled_get_or_maybe_download_does_not_cancel_eviction") + .await + .unwrap(); let (tenant, ctx) = h.load().await; let timeline = tenant @@ -636,7 +643,9 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() { #[tokio::test(start_paused = true)] async fn evict_and_wait_does_not_wait_for_download() { // let handle = tokio::runtime::Handle::current(); - let h = TenantHarness::create("evict_and_wait_does_not_wait_for_download").unwrap(); + let h = TenantHarness::create("evict_and_wait_does_not_wait_for_download") + .await + .unwrap(); let (tenant, ctx) = h.load().await; let span = h.span(); let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); @@ -733,7 +742,9 @@ async fn eviction_cancellation_on_drop() { // this is the runtime on which Layer spawns the blocking tasks on let handle = tokio::runtime::Handle::current(); - let h = TenantHarness::create("eviction_cancellation_on_drop").unwrap(); + let h = TenantHarness::create("eviction_cancellation_on_drop") + .await + .unwrap(); utils::logging::replace_panic_hook_with_tracing_panic_hook().forget(); let (tenant, ctx) = h.load().await; @@ -817,9 +828,9 @@ async fn eviction_cancellation_on_drop() { #[test] #[cfg(target_arch = "x86_64")] fn layer_size() { - assert_eq!(std::mem::size_of::(), 2040); + assert_eq!(std::mem::size_of::(), 8); assert_eq!(std::mem::size_of::(), 104); - assert_eq!(std::mem::size_of::(), 2344); + assert_eq!(std::mem::size_of::(), 312); // it also has the utf8 path } @@ -959,3 +970,46 @@ fn spawn_blocking_pool_helper_actually_works() { println!("joined"); }); } + +/// Drop the low bits from a time, to emulate the precision loss in LayerAccessStats +fn lowres_time(hires: SystemTime) -> SystemTime { + let ts = hires.duration_since(UNIX_EPOCH).unwrap().as_secs(); + UNIX_EPOCH + Duration::from_secs(ts) +} + +#[test] +fn access_stats() { + let access_stats = LayerAccessStats::default(); + // Default is visible + assert_eq!(access_stats.visibility(), LayerVisibilityHint::Visible); + + access_stats.set_visibility(LayerVisibilityHint::Covered); + assert_eq!(access_stats.visibility(), LayerVisibilityHint::Covered); + access_stats.set_visibility(LayerVisibilityHint::Visible); + assert_eq!(access_stats.visibility(), LayerVisibilityHint::Visible); + + let rtime = UNIX_EPOCH + Duration::from_secs(2000000000); + access_stats.record_residence_event_at(rtime); + assert_eq!(access_stats.latest_activity(), lowres_time(rtime)); + + let atime = UNIX_EPOCH + Duration::from_secs(2100000000); + access_stats.record_access_at(atime); + assert_eq!(access_stats.latest_activity(), lowres_time(atime)); + + // Setting visibility doesn't clobber access time + access_stats.set_visibility(LayerVisibilityHint::Covered); + assert_eq!(access_stats.latest_activity(), lowres_time(atime)); + access_stats.set_visibility(LayerVisibilityHint::Visible); + assert_eq!(access_stats.latest_activity(), lowres_time(atime)); +} + +#[test] +fn access_stats_2038() { + // The access stats structure uses a timestamp representation that will run out + // of bits in 2038. One year before that, this unit test will start failing. + + let one_year_from_now = SystemTime::now().duration_since(UNIX_EPOCH).unwrap() + + Duration::from_secs(3600 * 24 * 365); + + assert!(one_year_from_now.as_secs() < (2 << 31)); +} diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs index a89b66e4a1..bd765560e4 100644 --- a/pageserver/src/tenant/storage_layer/layer_desc.rs +++ b/pageserver/src/tenant/storage_layer/layer_desc.rs @@ -25,7 +25,7 @@ pub struct PersistentLayerDesc { /// /// - For an open in-memory layer, the end bound is MAX_LSN /// - For a frozen in-memory layer or a delta layer, the end bound is a valid lsn after the - /// range start + /// range start /// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1 pub lsn_range: Range, /// Whether this is a delta layer, and also, is this incremental. diff --git a/pageserver/src/tenant/storage_layer/layer_name.rs b/pageserver/src/tenant/storage_layer/layer_name.rs index da26e1eeb7..f33ca076ab 100644 --- a/pageserver/src/tenant/storage_layer/layer_name.rs +++ b/pageserver/src/tenant/storage_layer/layer_name.rs @@ -248,6 +248,14 @@ impl LayerName { Image(_) => "image", } } + + /// Gets the key range encoded in the layer name. + pub fn key_range(&self) -> &Range { + match &self { + LayerName::Image(layer) => &layer.key_range, + LayerName::Delta(layer) => &layer.key_range, + } + } } impl fmt::Display for LayerName { diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index 36386c87c9..eb4a1f28a1 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -96,15 +96,22 @@ impl<'a> std::cmp::PartialOrd for IteratorWrapper<'a> { impl<'a> std::cmp::Ord for IteratorWrapper<'a> { fn cmp(&self, other: &Self) -> std::cmp::Ordering { use std::cmp::Ordering; - let a = self.peek_next_key_lsn(); - let b = other.peek_next_key_lsn(); + let a = self.peek_next_key_lsn_value(); + let b = other.peek_next_key_lsn_value(); match (a, b) { - (Some((k1, l1)), Some((k2, l2))) => { - let loaded_1 = if self.is_loaded() { 1 } else { 0 }; - let loaded_2 = if other.is_loaded() { 1 } else { 0 }; + (Some((k1, l1, v1)), Some((k2, l2, v2))) => { + fn map_value_to_num(val: &Option<&Value>) -> usize { + match val { + None => 0, + Some(Value::Image(_)) => 1, + Some(Value::WalRecord(_)) => 2, + } + } + let order_1 = map_value_to_num(&v1); + let order_2 = map_value_to_num(&v2); // When key_lsn are the same, the unloaded iter will always appear before the loaded one. // And note that we do a reverse at the end of the comparison, so it works with the max heap. - (k1, l1, loaded_1).cmp(&(k2, l2, loaded_2)) + (k1, l1, order_1).cmp(&(k2, l2, order_2)) } (Some(_), None) => Ordering::Less, (None, Some(_)) => Ordering::Greater, @@ -137,13 +144,16 @@ impl<'a> IteratorWrapper<'a> { } } - fn peek_next_key_lsn(&self) -> Option<(&Key, Lsn)> { + fn peek_next_key_lsn_value(&self) -> Option<(&Key, Lsn, Option<&Value>)> { match self { - Self::Loaded { iter } => iter.peek().as_ref().map(|(key, lsn, _)| (key, *lsn)), + Self::Loaded { iter } => iter + .peek() + .as_ref() + .map(|(key, lsn, val)| (key, *lsn, Some(val))), Self::NotLoaded { first_key_lower_bound: (key, lsn), .. - } => Some((key, *lsn)), + } => Some((key, *lsn, None)), } } @@ -191,6 +201,13 @@ impl<'a> IteratorWrapper<'a> { } } +/// A merge iterator over delta/image layer iterators. When duplicated records are +/// found, the iterator will not perform any deduplication, and the caller should handle +/// these situation. By saying duplicated records, there are many possibilities: +/// * Two same delta at the same LSN. +/// * Two same image at the same LSN. +/// * Delta/image at the same LSN where the image has already applied the delta. +/// The iterator will always put the image before the delta. pub struct MergeIterator<'a> { heap: BinaryHeap>, } @@ -245,8 +262,9 @@ mod tests { use crate::{ tenant::{ harness::{TenantHarness, TIMELINE_ID}, - storage_layer::delta_layer::test::{produce_delta_layer, sort_delta}, + storage_layer::delta_layer::test::{produce_delta_layer, sort_delta, sort_delta_value}, }, + walrecord::NeonWalRecord, DEFAULT_PG_VERSION, }; @@ -275,7 +293,9 @@ mod tests { use crate::repository::Value; use bytes::Bytes; - let harness = TenantHarness::create("merge_iterator_delta_merge").unwrap(); + let harness = TenantHarness::create("merge_iterator_merge_in_between") + .await + .unwrap(); let (tenant, ctx) = harness.load().await; let tline = tenant @@ -338,7 +358,9 @@ mod tests { use crate::repository::Value; use bytes::Bytes; - let harness = TenantHarness::create("merge_iterator_delta_merge").unwrap(); + let harness = TenantHarness::create("merge_iterator_delta_merge") + .await + .unwrap(); let (tenant, ctx) = harness.load().await; let tline = tenant @@ -407,6 +429,133 @@ mod tests { // TODO: test layers are loaded only when needed, reducing num of active iterators in k-merge } - // TODO: image layer merge, delta+image mixed merge - // TODO: is it possible to have duplicated delta at same LSN now? we might need to test that + #[tokio::test] + async fn delta_image_mixed_merge() { + use crate::repository::Value; + use bytes::Bytes; + + let harness = TenantHarness::create("merge_iterator_delta_image_mixed_merge") + .await + .unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + // In this test case, we want to test if the iterator still works correctly with multiple copies + // of a delta+image at the same LSN, for example, the following sequence a@10=+a, a@10=+a, a@10=ab, a@10=ab. + // Duplicated deltas/images are possible for old tenants before the full L0 compaction file name fix. + // An incomplete compaction could produce multiple exactly-the-same delta layers. Force image generation + // could produce overlapping images. Apart from duplicated deltas/images, in the current storage implementation + // one key-lsn could have a delta in the delta layer and one image in the image layer. The iterator should + // correctly process these situations and return everything as-is, and the upper layer of the system + // will handle duplicated LSNs. + let test_deltas1 = vec![ + ( + get_key(0), + Lsn(0x10), + Value::WalRecord(NeonWalRecord::wal_init()), + ), + ( + get_key(0), + Lsn(0x18), + Value::WalRecord(NeonWalRecord::wal_append("a")), + ), + ( + get_key(5), + Lsn(0x10), + Value::WalRecord(NeonWalRecord::wal_init()), + ), + ( + get_key(5), + Lsn(0x18), + Value::WalRecord(NeonWalRecord::wal_append("b")), + ), + ]; + let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx) + .await + .unwrap(); + let mut test_deltas2 = test_deltas1.clone(); + test_deltas2.push(( + get_key(10), + Lsn(0x20), + Value::Image(Bytes::copy_from_slice(b"test")), + )); + let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx) + .await + .unwrap(); + let test_deltas3 = vec![ + ( + get_key(0), + Lsn(0x10), + Value::Image(Bytes::copy_from_slice(b"")), + ), + ( + get_key(5), + Lsn(0x18), + Value::Image(Bytes::copy_from_slice(b"b")), + ), + ( + get_key(15), + Lsn(0x20), + Value::Image(Bytes::copy_from_slice(b"test")), + ), + ]; + let resident_layer_3 = produce_delta_layer(&tenant, &tline, test_deltas3.clone(), &ctx) + .await + .unwrap(); + let mut test_deltas4 = test_deltas3.clone(); + test_deltas4.push(( + get_key(20), + Lsn(0x20), + Value::Image(Bytes::copy_from_slice(b"test")), + )); + let resident_layer_4 = produce_delta_layer(&tenant, &tline, test_deltas4.clone(), &ctx) + .await + .unwrap(); + let mut expect = Vec::new(); + expect.extend(test_deltas1); + expect.extend(test_deltas2); + expect.extend(test_deltas3); + expect.extend(test_deltas4); + expect.sort_by(sort_delta_value); + + // Test with different layer order for MergeIterator::create to ensure the order + // is stable. + + let mut merge_iter = MergeIterator::create( + &[ + resident_layer_4.get_as_delta(&ctx).await.unwrap(), + resident_layer_1.get_as_delta(&ctx).await.unwrap(), + resident_layer_3.get_as_delta(&ctx).await.unwrap(), + resident_layer_2.get_as_delta(&ctx).await.unwrap(), + ], + &[], + &ctx, + ); + assert_merge_iter_equal(&mut merge_iter, &expect).await; + + let mut merge_iter = MergeIterator::create( + &[ + resident_layer_1.get_as_delta(&ctx).await.unwrap(), + resident_layer_4.get_as_delta(&ctx).await.unwrap(), + resident_layer_3.get_as_delta(&ctx).await.unwrap(), + resident_layer_2.get_as_delta(&ctx).await.unwrap(), + ], + &[], + &ctx, + ); + assert_merge_iter_equal(&mut merge_iter, &expect).await; + + is_send(merge_iter); + } + + fn is_send(_: impl Send) {} } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index d679b78f32..7f59e54eb7 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -101,7 +101,6 @@ pub fn start_background_loops( Some(tenant_shard_id), None, &format!("compactor for tenant {tenant_shard_id}"), - false, { let tenant = Arc::clone(tenant); let background_jobs_can_start = background_jobs_can_start.cloned(); @@ -125,7 +124,6 @@ pub fn start_background_loops( Some(tenant_shard_id), None, &format!("garbage collector for tenant {tenant_shard_id}"), - false, { let tenant = Arc::clone(tenant); let background_jobs_can_start = background_jobs_can_start.cloned(); @@ -149,7 +147,6 @@ pub fn start_background_loops( Some(tenant_shard_id), None, &format!("ingest housekeeping for tenant {tenant_shard_id}"), - false, { let tenant = Arc::clone(tenant); let background_jobs_can_start = background_jobs_can_start.cloned(); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 762e903bf8..178b707aa7 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1,5 +1,5 @@ pub(crate) mod analysis; -mod compaction; +pub(crate) mod compaction; pub mod delete; pub(crate) mod detach_ancestor; mod eviction_task; @@ -69,6 +69,7 @@ use std::{ use crate::{ aux_file::AuxFileSizeEstimator, tenant::{ + config::defaults::DEFAULT_PITR_INTERVAL, layer_map::{LayerMap, SearchResult}, metadata::TimelineMetadata, storage_layer::PersistentLayerDesc, @@ -197,7 +198,7 @@ impl PartialOrd for Hole { /// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things. /// Can be removed after all refactors are done. -fn drop_rlock(rlock: tokio::sync::OwnedRwLockReadGuard) { +fn drop_rlock(rlock: tokio::sync::RwLockReadGuard) { drop(rlock) } @@ -270,7 +271,7 @@ pub struct Timeline { /// /// In the future, we'll be able to split up the tuple of LayerMap and `LayerFileManager`, /// so that e.g. on-demand-download/eviction, and layer spreading, can operate just on `LayerFileManager`. - pub(crate) layers: Arc>, + pub(crate) layers: tokio::sync::RwLock, last_freeze_at: AtomicLsn, // Atomic would be more appropriate here. @@ -459,7 +460,7 @@ pub(crate) struct GcInfo { /// Currently, this includes all points where child branches have /// been forked off from. In the future, could also include /// explicit user-defined snapshot points. - pub(crate) retain_lsns: Vec, + pub(crate) retain_lsns: Vec<(Lsn, TimelineId)>, /// The cutoff coordinates, which are combined by selecting the minimum. pub(crate) cutoffs: GcCutoffs, @@ -475,39 +476,43 @@ impl GcInfo { pub(crate) fn min_cutoff(&self) -> Lsn { self.cutoffs.select_min() } + + pub(super) fn insert_child(&mut self, child_id: TimelineId, child_lsn: Lsn) { + self.retain_lsns.push((child_lsn, child_id)); + self.retain_lsns.sort_by_key(|i| i.0); + } + + pub(super) fn remove_child(&mut self, child_id: TimelineId) { + self.retain_lsns.retain(|i| i.1 != child_id); + } } -/// The `GcInfo` component describing which Lsns need to be retained. -#[derive(Debug)] +/// The `GcInfo` component describing which Lsns need to be retained. Functionally, this +/// is a single number (the oldest LSN which we must retain), but it internally distinguishes +/// between time-based and space-based retention for observability and consumption metrics purposes. +#[derive(Debug, Clone)] pub(crate) struct GcCutoffs { - /// Keep everything newer than this point. - /// - /// This is calculated by subtracting 'gc_horizon' setting from - /// last-record LSN - /// - /// FIXME: is this inclusive or exclusive? - pub(crate) horizon: Lsn, + /// Calculated from the [`TenantConf::gc_horizon`], this LSN indicates how much + /// history we must keep to retain a specified number of bytes of WAL. + pub(crate) space: Lsn, - /// In addition to 'retain_lsns' and 'horizon_cutoff', keep everything newer than this - /// point. - /// - /// This is calculated by finding a number such that a record is needed for PITR - /// if only if its LSN is larger than 'pitr_cutoff'. - pub(crate) pitr: Lsn, + /// Calculated from [`TenantConf::pitr_interval`], this LSN indicates how much + /// history we must keep to enable reading back at least the PITR interval duration. + pub(crate) time: Lsn, } impl Default for GcCutoffs { fn default() -> Self { Self { - horizon: Lsn::INVALID, - pitr: Lsn::INVALID, + space: Lsn::INVALID, + time: Lsn::INVALID, } } } impl GcCutoffs { fn select_min(&self) -> Lsn { - std::cmp::min(self.horizon, self.pitr) + std::cmp::min(self.space, self.time) } } @@ -866,7 +871,7 @@ impl Timeline { let gc_info = self.gc_info.read().unwrap(); let history = self .get_last_record_lsn() - .checked_sub(gc_info.cutoffs.pitr) + .checked_sub(gc_info.cutoffs.time) .unwrap_or(Lsn(0)) .0; (history, gc_info.within_ancestor_pitr) @@ -1565,7 +1570,7 @@ impl Timeline { ) -> anyhow::Result<()> { ensure!( lsn >= **latest_gc_cutoff_lsn, - "LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)", + "LSN {} is earlier than latest GC cutoff {} (we might've already garbage collected needed data)", lsn, **latest_gc_cutoff_lsn, ); @@ -2311,6 +2316,11 @@ impl Timeline { ) }; + if let Some(ancestor) = &ancestor { + let mut ancestor_gc_info = ancestor.gc_info.write().unwrap(); + ancestor_gc_info.insert_child(timeline_id, metadata.ancestor_lsn()); + } + Arc::new_cyclic(|myself| { let metrics = TimelineMetrics::new( &tenant_shard_id, @@ -2481,7 +2491,6 @@ impl Timeline { Some(self.tenant_shard_id), Some(self.timeline_id), "layer flush task", - false, async move { let _guard = guard; let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error); @@ -2826,7 +2835,6 @@ impl Timeline { Some(self.tenant_shard_id), Some(self.timeline_id), "initial size calculation", - false, // NB: don't log errors here, task_mgr will do that. async move { let cancel = task_mgr::shutdown_token(); @@ -2995,7 +3003,6 @@ impl Timeline { Some(self.tenant_shard_id), Some(self.timeline_id), "ondemand logical size calculation", - false, async move { let res = self_clone .logical_size_calculation_task(lsn, cause, &ctx) @@ -3162,7 +3169,7 @@ impl Timeline { let guard = self.layers.read().await; let resident = guard.likely_resident_layers().map(|layer| { - let last_activity_ts = layer.access_stats().latest_activity_or_now(); + let last_activity_ts = layer.access_stats().latest_activity(); HeatMapLayer::new( layer.layer_desc().layer_name(), @@ -3408,6 +3415,8 @@ impl Timeline { } } + #[allow(unknown_lints)] // doc_lazy_continuation is still a new lint + #[allow(clippy::doc_lazy_continuation)] /// Get the data needed to reconstruct all keys in the provided keyspace /// /// The algorithm is as follows: @@ -4474,10 +4483,10 @@ impl Timeline { /// are required. Since checking if new image layers are required is expensive in /// terms of CPU, we only do it in the following cases: /// 1. If the timeline has ingested sufficient WAL to justify the cost - /// 2. If enough time has passed since the last check - /// 2.1. For large tenants, we wish to perform the check more often since they - /// suffer from the lack of image layers - /// 2.2. For small tenants (that can mostly fit in RAM), we use a much longer interval + /// 2. If enough time has passed since the last check: + /// 1. For large tenants, we wish to perform the check more often since they + /// suffer from the lack of image layers + /// 2. For small tenants (that can mostly fit in RAM), we use a much longer interval fn should_check_if_image_layers_required(self: &Arc, lsn: Lsn) -> bool { const LARGE_TENANT_THRESHOLD: u64 = 2 * 1024 * 1024 * 1024; @@ -4719,7 +4728,7 @@ impl Timeline { /// Requires a timeline that: /// - has an ancestor to detach from /// - the ancestor does not have an ancestor -- follows from the original RFC limitations, not - /// a technical requirement + /// a technical requirement /// /// After the operation has been started, it cannot be canceled. Upon restart it needs to be /// polled again until completion. @@ -4731,13 +4740,7 @@ impl Timeline { tenant: &crate::tenant::Tenant, options: detach_ancestor::Options, ctx: &RequestContext, - ) -> Result< - ( - completion::Completion, - detach_ancestor::PreparedTimelineDetach, - ), - detach_ancestor::Error, - > { + ) -> Result { detach_ancestor::prepare(self, tenant, options, ctx).await } @@ -4764,6 +4767,18 @@ impl Timeline { } } +impl Drop for Timeline { + fn drop(&mut self) { + if let Some(ancestor) = &self.ancestor_timeline { + // This lock should never be poisoned, but in case it is we do a .map() instead of + // an unwrap(), to avoid panicking in a destructor and thereby aborting the process. + if let Ok(mut gc_info) = ancestor.gc_info.write() { + gc_info.remove_child(self.timeline_id) + } + } + } +} + /// Top-level failure to compact. #[derive(Debug, thiserror::Error)] pub(crate) enum CompactionError { @@ -4876,7 +4891,7 @@ impl Timeline { // for compact_level0_phase1 creating an L0, which does not happen in practice // because we have not implemented L0 => L0 compaction. duplicated_layers.insert(l.layer_desc().key()); - } else if LayerMap::is_l0(l.layer_desc()) { + } else if LayerMap::is_l0(&l.layer_desc().key_range) { bail!("compaction generates a L0 layer file as output, which will cause infinite compaction."); } else { insert_layers.push(l.clone()); @@ -4944,24 +4959,21 @@ impl Timeline { } /// Find the Lsns above which layer files need to be retained on - /// garbage collection. This is separate from actually performing the GC, - /// and is updated more frequently, so that compaction can remove obsolete - /// page versions more aggressively. + /// garbage collection. /// - /// TODO: that's wishful thinking, compaction doesn't actually do that - /// currently. + /// We calculate two cutoffs, one based on time and one based on WAL size. `pitr` + /// controls the time cutoff (or ZERO to disable time-based retention), and `space_cutoff` controls + /// the space-based retention. /// - /// The 'cutoff_horizon' point is used to retain recent versions that might still be - /// needed by read-only nodes. (As of this writing, the caller just passes - /// the latest LSN subtracted by a constant, and doesn't do anything smart - /// to figure out what read-only nodes might actually need.) - /// - /// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine - /// whether a record is needed for PITR. + /// This function doesn't simply to calculate time & space based retention: it treats time-based + /// retention as authoritative if enabled, and falls back to space-based retention if calculating + /// the LSN for a time point isn't possible. Therefore the GcCutoffs::horizon in the response might + /// be different to the `space_cutoff` input. Callers should treat the min() of the two cutoffs + /// in the response as the GC cutoff point for the timeline. #[instrument(skip_all, fields(timeline_id=%self.timeline_id))] pub(super) async fn find_gc_cutoffs( &self, - cutoff_horizon: Lsn, + space_cutoff: Lsn, pitr: Duration, cancel: &CancellationToken, ctx: &RequestContext, @@ -4974,58 +4986,87 @@ impl Timeline { pausable_failpoint!("Timeline::find_gc_cutoffs-pausable"); - // First, calculate pitr_cutoff_timestamp and then convert it to LSN. - // - // Some unit tests depend on garbage-collection working even when - // CLOG data is missing, so that find_lsn_for_timestamp() doesn't - // work, so avoid calling it altogether if time-based retention is not - // configured. It would be pointless anyway. - let pitr_cutoff = if pitr != Duration::ZERO { - let now = SystemTime::now(); - if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) { - let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp); - - match self - .find_lsn_for_timestamp(pitr_timestamp, cancel, ctx) - .await? - { - LsnForTimestamp::Present(lsn) => lsn, - LsnForTimestamp::Future(lsn) => { - // The timestamp is in the future. That sounds impossible, - // but what it really means is that there hasn't been - // any commits since the cutoff timestamp. - // - // In this case we should use the LSN of the most recent commit, - // which is implicitly the last LSN in the log. - debug!("future({})", lsn); - self.get_last_record_lsn() - } - LsnForTimestamp::Past(lsn) => { - debug!("past({})", lsn); - // conservative, safe default is to remove nothing, when we - // have no commit timestamp data available - *self.get_latest_gc_cutoff_lsn() - } - LsnForTimestamp::NoData(lsn) => { - debug!("nodata({})", lsn); - // conservative, safe default is to remove nothing, when we - // have no commit timestamp data available - *self.get_latest_gc_cutoff_lsn() - } - } - } else { - // If we don't have enough data to convert to LSN, - // play safe and don't remove any layers. - *self.get_latest_gc_cutoff_lsn() + if cfg!(test) { + // Unit tests which specify zero PITR interval expect to avoid doing any I/O for timestamp lookup + if pitr == Duration::ZERO { + return Ok(GcCutoffs { + time: self.get_last_record_lsn(), + space: space_cutoff, + }); + } + } + + // Calculate a time-based limit on how much to retain: + // - if PITR interval is set, then this is our cutoff. + // - if PITR interval is not set, then we do a lookup + // based on DEFAULT_PITR_INTERVAL, so that size-based retention does not result in keeping history around permanently on idle databases. + let time_cutoff = { + let now = SystemTime::now(); + let time_range = if pitr == Duration::ZERO { + humantime::parse_duration(DEFAULT_PITR_INTERVAL).expect("constant is invalid") + } else { + pitr + }; + + // If PITR is so large or `now` is so small that this underflows, we will retain no history (highly unexpected case) + let time_cutoff = now.checked_sub(time_range).unwrap_or(now); + let timestamp = to_pg_timestamp(time_cutoff); + + match self.find_lsn_for_timestamp(timestamp, cancel, ctx).await? { + LsnForTimestamp::Present(lsn) => Some(lsn), + LsnForTimestamp::Future(lsn) => { + // The timestamp is in the future. That sounds impossible, + // but what it really means is that there hasn't been + // any commits since the cutoff timestamp. + // + // In this case we should use the LSN of the most recent commit, + // which is implicitly the last LSN in the log. + debug!("future({})", lsn); + Some(self.get_last_record_lsn()) + } + LsnForTimestamp::Past(lsn) => { + debug!("past({})", lsn); + None + } + LsnForTimestamp::NoData(lsn) => { + debug!("nodata({})", lsn); + None + } } - } else { - // No time-based retention was configured. Interpret this as "keep no history". - self.get_last_record_lsn() }; - Ok(GcCutoffs { - horizon: cutoff_horizon, - pitr: pitr_cutoff, + Ok(match (pitr, time_cutoff) { + (Duration::ZERO, Some(time_cutoff)) => { + // PITR is not set. Retain the size-based limit, or the default time retention, + // whichever requires less data. + GcCutoffs { + time: self.get_last_record_lsn(), + space: std::cmp::max(time_cutoff, space_cutoff), + } + } + (Duration::ZERO, None) => { + // PITR is not set, and time lookup failed + GcCutoffs { + time: self.get_last_record_lsn(), + space: space_cutoff, + } + } + (_, None) => { + // PITR interval is set & we didn't look up a timestamp successfully. Conservatively assume PITR + // cannot advance beyond what was already GC'd, and respect space-based retention + GcCutoffs { + time: *self.get_latest_gc_cutoff_lsn(), + space: space_cutoff, + } + } + (_, Some(time_cutoff)) => { + // PITR interval is set and we looked up timestamp successfully. Ignore + // size based retention and make time cutoff authoritative + GcCutoffs { + time: time_cutoff, + space: time_cutoff, + } + } }) } @@ -5050,12 +5091,16 @@ impl Timeline { return Err(GcError::TimelineCancelled); } - let (horizon_cutoff, pitr_cutoff, retain_lsns, max_lsn_with_valid_lease) = { + let (space_cutoff, time_cutoff, retain_lsns, max_lsn_with_valid_lease) = { let gc_info = self.gc_info.read().unwrap(); - let horizon_cutoff = min(gc_info.cutoffs.horizon, self.get_disk_consistent_lsn()); - let pitr_cutoff = gc_info.cutoffs.pitr; - let retain_lsns = gc_info.retain_lsns.clone(); + let space_cutoff = min(gc_info.cutoffs.space, self.get_disk_consistent_lsn()); + let time_cutoff = gc_info.cutoffs.time; + let retain_lsns = gc_info + .retain_lsns + .iter() + .map(|(lsn, _child_id)| *lsn) + .collect(); // Gets the maximum LSN that holds the valid lease. // @@ -5064,14 +5109,14 @@ impl Timeline { let max_lsn_with_valid_lease = gc_info.leases.last_key_value().map(|(lsn, _)| *lsn); ( - horizon_cutoff, - pitr_cutoff, + space_cutoff, + time_cutoff, retain_lsns, max_lsn_with_valid_lease, ) }; - let mut new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff); + let mut new_gc_cutoff = Lsn::min(space_cutoff, time_cutoff); let standby_horizon = self.standby_horizon.load(); // Hold GC for the standby, but as a safety guard do it only within some // reasonable lag. @@ -5100,8 +5145,8 @@ impl Timeline { let res = self .gc_timeline( - horizon_cutoff, - pitr_cutoff, + space_cutoff, + time_cutoff, retain_lsns, max_lsn_with_valid_lease, new_gc_cutoff, @@ -5119,8 +5164,8 @@ impl Timeline { async fn gc_timeline( &self, - horizon_cutoff: Lsn, - pitr_cutoff: Lsn, + space_cutoff: Lsn, + time_cutoff: Lsn, retain_lsns: Vec, max_lsn_with_valid_lease: Option, new_gc_cutoff: Lsn, @@ -5181,22 +5226,22 @@ impl Timeline { result.layers_total += 1; // 1. Is it newer than GC horizon cutoff point? - if l.get_lsn_range().end > horizon_cutoff { + if l.get_lsn_range().end > space_cutoff { debug!( - "keeping {} because it's newer than horizon_cutoff {}", + "keeping {} because it's newer than space_cutoff {}", l.layer_name(), - horizon_cutoff, + space_cutoff, ); result.layers_needed_by_cutoff += 1; continue 'outer; } // 2. It is newer than PiTR cutoff point? - if l.get_lsn_range().end > pitr_cutoff { + if l.get_lsn_range().end > time_cutoff { debug!( - "keeping {} because it's newer than pitr_cutoff {}", + "keeping {} because it's newer than time_cutoff {}", l.layer_name(), - pitr_cutoff, + time_cutoff, ); result.layers_needed_by_pitr += 1; continue 'outer; @@ -5417,7 +5462,6 @@ impl Timeline { Some(self.tenant_shard_id), Some(self.timeline_id), "download all remote layers task", - false, async move { self_clone.download_all_remote_layers(request).await; let mut status_guard = self_clone.download_all_remote_layers_task_info.write().unwrap(); @@ -5568,7 +5612,7 @@ impl Timeline { let file_size = layer.layer_desc().file_size; max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size))); - let last_activity_ts = layer.access_stats().latest_activity_or_now(); + let last_activity_ts = layer.access_stats().latest_activity(); EvictionCandidate { layer: layer.into(), @@ -6028,8 +6072,9 @@ mod tests { #[tokio::test] async fn two_layer_eviction_attempts_at_the_same_time() { - let harness = - TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap(); + let harness = TenantHarness::create("two_layer_eviction_attempts_at_the_same_time") + .await + .unwrap(); let (tenant, ctx) = harness.load().await; let timeline = tenant diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index efaa6144af..d0a74e3924 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -26,15 +26,17 @@ use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; -use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc}; -use crate::tenant::timeline::{drop_rlock, Hole, ImageLayerCreationOutcome}; -use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter}; +use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD}; +use crate::tenant::storage_layer::merge_iterator::MergeIterator; +use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc, ValueReconstructState}; +use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter}; +use crate::tenant::timeline::{Hole, ImageLayerCreationOutcome}; use crate::tenant::timeline::{Layer, ResidentLayer}; use crate::tenant::DeltaLayer; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; use crate::keyspace::KeySpace; -use crate::repository::Key; +use crate::repository::{Key, Value}; use utils::lsn::Lsn; @@ -43,6 +45,60 @@ use pageserver_compaction::interface::*; use super::CompactionError; +/// Maximum number of deltas before generating an image layer in bottom-most compaction. +const COMPACTION_DELTA_THRESHOLD: usize = 5; + +/// The result of bottom-most compaction for a single key at each LSN. +#[derive(Debug)] +#[cfg_attr(test, derive(PartialEq))] +pub struct KeyLogAtLsn(pub Vec<(Lsn, Value)>); + +/// The result of bottom-most compaction. +#[derive(Debug)] +#[cfg_attr(test, derive(PartialEq))] +pub(crate) struct KeyHistoryRetention { + /// Stores logs to reconstruct the value at the given LSN, that is to say, logs <= LSN or image == LSN. + pub(crate) below_horizon: Vec<(Lsn, KeyLogAtLsn)>, + /// Stores logs to reconstruct the value at any LSN above the horizon, that is to say, log > LSN. + pub(crate) above_horizon: KeyLogAtLsn, +} + +impl KeyHistoryRetention { + async fn pipe_to( + self, + key: Key, + delta_writer: &mut Vec<(Key, Lsn, Value)>, + image_writer: &mut ImageLayerWriter, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + let mut first_batch = true; + for (_, KeyLogAtLsn(logs)) in self.below_horizon { + if first_batch { + if logs.len() == 1 && logs[0].1.is_image() { + let Value::Image(img) = &logs[0].1 else { + unreachable!() + }; + image_writer.put_image(key, img.clone(), ctx).await?; + } else { + for (lsn, val) in logs { + delta_writer.push((key, lsn, val)); + } + } + first_batch = false; + } else { + for (lsn, val) in logs { + delta_writer.push((key, lsn, val)); + } + } + } + let KeyLogAtLsn(above_horizon_logs) = self.above_horizon; + for (lsn, val) in above_horizon_logs { + delta_writer.push((key, lsn, val)); + } + Ok(()) + } +} + impl Timeline { /// TODO: cancellation pub(crate) async fn compact_legacy( @@ -195,7 +251,7 @@ impl Timeline { tracing::info!( "latest_gc_cutoff: {}, pitr cutoff {}", *latest_gc_cutoff, - self.gc_info.read().unwrap().cutoffs.pitr + self.gc_info.read().unwrap().cutoffs.time ); let layers = self.layers.read().await; @@ -379,7 +435,7 @@ impl Timeline { }; let begin = tokio::time::Instant::now(); - let phase1_layers_locked = Arc::clone(&self.layers).read_owned().await; + let phase1_layers_locked = self.layers.read().await; let now = tokio::time::Instant::now(); stats.read_lock_acquisition_micros = DurationRecorder::Recorded(RecordedDuration(now - begin), now); @@ -399,9 +455,9 @@ impl Timeline { } /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment. - async fn compact_level0_phase1( - self: &Arc, - guard: tokio::sync::OwnedRwLockReadGuard, + async fn compact_level0_phase1<'a>( + self: &'a Arc, + guard: tokio::sync::RwLockReadGuard<'a, LayerManager>, mut stats: CompactLevel0Phase1StatsBuilder, target_file_size: u64, ctx: &RequestContext, @@ -415,6 +471,7 @@ impl Timeline { .map(|x| guard.get_from_desc(&x)) .collect_vec(); stats.level0_deltas_count = Some(level0_deltas.len()); + // Only compact if enough layers have accumulated. let threshold = self.get_compaction_threshold(); if level0_deltas.is_empty() || level0_deltas.len() < threshold { @@ -445,6 +502,22 @@ impl Timeline { let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end; let mut deltas_to_compact = Vec::with_capacity(level0_deltas.len()); + // Accumulate the size of layers in `deltas_to_compact` + let mut deltas_to_compact_bytes = 0; + + // Under normal circumstances, we will accumulate up to compaction_interval L0s of size + // checkpoint_distance each. To avoid edge cases using extra system resources, bound our + // work in this function to only operate on this much delta data at once. + // + // Take the max of the configured value & the default, so that tests that configure tiny values + // can still use a sensible amount of memory, but if a deployed system configures bigger values we + // still let them compact a full stack of L0s in one go. + let delta_size_limit = std::cmp::max( + self.get_compaction_threshold(), + DEFAULT_COMPACTION_THRESHOLD, + ) as u64 + * std::cmp::max(self.get_checkpoint_distance(), DEFAULT_CHECKPOINT_DISTANCE); + deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?); for l in level0_deltas_iter { let lsn_range = &l.layer_desc().lsn_range; @@ -453,7 +526,20 @@ impl Timeline { break; } deltas_to_compact.push(l.download_and_keep_resident().await?); + deltas_to_compact_bytes += l.metadata().file_size; prev_lsn_end = lsn_range.end; + + if deltas_to_compact_bytes >= delta_size_limit { + info!( + l0_deltas_selected = deltas_to_compact.len(), + l0_deltas_total = level0_deltas.len(), + "L0 compaction picker hit max delta layer size limit: {}", + delta_size_limit + ); + + // Proceed with compaction, but only a subset of L0s + break; + } } let lsn_range = Range { start: deltas_to_compact @@ -957,6 +1043,188 @@ impl Timeline { Ok(()) } + /// Take a list of images and deltas, produce images and deltas according to GC horizon and retain_lsns. + /// + /// It takes a key, the values of the key within the compaction process, a GC horizon, and all retain_lsns below the horizon. + /// For now, it requires the `accumulated_values` contains the full history of the key (i.e., the key with the lowest LSN is + /// an image or a WAL not requiring a base image). This restriction will be removed once we implement gc-compaction on branch. + /// + /// The function returns the deltas and the base image that need to be placed at each of the retain LSN. For example, we have: + /// + /// A@0x10, +B@0x20, +C@0x30, +D@0x40, +E@0x50, +F@0x60 + /// horizon = 0x50, retain_lsn = 0x20, 0x40, delta_threshold=3 + /// + /// The function will produce: + /// + /// ```plain + /// 0x20(retain_lsn) -> img=AB@0x20 always produce a single image below the lowest retain LSN + /// 0x40(retain_lsn) -> deltas=[+C@0x30, +D@0x40] two deltas since the last base image, keeping the deltas + /// 0x50(horizon) -> deltas=[ABCDE@0x50] three deltas since the last base image, generate an image but put it in the delta + /// above_horizon -> deltas=[+F@0x60] full history above the horizon + /// ``` + /// + /// Note that `accumulated_values` must be sorted by LSN and should belong to a single key. + pub(crate) async fn generate_key_retention( + self: &Arc, + key: Key, + history: &[(Key, Lsn, Value)], + horizon: Lsn, + retain_lsn_below_horizon: &[Lsn], + delta_threshold_cnt: usize, + ) -> anyhow::Result { + // Pre-checks for the invariants + if cfg!(debug_assertions) { + for (log_key, _, _) in history { + assert_eq!(log_key, &key, "mismatched key"); + } + for i in 1..history.len() { + assert!(history[i - 1].1 <= history[i].1, "unordered LSN"); + if history[i - 1].1 == history[i].1 { + assert!( + matches!(history[i - 1].2, Value::Image(_)), + "unordered delta/image, or duplicated delta" + ); + } + } + if let Value::WalRecord(rec) = &history[0].2 { + assert!(rec.will_init(), "no base image"); + } + for lsn in retain_lsn_below_horizon { + assert!(lsn < &horizon, "retain lsn must be below horizon") + } + for i in 1..retain_lsn_below_horizon.len() { + assert!( + retain_lsn_below_horizon[i - 1] <= retain_lsn_below_horizon[i], + "unordered LSN" + ); + } + } + // Step 1: split history into len(retain_lsn_below_horizon) + 2 buckets, where the last bucket is for all deltas above the horizon, + // and the second-to-last bucket is for the horizon. Each bucket contains lsn_last_bucket < deltas <= lsn_this_bucket. + let (mut split_history, lsn_split_points) = { + let mut split_history = Vec::new(); + split_history.resize_with(retain_lsn_below_horizon.len() + 2, Vec::new); + let mut lsn_split_points = Vec::with_capacity(retain_lsn_below_horizon.len() + 1); + for lsn in retain_lsn_below_horizon { + lsn_split_points.push(*lsn); + } + lsn_split_points.push(horizon); + let mut current_idx = 0; + for item @ (_, lsn, _) in history { + while current_idx < lsn_split_points.len() && *lsn > lsn_split_points[current_idx] { + current_idx += 1; + } + split_history[current_idx].push(item); + } + (split_history, lsn_split_points) + }; + // Step 2: filter out duplicated records due to the k-merge of image/delta layers + for split_for_lsn in &mut split_history { + let mut prev_lsn = None; + let mut new_split_for_lsn = Vec::with_capacity(split_for_lsn.len()); + for record @ (_, lsn, _) in std::mem::take(split_for_lsn) { + if let Some(prev_lsn) = &prev_lsn { + if *prev_lsn == lsn { + // The case that we have an LSN with both data from the delta layer and the image layer. As + // `ValueWrapper` ensures that an image is ordered before a delta at the same LSN, we simply + // drop this delta and keep the image. + // + // For example, we have delta layer key1@0x10, key1@0x20, and image layer key1@0x10, we will + // keep the image for key1@0x10 and the delta for key1@0x20. key1@0x10 delta will be simply + // dropped. + continue; + } + } + prev_lsn = Some(lsn); + new_split_for_lsn.push(record); + } + *split_for_lsn = new_split_for_lsn; + } + // Step 3: generate images when necessary + let mut retention = Vec::with_capacity(split_history.len()); + let mut records_since_last_image = 0; + let batch_cnt = split_history.len(); + assert!( + batch_cnt >= 2, + "should have at least below + above horizon batches" + ); + let mut replay_history: Vec<(Key, Lsn, Value)> = Vec::new(); + for (i, split_for_lsn) in split_history.into_iter().enumerate() { + records_since_last_image += split_for_lsn.len(); + let generate_image = if i == 0 { + // We always generate images for the first batch (below horizon / lowest retain_lsn) + true + } else if i == batch_cnt - 1 { + // Do not generate images for the last batch (above horizon) + false + } else if records_since_last_image >= delta_threshold_cnt { + // Generate images when there are too many records + true + } else { + false + }; + replay_history.extend(split_for_lsn.iter().map(|x| (*x).clone())); + if let Some((_, _, val)) = replay_history.first() { + assert!(val.will_init(), "invalid history, no base image"); + } + // Only retain the items after the last image record + for idx in (0..replay_history.len()).rev() { + if replay_history[idx].2.will_init() { + replay_history = replay_history[idx..].to_vec(); + break; + } + } + if generate_image && records_since_last_image > 0 { + records_since_last_image = 0; + let history = std::mem::take(&mut replay_history); + let mut img = None; + let mut records = Vec::with_capacity(history.len()); + if let (_, lsn, Value::Image(val)) = history.first().as_ref().unwrap() { + img = Some((*lsn, val.clone())); + for (_, lsn, val) in history.into_iter().skip(1) { + let Value::WalRecord(rec) = val else { + panic!("invalid record") + }; + records.push((lsn, rec)); + } + } else { + for (_, lsn, val) in history.into_iter() { + let Value::WalRecord(rec) = val else { + panic!("invalid record") + }; + records.push((lsn, rec)); + } + } + records.reverse(); + let state = ValueReconstructState { img, records }; + let request_lsn = lsn_split_points[i]; // last batch does not generate image so i is always in range + let img = self.reconstruct_value(key, request_lsn, state).await?; + replay_history.push((key, request_lsn, Value::Image(img.clone()))); + retention.push(vec![(request_lsn, Value::Image(img))]); + } else { + retention.push( + split_for_lsn + .iter() + .map(|(_, lsn, value)| (*lsn, value.clone())) + .collect(), + ); + } + } + let mut result = Vec::with_capacity(retention.len()); + assert_eq!(retention.len(), lsn_split_points.len() + 1); + for (idx, logs) in retention.into_iter().enumerate() { + if idx == lsn_split_points.len() { + return Ok(KeyHistoryRetention { + below_horizon: result, + above_horizon: KeyLogAtLsn(logs), + }); + } else { + result.push((lsn_split_points[idx], KeyLogAtLsn(logs))); + } + } + unreachable!() + } + /// An experimental compaction building block that combines compaction with garbage collection. /// /// The current implementation picks all delta + image layers that are below or intersecting with @@ -968,7 +1236,6 @@ impl Timeline { _cancel: &CancellationToken, ctx: &RequestContext, ) -> Result<(), CompactionError> { - use crate::tenant::storage_layer::ValueReconstructState; use std::collections::BTreeSet; info!("running enhanced gc bottom-most compaction"); @@ -981,37 +1248,60 @@ impl Timeline { // The layer selection has the following properties: // 1. If a layer is in the selection, all layers below it are in the selection. // 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection. - let (layer_selection, gc_cutoff) = { + let (layer_selection, gc_cutoff, retain_lsns_below_horizon) = { let guard = self.layers.read().await; let layers = guard.layer_map(); let gc_info = self.gc_info.read().unwrap(); - if !gc_info.retain_lsns.is_empty() || !gc_info.leases.is_empty() { - return Err(CompactionError::Other(anyhow!( - "enhanced legacy compaction currently does not support retain_lsns (branches)" - ))); + let mut retain_lsns_below_horizon = Vec::new(); + let gc_cutoff = gc_info.cutoffs.select_min(); + for (lsn, _timeline_id) in &gc_info.retain_lsns { + if lsn < &gc_cutoff { + retain_lsns_below_horizon.push(*lsn); + } + } + for lsn in gc_info.leases.keys() { + if lsn < &gc_cutoff { + retain_lsns_below_horizon.push(*lsn); + } } - let gc_cutoff = Lsn::min(gc_info.cutoffs.horizon, gc_info.cutoffs.pitr); let mut selected_layers = Vec::new(); - // TODO: consider retain_lsns drop(gc_info); for desc in layers.iter_historic_layers() { if desc.get_lsn_range().start <= gc_cutoff { selected_layers.push(guard.get_from_desc(&desc)); } } - (selected_layers, gc_cutoff) + retain_lsns_below_horizon.sort(); + (selected_layers, gc_cutoff, retain_lsns_below_horizon) }; + let lowest_retain_lsn = retain_lsns_below_horizon + .first() + .copied() + .unwrap_or(gc_cutoff); + if cfg!(debug_assertions) { + assert_eq!( + lowest_retain_lsn, + retain_lsns_below_horizon + .iter() + .min() + .copied() + .unwrap_or(gc_cutoff) + ); + } info!( - "picked {} layers for compaction with gc_cutoff={}", + "picked {} layers for compaction with gc_cutoff={} lowest_retain_lsn={}", layer_selection.len(), - gc_cutoff + gc_cutoff, + lowest_retain_lsn ); // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs. // Also, collect the layer information to decide when to split the new delta layers. - let mut all_key_values = Vec::new(); + let mut downloaded_layers = Vec::new(); let mut delta_split_points = BTreeSet::new(); for layer in &layer_selection { - all_key_values.extend(layer.load_key_values(ctx).await?); + let resident_layer = layer.download_and_keep_resident().await?; + downloaded_layers.push(resident_layer); + let desc = layer.layer_desc(); if desc.is_delta() { // TODO: is it correct to only record split points for deltas intersecting with the GC horizon? (exclude those below/above the horizon) @@ -1021,86 +1311,22 @@ impl Timeline { delta_split_points.insert(key_range.end); } } - // Key small to large, LSN low to high, if the same LSN has both image and delta due to the merge of delta layers and - // image layers, make image appear before than delta. - struct ValueWrapper<'a>(&'a crate::repository::Value); - impl Ord for ValueWrapper<'_> { - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - use crate::repository::Value; - use std::cmp::Ordering; - match (self.0, other.0) { - (Value::Image(_), Value::WalRecord(_)) => Ordering::Less, - (Value::WalRecord(_), Value::Image(_)) => Ordering::Greater, - _ => Ordering::Equal, - } + let mut delta_layers = Vec::new(); + let mut image_layers = Vec::new(); + for resident_layer in &downloaded_layers { + if resident_layer.layer_desc().is_delta() { + let layer = resident_layer.get_as_delta(ctx).await?; + delta_layers.push(layer); + } else { + let layer = resident_layer.get_as_image(ctx).await?; + image_layers.push(layer); } } - impl PartialOrd for ValueWrapper<'_> { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } - } - impl PartialEq for ValueWrapper<'_> { - fn eq(&self, other: &Self) -> bool { - self.cmp(other) == std::cmp::Ordering::Equal - } - } - impl Eq for ValueWrapper<'_> {} - all_key_values.sort_by(|(k1, l1, v1), (k2, l2, v2)| { - (k1, l1, ValueWrapper(v1)).cmp(&(k2, l2, ValueWrapper(v2))) - }); + let mut merge_iter = MergeIterator::create(&delta_layers, &image_layers, ctx); // Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas. // Data of the same key. let mut accumulated_values = Vec::new(); - let mut last_key = all_key_values.first().unwrap().0; // TODO: assert all_key_values not empty - - /// Take a list of images and deltas, produce an image at the GC horizon, and a list of deltas above the GC horizon. - async fn flush_accumulated_states( - tline: &Arc, - key: Key, - accumulated_values: &[&(Key, Lsn, crate::repository::Value)], - horizon: Lsn, - ) -> anyhow::Result<(Vec<(Key, Lsn, crate::repository::Value)>, bytes::Bytes)> { - let mut base_image = None; - let mut keys_above_horizon = Vec::new(); - let mut delta_above_base_image = Vec::new(); - // We have a list of deltas/images. We want to create image layers while collect garbages. - for (key, lsn, val) in accumulated_values.iter().rev() { - if *lsn > horizon { - if let Some((_, prev_lsn, _)) = keys_above_horizon.last_mut() { - if *prev_lsn == *lsn { - // The case that we have an LSN with both data from the delta layer and the image layer. As - // `ValueWrapper` ensures that an image is ordered before a delta at the same LSN, we simply - // drop this delta and keep the image. - // - // For example, we have delta layer key1@0x10, key1@0x20, and image layer key1@0x10, we will - // keep the image for key1@0x10 and the delta for key1@0x20. key1@0x10 delta will be simply - // dropped. - continue; - } - } - keys_above_horizon.push((*key, *lsn, val.clone())); - } else if *lsn <= horizon { - match val { - crate::repository::Value::Image(image) => { - base_image = Some((*lsn, image.clone())); - break; - } - crate::repository::Value::WalRecord(wal) => { - delta_above_base_image.push((*lsn, wal.clone())); - } - } - } - } - // do not reverse delta_above_base_image, reconstruct state expects reversely-ordered records - keys_above_horizon.reverse(); - let state = ValueReconstructState { - img: base_image, - records: delta_above_base_image, - }; - let img = tline.reconstruct_value(key, horizon, state).await?; - Ok((keys_above_horizon, img)) - } + let mut last_key: Option = None; async fn flush_deltas( deltas: &mut Vec<(Key, Lsn, crate::repository::Value)>, @@ -1108,7 +1334,7 @@ impl Timeline { delta_split_points: &[Key], current_delta_split_point: &mut usize, tline: &Arc, - gc_cutoff: Lsn, + lowest_retain_lsn: Lsn, ctx: &RequestContext, ) -> anyhow::Result> { // Check if we need to split the delta layer. We split at the original delta layer boundary to avoid @@ -1143,7 +1369,7 @@ impl Timeline { tline.timeline_id, tline.tenant_shard_id, deltas.first().unwrap().0, - gc_cutoff..end_lsn, + lowest_retain_lsn..end_lsn, ctx, ) .await?; @@ -1159,8 +1385,8 @@ impl Timeline { self.conf, self.timeline_id, self.tenant_shard_id, - &(all_key_values.first().unwrap().0..all_key_values.last().unwrap().0.next()), - gc_cutoff, + &(Key::MIN..Key::MAX), // covers the full key range + lowest_retain_lsn, ctx, ) .await?; @@ -1169,40 +1395,60 @@ impl Timeline { let delta_split_points = delta_split_points.into_iter().collect_vec(); let mut current_delta_split_point = 0; let mut delta_layers = Vec::new(); - for item @ (key, _, _) in &all_key_values { - if &last_key == key { - accumulated_values.push(item); + while let Some((key, lsn, val)) = merge_iter.next().await? { + if last_key.is_none() || last_key.as_ref() == Some(&key) { + if last_key.is_none() { + last_key = Some(key); + } + accumulated_values.push((key, lsn, val)); } else { - let (deltas, image) = - flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff) - .await?; + let last_key = last_key.as_mut().unwrap(); + let retention = self + .generate_key_retention( + *last_key, + &accumulated_values, + gc_cutoff, + &retain_lsns_below_horizon, + COMPACTION_DELTA_THRESHOLD, + ) + .await?; // Put the image into the image layer. Currently we have a single big layer for the compaction. - image_layer_writer.put_image(last_key, image, ctx).await?; - delta_values.extend(deltas); + retention + .pipe_to(*last_key, &mut delta_values, &mut image_layer_writer, ctx) + .await?; delta_layers.extend( flush_deltas( &mut delta_values, - last_key, + *last_key, &delta_split_points, &mut current_delta_split_point, self, - gc_cutoff, + lowest_retain_lsn, ctx, ) .await?, ); accumulated_values.clear(); - accumulated_values.push(item); - last_key = *key; + *last_key = key; + accumulated_values.push((key, lsn, val)); } } + let last_key = last_key.expect("no keys produced during compaction"); // TODO: move this part to the loop body - let (deltas, image) = - flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff).await?; + let retention = self + .generate_key_retention( + last_key, + &accumulated_values, + gc_cutoff, + &retain_lsns_below_horizon, + COMPACTION_DELTA_THRESHOLD, + ) + .await?; // Put the image into the image layer. Currently we have a single big layer for the compaction. - image_layer_writer.put_image(last_key, image, ctx).await?; - delta_values.extend(deltas); + retention + .pipe_to(last_key, &mut delta_values, &mut image_layer_writer, ctx) + .await?; delta_layers.extend( flush_deltas( &mut delta_values, @@ -1210,7 +1456,7 @@ impl Timeline { &delta_split_points, &mut current_delta_split_point, self, - gc_cutoff, + lowest_retain_lsn, ctx, ) .await?, diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index b0088f4ea2..ab6a5f20ba 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -148,14 +148,14 @@ async fn cleanup_remaining_timeline_fs_traces( /// For more context see comments in [`DeleteTimelineFlow::prepare`] async fn remove_timeline_from_tenant( tenant: &Tenant, - timeline_id: TimelineId, + timeline: &Timeline, _: &DeletionGuard, // using it as a witness ) -> anyhow::Result<()> { // Remove the timeline from the map. let mut timelines = tenant.timelines.lock().unwrap(); let children_exist = timelines .iter() - .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id)); + .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline.timeline_id)); // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`. // We already deleted the layer files, so it's probably best to panic. // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart) @@ -164,7 +164,7 @@ async fn remove_timeline_from_tenant( } timelines - .remove(&timeline_id) + .remove(&timeline.timeline_id) .expect("timeline that we were deleting was concurrently removed from 'timelines' map"); drop(timelines); @@ -182,13 +182,15 @@ async fn remove_timeline_from_tenant( /// 5. Delete index part /// 6. Delete meta, timeline directory /// 7. Delete mark file +/// /// It is resumable from any step in case a crash/restart occurs. /// There are three entrypoints to the process: /// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler. /// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present -/// and we possibly neeed to continue deletion of remote files. +/// and we possibly neeed to continue deletion of remote files. /// 3. [`DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`] is used when we deleted remote -/// index but still have local metadata, timeline directory and delete mark. +/// index but still have local metadata, timeline directory and delete mark. +/// /// Note the only other place that messes around timeline delete mark is the logic that scans directory with timelines during tenant load. #[derive(Default)] pub enum DeleteTimelineFlow { @@ -389,7 +391,6 @@ impl DeleteTimelineFlow { Some(tenant_shard_id), Some(timeline_id), "timeline_delete", - false, async move { if let Err(err) = Self::background(guard, conf, &tenant, &timeline).await { error!("Error: {err:#}"); @@ -413,7 +414,7 @@ impl DeleteTimelineFlow { pausable_failpoint!("in_progress_delete"); - remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?; + remove_timeline_from_tenant(tenant, timeline, &guard).await?; *guard = Self::Finished; diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 4fc89330ba..49ce3db3e6 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -10,6 +10,7 @@ use crate::{ }, virtual_file::{MaybeFatalIo, VirtualFile}, }; +use pageserver_api::models::detach_ancestor::AncestorDetached; use tokio_util::sync::CancellationToken; use tracing::Instrument; use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn}; @@ -39,6 +40,9 @@ pub(crate) enum Error { #[error("unexpected error")] Unexpected(#[source] anyhow::Error), + + #[error("failpoint: {}", .0)] + Failpoint(&'static str), } impl From for ApiError { @@ -57,11 +61,41 @@ impl From for ApiError { | e @ Error::CopyDeltaPrefix(_) | e @ Error::UploadRewritten(_) | e @ Error::CopyFailed(_) - | e @ Error::Unexpected(_) => ApiError::InternalServerError(e.into()), + | e @ Error::Unexpected(_) + | e @ Error::Failpoint(_) => ApiError::InternalServerError(e.into()), } } } +impl From for Error { + fn from(_: crate::tenant::upload_queue::NotInitialized) -> Self { + // treat all as shutting down signals, even though that is not entirely correct + // (uninitialized state) + Error::ShuttingDown + } +} + +impl From for Error { + fn from(value: FlushLayerError) -> Self { + match value { + FlushLayerError::Cancelled => Error::ShuttingDown, + FlushLayerError::NotRunning(_) => { + // FIXME(#6424): technically statically unreachable right now, given how we never + // drop the sender + Error::ShuttingDown + } + FlushLayerError::CreateImageLayersError(_) | FlushLayerError::Other(_) => { + Error::FlushAncestor(value) + } + } + } +} + +pub(crate) enum Progress { + Prepared(completion::Completion, PreparedTimelineDetach), + Done(AncestorDetached), +} + pub(crate) struct PreparedTimelineDetach { layers: Vec, } @@ -88,7 +122,7 @@ pub(super) async fn prepare( tenant: &Tenant, options: Options, ctx: &RequestContext, -) -> Result<(completion::Completion, PreparedTimelineDetach), Error> { +) -> Result { use Error::*; let Some((ancestor, ancestor_lsn)) = detached @@ -96,15 +130,67 @@ pub(super) async fn prepare( .as_ref() .map(|tl| (tl.clone(), detached.ancestor_lsn)) else { - // TODO: check if we have already been detached; for this we need to read the stored data - // on remote client, for that we need a follow-up which makes uploads cheaper and maintains - // a projection of the commited data. + { + let accessor = detached.remote_client.initialized_upload_queue()?; + + // we are safe to inspect the latest uploaded, because we can only witness this after + // restart is complete and ancestor is no more. + let latest = accessor.latest_uploaded_index_part(); + if !latest.lineage.is_detached_from_original_ancestor() { + return Err(NoAncestor); + } + } + + // detached has previously been detached; let's inspect each of the current timelines and + // report back the timelines which have been reparented by our detach + let mut all_direct_children = tenant + .timelines + .lock() + .unwrap() + .values() + .filter(|tl| matches!(tl.ancestor_timeline.as_ref(), Some(ancestor) if Arc::ptr_eq(ancestor, detached))) + .map(|tl| (tl.ancestor_lsn, tl.clone())) + .collect::>(); + + let mut any_shutdown = false; + + all_direct_children.retain( + |(_, tl)| match tl.remote_client.initialized_upload_queue() { + Ok(accessor) => accessor + .latest_uploaded_index_part() + .lineage + .is_reparented(), + Err(_shutdownalike) => { + // not 100% a shutdown, but let's bail early not to give inconsistent results in + // sharded enviroment. + any_shutdown = true; + true + } + }, + ); + + if any_shutdown { + // it could be one or many being deleted; have client retry + return Err(Error::ShuttingDown); + } + + let mut reparented = all_direct_children; + // why this instead of hashset? there is a reason, but I've forgotten it many times. // - // the error is wrong per openapi - return Err(NoAncestor); + // maybe if this was a hashset we would not be able to distinguish some race condition. + reparented.sort_unstable_by_key(|(lsn, tl)| (*lsn, tl.timeline_id)); + + return Ok(Progress::Done(AncestorDetached { + reparented_timelines: reparented + .into_iter() + .map(|(_, tl)| tl.timeline_id) + .collect(), + })); }; if !ancestor_lsn.is_valid() { + // rare case, probably wouldn't even load + tracing::error!("ancestor is set, but ancestor_lsn is invalid, this timeline needs fixing"); return Err(NoAncestor); } @@ -131,6 +217,15 @@ pub(super) async fn prepare( let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?; + utils::pausable_failpoint!("timeline-detach-ancestor::before_starting_after_locking_pausable"); + + fail::fail_point!( + "timeline-detach-ancestor::before_starting_after_locking", + |_| Err(Error::Failpoint( + "timeline-detach-ancestor::before_starting_after_locking" + )) + ); + if ancestor_lsn >= ancestor.get_disk_consistent_lsn() { let span = tracing::info_span!("freeze_and_flush", ancestor_timeline_id=%ancestor.timeline_id); @@ -151,7 +246,7 @@ pub(super) async fn prepare( } }; - res.map_err(FlushAncestor)?; + res?; // we do not need to wait for uploads to complete but we do need `struct Layer`, // copying delta prefix is unsupported currently for `InMemoryLayer`. @@ -159,7 +254,7 @@ pub(super) async fn prepare( elapsed_ms = started_at.elapsed().as_millis(), "froze and flushed the ancestor" ); - Ok(()) + Ok::<_, Error>(()) } .instrument(span) .await?; @@ -283,7 +378,7 @@ pub(super) async fn prepare( let prepared = PreparedTimelineDetach { layers: new_layers }; - Ok((guard, prepared)) + Ok(Progress::Prepared(guard, prepared)) } fn partition_work( @@ -350,7 +445,11 @@ async fn copy_lsn_prefix( target_timeline: &Arc, ctx: &RequestContext, ) -> Result, Error> { - use Error::{CopyDeltaPrefix, RewrittenDeltaDownloadFailed}; + use Error::{CopyDeltaPrefix, RewrittenDeltaDownloadFailed, ShuttingDown}; + + if target_timeline.cancel.is_cancelled() { + return Err(ShuttingDown); + } tracing::debug!(%layer, %end_lsn, "copying lsn prefix"); @@ -529,7 +628,7 @@ pub(super) async fn complete( match res { Ok(Some(timeline)) => { tracing::info!(reparented=%timeline.timeline_id, "reparenting done"); - reparented.push(timeline.timeline_id); + reparented.push((timeline.ancestor_lsn, timeline.timeline_id)); } Ok(None) => { // lets just ignore this for now. one or all reparented timelines could had @@ -551,5 +650,12 @@ pub(super) async fn complete( tracing::info!("failed to reparent some candidates"); } + reparented.sort_unstable(); + + let reparented = reparented + .into_iter() + .map(|(_, timeline_id)| timeline_id) + .collect(); + Ok(reparented) } diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 8a8c38d0ce..fec66aabc1 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -65,7 +65,6 @@ impl Timeline { "layer eviction for {}/{}", self.tenant_shard_id, self.timeline_id ), - false, async move { tokio::select! { _ = self_clone.cancel.cancelled() => { return Ok(()); } @@ -226,7 +225,7 @@ impl Timeline { continue; } - let last_activity_ts = layer.access_stats().latest_activity_or_now(); + let last_activity_ts = layer.access_stats().latest_activity(); let no_activity_for = match now.duration_since(last_activity_ts) { Ok(d) => d, diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index a43ff873ac..1e4edd34ad 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -255,6 +255,14 @@ impl LayerManager { new_layer.layer_desc().lsn_range ); + // Transfer visibilty hint from old to new layer, since the new layer covers the same key space. This is not guaranteed to + // be accurate (as the new layer may cover a different subset of the key range), but is a sensible default, and prevents + // always marking rewritten layers as visible. + new_layer + .as_ref() + .access_stats() + .set_visibility(old_layer.access_stats().visibility()); + // Safety: we may never rewrite the same file in-place. Callers are responsible // for ensuring that they only rewrite layers after something changes the path, // such as an increment in the generation number. diff --git a/pageserver/src/tenant/timeline/logical_size.rs b/pageserver/src/tenant/timeline/logical_size.rs index 8f9ca0e29f..b0d6c4a27a 100644 --- a/pageserver/src/tenant/timeline/logical_size.rs +++ b/pageserver/src/tenant/timeline/logical_size.rs @@ -11,11 +11,11 @@ use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering}; /// Calculation consists of two stages: /// /// 1. Initial size calculation. That might take a long time, because it requires -/// reading all layers containing relation sizes at `initial_part_end`. +/// reading all layers containing relation sizes at `initial_part_end`. /// /// 2. Collecting an incremental part and adding that to the initial size. -/// Increments are appended on walreceiver writing new timeline data, -/// which result in increase or decrease of the logical size. +/// Increments are appended on walreceiver writing new timeline data, +/// which result in increase or decrease of the logical size. pub(super) struct LogicalSize { /// Size, potentially slow to compute. Calculating this might require reading multiple /// layers, and even ancestor's layers. @@ -45,17 +45,17 @@ pub(super) struct LogicalSize { /// Size shouldn't ever be negative, but this is signed for two reasons: /// /// 1. If we initialized the "baseline" size lazily, while we already - /// process incoming WAL, the incoming WAL records could decrement the - /// variable and temporarily make it negative. (This is just future-proofing; - /// the initialization is currently not done lazily.) + /// process incoming WAL, the incoming WAL records could decrement the + /// variable and temporarily make it negative. (This is just future-proofing; + /// the initialization is currently not done lazily.) /// /// 2. If there is a bug and we e.g. forget to increment it in some cases - /// when size grows, but remember to decrement it when it shrinks again, the - /// variable could go negative. In that case, it seems better to at least - /// try to keep tracking it, rather than clamp or overflow it. Note that - /// get_current_logical_size() will clamp the returned value to zero if it's - /// negative, and log an error. Could set it permanently to zero or some - /// special value to indicate "broken" instead, but this will do for now. + /// when size grows, but remember to decrement it when it shrinks again, the + /// variable could go negative. In that case, it seems better to at least + /// try to keep tracking it, rather than clamp or overflow it. Note that + /// get_current_logical_size() will clamp the returned value to zero if it's + /// negative, and log an error. Could set it permanently to zero or some + /// special value to indicate "broken" instead, but this will do for now. /// /// Note that we also expose a copy of this value as a prometheus metric, /// see `current_logical_size_gauge`. Use the `update_current_logical_size` diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index a085154a5a..4a3a5c621b 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -2,13 +2,13 @@ //! To do so, a current implementation needs to do the following: //! //! * acknowledge the timelines that it needs to stream WAL into. -//! Pageserver is able to dynamically (un)load tenants on attach and detach, -//! hence WAL receiver needs to react on such events. +//! Pageserver is able to dynamically (un)load tenants on attach and detach, +//! hence WAL receiver needs to react on such events. //! //! * get a broker subscription, stream data from it to determine that a timeline needs WAL streaming. -//! For that, it watches specific keys in storage_broker and pulls the relevant data periodically. -//! The data is produced by safekeepers, that push it periodically and pull it to synchronize between each other. -//! Without this data, no WAL streaming is possible currently. +//! For that, it watches specific keys in storage_broker and pulls the relevant data periodically. +//! The data is produced by safekeepers, that push it periodically and pull it to synchronize between each other. +//! Without this data, no WAL streaming is possible currently. //! //! Only one active WAL streaming connection is allowed at a time. //! The connection is supposed to be updated periodically, based on safekeeper timeline data. diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 1d2ffec08f..de50f217d8 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -1118,7 +1118,7 @@ mod tests { #[tokio::test] async fn no_connection_no_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("no_connection_no_candidate")?; + let harness = TenantHarness::create("no_connection_no_candidate").await?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -1151,7 +1151,7 @@ mod tests { #[tokio::test] async fn connection_no_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("connection_no_candidate")?; + let harness = TenantHarness::create("connection_no_candidate").await?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -1216,7 +1216,7 @@ mod tests { #[tokio::test] async fn no_connection_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("no_connection_candidate")?; + let harness = TenantHarness::create("no_connection_candidate").await?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -1279,7 +1279,7 @@ mod tests { #[tokio::test] async fn candidate_with_many_connection_failures() -> anyhow::Result<()> { - let harness = TenantHarness::create("candidate_with_many_connection_failures")?; + let harness = TenantHarness::create("candidate_with_many_connection_failures").await?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -1319,7 +1319,7 @@ mod tests { #[tokio::test] async fn lsn_wal_over_threshold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate")?; + let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate").await?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1385,7 +1385,8 @@ mod tests { #[tokio::test] async fn timeout_connection_threshold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("timeout_connection_threshold_current_candidate")?; + let harness = + TenantHarness::create("timeout_connection_threshold_current_candidate").await?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1448,7 +1449,7 @@ mod tests { #[tokio::test] async fn timeout_wal_over_threshold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("timeout_wal_over_threshold_current_candidate")?; + let harness = TenantHarness::create("timeout_wal_over_threshold_current_candidate").await?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let new_lsn = Lsn(100_100).align(); @@ -1550,7 +1551,7 @@ mod tests { // and pageserver should prefer to connect to it. let test_az = Some("test_az".to_owned()); - let harness = TenantHarness::create("switch_to_same_availability_zone")?; + let harness = TenantHarness::create("switch_to_same_availability_zone").await?; let mut state = dummy_state(&harness).await; state.conf.availability_zone.clone_from(&test_az); let current_lsn = Lsn(100_000).align(); diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index 50c977a950..f7440ecdae 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -228,18 +228,20 @@ impl UploadQueue { Ok(self.initialized_mut().expect("we just set it")) } - pub(crate) fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> { + pub(crate) fn initialized_mut( + &mut self, + ) -> Result<&mut UploadQueueInitialized, NotInitialized> { use UploadQueue::*; match self { - Uninitialized => Err(NotInitialized::Uninitialized.into()), + Uninitialized => Err(NotInitialized::Uninitialized), Initialized(x) => { if x.shutting_down { - Err(NotInitialized::ShuttingDown.into()) + Err(NotInitialized::ShuttingDown) } else { Ok(x) } } - Stopped(_) => Err(NotInitialized::Stopped.into()), + Stopped(_) => Err(NotInitialized::Stopped), } } diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 7ad8446e04..54a3ad789b 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -20,11 +20,13 @@ use std::num::NonZeroUsize; use bytes::BytesMut; use pageserver_api::key::Key; +use tokio::io::AsyncWriteExt; use tokio_epoll_uring::BoundedBuf; use utils::lsn::Lsn; use utils::vec_map::VecMap; use crate::context::RequestContext; +use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK}; use crate::virtual_file::VirtualFile; #[derive(Copy, Clone, Debug, PartialEq, Eq)] @@ -68,7 +70,7 @@ impl VectoredRead { } } -#[derive(Eq, PartialEq)] +#[derive(Eq, PartialEq, Debug)] pub(crate) enum VectoredReadExtended { Yes, No, @@ -91,7 +93,7 @@ impl VectoredReadBuilder { start_offset: u64, end_offset: u64, meta: BlobMeta, - max_read_size: Option, + max_read_size: usize, ) -> Self { let mut blobs_at = VecMap::default(); blobs_at @@ -102,10 +104,9 @@ impl VectoredReadBuilder { start: start_offset, end: end_offset, blobs_at, - max_read_size, + max_read_size: Some(max_read_size), } } - /// Attempt to extend the current read with a new blob if the start /// offset matches with the current end of the vectored read /// and the resuting size is below the max read size @@ -164,7 +165,7 @@ pub struct VectoredReadPlanner { // Arguments for previous blob passed into [`VectoredReadPlanner::handle`] prev: Option<(Key, Lsn, u64, BlobFlag)>, - max_read_size: Option, + max_read_size: usize, } impl VectoredReadPlanner { @@ -172,20 +173,7 @@ impl VectoredReadPlanner { Self { blobs: BTreeMap::new(), prev: None, - max_read_size: Some(max_read_size), - } - } - - /// This function should *only* be used if the caller has a way to control the limit. e.g., in [`StreamingVectoredReadPlanner`], - /// it uses the vectored read planner to avoid duplicated logic on handling blob start/end, while expecting the vectored - /// read planner to give a single read to a continuous range of bytes in the image layer. Therefore, it does not need the - /// code path to split reads into chunks of `max_read_size`, and controls the read size itself. - #[cfg(test)] - pub(crate) fn new_caller_controlled_max_limit() -> Self { - Self { - blobs: BTreeMap::new(), - prev: None, - max_read_size: None, + max_read_size, } } @@ -203,9 +191,9 @@ impl VectoredReadPlanner { /// /// The `flag` argument has two interesting values: /// * [`BlobFlag::ReplaceAll`]: The blob for this key should replace all existing blobs. - /// This is used for WAL records that `will_init`. + /// This is used for WAL records that `will_init`. /// * [`BlobFlag::Ignore`]: This blob should not be included in the read. This happens - /// if the blob is cached. + /// if the blob is cached. pub fn handle(&mut self, key: Key, lsn: Lsn, offset: u64, flag: BlobFlag) { // Implementation note: internally lag behind by one blob such that // we have a start and end offset when initialising [`VectoredRead`] @@ -315,7 +303,7 @@ impl<'a> VectoredBlobReader<'a> { read.size(), buf.capacity() ); - let buf = self + let mut buf = self .file .read_exact_at(buf.slice(0..read.size()), read.start, ctx) .await? @@ -337,38 +325,68 @@ impl<'a> VectoredBlobReader<'a> { .chain(std::iter::once(None)), ); + // Some scratch space, put here for reusing the allocation + let mut decompressed_vec = Vec::new(); + for ((offset, meta), next) in pairs { let offset_in_buf = offset - start_offset; let first_len_byte = buf[offset_in_buf as usize]; - // Each blob is prefixed by a header containing it's size. + // Each blob is prefixed by a header containing its size and compression information. // Extract the size and skip that header to find the start of the data. // The size can be 1 or 4 bytes. The most significant bit is 0 in the // 1 byte case and 1 in the 4 byte case. - let (size_length, blob_size) = if first_len_byte < 0x80 { - (1, first_len_byte as u64) + let (size_length, blob_size, compression_bits) = if first_len_byte < 0x80 { + (1, first_len_byte as u64, BYTE_UNCOMPRESSED) } else { let mut blob_size_buf = [0u8; 4]; let offset_in_buf = offset_in_buf as usize; blob_size_buf.copy_from_slice(&buf[offset_in_buf..offset_in_buf + 4]); - blob_size_buf[0] &= 0x7f; - (4, u32::from_be_bytes(blob_size_buf) as u64) + blob_size_buf[0] &= !LEN_COMPRESSION_BIT_MASK; + + let compression_bits = first_len_byte & LEN_COMPRESSION_BIT_MASK; + ( + 4, + u32::from_be_bytes(blob_size_buf) as u64, + compression_bits, + ) }; - let start = offset_in_buf + size_length; - let end = match next { + let start_raw = offset_in_buf + size_length; + let end_raw = match next { Some((next_blob_start_offset, _)) => next_blob_start_offset - start_offset, - None => start + blob_size, + None => start_raw + blob_size, }; - - assert_eq!(end - start, blob_size); + assert_eq!(end_raw - start_raw, blob_size); + let (start, end); + if compression_bits == BYTE_UNCOMPRESSED { + start = start_raw as usize; + end = end_raw as usize; + } else if compression_bits == BYTE_ZSTD { + let mut decoder = + async_compression::tokio::write::ZstdDecoder::new(&mut decompressed_vec); + decoder + .write_all(&buf[start_raw as usize..end_raw as usize]) + .await?; + decoder.flush().await?; + start = buf.len(); + buf.extend_from_slice(&decompressed_vec); + end = buf.len(); + decompressed_vec.clear(); + } else { + let error = std::io::Error::new( + std::io::ErrorKind::InvalidData, + format!("invalid compression byte {compression_bits:x}"), + ); + return Err(error); + } metas.push(VectoredBlob { - start: start as usize, - end: end as usize, + start, + end, meta: *meta, - }) + }); } Ok(VectoredBlobsBuf { buf, blobs: metas }) @@ -376,88 +394,120 @@ impl<'a> VectoredBlobReader<'a> { } /// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. It provides a streaming API for -/// getting read blobs. It returns a batch when `handle` gets called and when the current key would exceed the read_size and -/// max_cnt constraints. Underlying it uses [`VectoredReadPlanner`]. -#[cfg(test)] +/// getting read blobs. It returns a batch when `handle` gets called and when the current key would just exceed the read_size and +/// max_cnt constraints. pub struct StreamingVectoredReadPlanner { - planner: VectoredReadPlanner, - /// Max read size per batch + read_builder: Option, + // Arguments for previous blob passed into [`StreamingVectoredReadPlanner::handle`] + prev: Option<(Key, Lsn, u64)>, + /// Max read size per batch. This is not a strict limit. If there are [0, 100) and [100, 200), while the `max_read_size` is 150, + /// we will produce a single batch instead of split them. max_read_size: u64, /// Max item count per batch max_cnt: usize, - /// The first offset of this batch - this_batch_first_offset: Option, /// Size of the current batch cnt: usize, } -#[cfg(test)] impl StreamingVectoredReadPlanner { pub fn new(max_read_size: u64, max_cnt: usize) -> Self { assert!(max_cnt > 0); assert!(max_read_size > 0); Self { - // We want to have exactly one read syscall (plus several others for index lookup) for each `next_batch` call. - // Therefore, we enforce `self.max_read_size` by ourselves instead of using the VectoredReadPlanner's capability, - // to avoid splitting into two I/Os. - planner: VectoredReadPlanner::new_caller_controlled_max_limit(), + read_builder: None, + prev: None, max_cnt, max_read_size, - this_batch_first_offset: None, cnt: 0, } } - fn emit(&mut self, this_batch_first_offset: u64) -> VectoredRead { - let planner = std::mem::replace( - &mut self.planner, - VectoredReadPlanner::new_caller_controlled_max_limit(), - ); - self.this_batch_first_offset = Some(this_batch_first_offset); - self.cnt = 1; - let mut batch = planner.finish(); - assert_eq!(batch.len(), 1, "should have exactly one read batch"); - batch.pop().unwrap() + pub fn handle(&mut self, key: Key, lsn: Lsn, offset: u64) -> Option { + // Implementation note: internally lag behind by one blob such that + // we have a start and end offset when initialising [`VectoredRead`] + let (prev_key, prev_lsn, prev_offset) = match self.prev { + None => { + self.prev = Some((key, lsn, offset)); + return None; + } + Some(prev) => prev, + }; + + let res = self.add_blob(prev_key, prev_lsn, prev_offset, offset, false); + + self.prev = Some((key, lsn, offset)); + + res } - pub fn handle( + pub fn handle_range_end(&mut self, offset: u64) -> Option { + let res = if let Some((prev_key, prev_lsn, prev_offset)) = self.prev { + self.add_blob(prev_key, prev_lsn, prev_offset, offset, true) + } else { + None + }; + + self.prev = None; + + res + } + + fn add_blob( &mut self, key: Key, lsn: Lsn, - offset: u64, - flag: BlobFlag, + start_offset: u64, + end_offset: u64, + is_last_blob_in_read: bool, ) -> Option { - if let Some(begin_offset) = self.this_batch_first_offset { - // Each batch will have at least one item b/c `self.this_batch_first_offset` is set - // after one item gets processed - if offset - begin_offset > self.max_read_size { - self.planner.handle_range_end(offset); // End the current batch with the offset - let batch = self.emit(offset); // Produce a batch - self.planner.handle(key, lsn, offset, flag); // Add this key to the next batch - return Some(batch); + match &mut self.read_builder { + Some(read_builder) => { + let extended = read_builder.extend(start_offset, end_offset, BlobMeta { key, lsn }); + assert_eq!(extended, VectoredReadExtended::Yes); } - } else { - self.this_batch_first_offset = Some(offset) - } - if self.cnt >= self.max_cnt { - self.planner.handle_range_end(offset); // End the current batch with the offset - let batch = self.emit(offset); // Produce a batch - self.planner.handle(key, lsn, offset, flag); // Add this key to the next batch - return Some(batch); - } - self.planner.handle(key, lsn, offset, flag); // Add this key to the current batch - self.cnt += 1; - None - } + None => { + self.read_builder = { + let mut blobs_at = VecMap::default(); + blobs_at + .append(start_offset, BlobMeta { key, lsn }) + .expect("First insertion always succeeds"); - pub fn handle_range_end(&mut self, offset: u64) -> VectoredRead { - self.planner.handle_range_end(offset); - self.emit(offset) + Some(VectoredReadBuilder { + start: start_offset, + end: end_offset, + blobs_at, + max_read_size: None, + }) + }; + } + } + let read_builder = self.read_builder.as_mut().unwrap(); + self.cnt += 1; + if is_last_blob_in_read + || read_builder.size() >= self.max_read_size as usize + || self.cnt >= self.max_cnt + { + let prev_read_builder = self.read_builder.take(); + self.cnt = 0; + + // `current_read_builder` is None in the first iteration + if let Some(read_builder) = prev_read_builder { + return Some(read_builder.build()); + } + } + None } } #[cfg(test)] mod tests { + use anyhow::Error; + + use crate::context::DownloadBehavior; + use crate::page_cache::PAGE_SZ; + use crate::task_mgr::TaskKind; + + use super::super::blob_io::tests::{random_array, write_maybe_compressed}; use super::*; fn validate_read(read: &VectoredRead, offset_range: &[(Key, Lsn, u64, BlobFlag)]) { @@ -509,8 +559,11 @@ mod tests { planner.handle_range_end(652 * 1024); let reads = planner.finish(); + assert_eq!(reads.len(), 6); + // TODO: could remove zero reads to produce 5 reads here + for (idx, read) in reads.iter().enumerate() { validate_read(read, ranges[idx]); } @@ -548,4 +601,187 @@ mod tests { validate_read(read, ranges[idx]); } } + + #[test] + fn streaming_planner_max_read_size_test() { + let max_read_size = 128 * 1024; + let key = Key::MIN; + let lsn = Lsn(0); + + let blob_descriptions = vec![ + (key, lsn, 0, BlobFlag::None), + (key, lsn, 32 * 1024, BlobFlag::None), + (key, lsn, 96 * 1024, BlobFlag::None), + (key, lsn, 128 * 1024, BlobFlag::None), + (key, lsn, 198 * 1024, BlobFlag::None), + (key, lsn, 268 * 1024, BlobFlag::None), + (key, lsn, 396 * 1024, BlobFlag::None), + (key, lsn, 652 * 1024, BlobFlag::None), + ]; + + let ranges = [ + &blob_descriptions[0..3], + &blob_descriptions[3..5], + &blob_descriptions[5..6], + &blob_descriptions[6..7], + &blob_descriptions[7..], + ]; + + let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1000); + let mut reads = Vec::new(); + for (key, lsn, offset, _) in blob_descriptions.clone() { + reads.extend(planner.handle(key, lsn, offset)); + } + reads.extend(planner.handle_range_end(652 * 1024)); + + assert_eq!(reads.len(), ranges.len()); + + for (idx, read) in reads.iter().enumerate() { + validate_read(read, ranges[idx]); + } + } + + #[test] + fn streaming_planner_max_cnt_test() { + let max_read_size = 1024 * 1024; + let key = Key::MIN; + let lsn = Lsn(0); + + let blob_descriptions = vec![ + (key, lsn, 0, BlobFlag::None), + (key, lsn, 32 * 1024, BlobFlag::None), + (key, lsn, 96 * 1024, BlobFlag::None), + (key, lsn, 128 * 1024, BlobFlag::None), + (key, lsn, 198 * 1024, BlobFlag::None), + (key, lsn, 268 * 1024, BlobFlag::None), + (key, lsn, 396 * 1024, BlobFlag::None), + (key, lsn, 652 * 1024, BlobFlag::None), + ]; + + let ranges = [ + &blob_descriptions[0..2], + &blob_descriptions[2..4], + &blob_descriptions[4..6], + &blob_descriptions[6..], + ]; + + let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 2); + let mut reads = Vec::new(); + for (key, lsn, offset, _) in blob_descriptions.clone() { + reads.extend(planner.handle(key, lsn, offset)); + } + reads.extend(planner.handle_range_end(652 * 1024)); + + assert_eq!(reads.len(), ranges.len()); + + for (idx, read) in reads.iter().enumerate() { + validate_read(read, ranges[idx]); + } + } + + #[test] + fn streaming_planner_edge_test() { + let max_read_size = 1024 * 1024; + let key = Key::MIN; + let lsn = Lsn(0); + { + let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1); + let mut reads = Vec::new(); + reads.extend(planner.handle_range_end(652 * 1024)); + assert!(reads.is_empty()); + } + { + let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1); + let mut reads = Vec::new(); + reads.extend(planner.handle(key, lsn, 0)); + reads.extend(planner.handle_range_end(652 * 1024)); + assert_eq!(reads.len(), 1); + validate_read(&reads[0], &[(key, lsn, 0, BlobFlag::None)]); + } + { + let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1); + let mut reads = Vec::new(); + reads.extend(planner.handle(key, lsn, 0)); + reads.extend(planner.handle(key, lsn, 128 * 1024)); + reads.extend(planner.handle_range_end(652 * 1024)); + assert_eq!(reads.len(), 2); + validate_read(&reads[0], &[(key, lsn, 0, BlobFlag::None)]); + validate_read(&reads[1], &[(key, lsn, 128 * 1024, BlobFlag::None)]); + } + { + let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 2); + let mut reads = Vec::new(); + reads.extend(planner.handle(key, lsn, 0)); + reads.extend(planner.handle(key, lsn, 128 * 1024)); + reads.extend(planner.handle_range_end(652 * 1024)); + assert_eq!(reads.len(), 1); + validate_read( + &reads[0], + &[ + (key, lsn, 0, BlobFlag::None), + (key, lsn, 128 * 1024, BlobFlag::None), + ], + ); + } + } + + async fn round_trip_test_compressed(blobs: &[Vec], compression: bool) -> Result<(), Error> { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let (_temp_dir, pathbuf, offsets) = + write_maybe_compressed::(blobs, compression, &ctx).await?; + + let file = VirtualFile::open(&pathbuf, &ctx).await?; + let file_len = std::fs::metadata(&pathbuf)?.len(); + + // Multiply by two (compressed data might need more space), and add a few bytes for the header + let reserved_bytes = blobs.iter().map(|bl| bl.len()).max().unwrap() * 2 + 16; + let mut buf = BytesMut::with_capacity(reserved_bytes); + + let vectored_blob_reader = VectoredBlobReader::new(&file); + let meta = BlobMeta { + key: Key::MIN, + lsn: Lsn(0), + }; + + for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() { + let end = offsets.get(idx + 1).unwrap_or(&file_len); + if idx + 1 == offsets.len() { + continue; + } + let read_builder = VectoredReadBuilder::new(*offset, *end, meta, 16 * 4096); + let read = read_builder.build(); + let result = vectored_blob_reader.read_blobs(&read, buf, &ctx).await?; + assert_eq!(result.blobs.len(), 1); + let read_blob = &result.blobs[0]; + let read_buf = &result.buf[read_blob.start..read_blob.end]; + assert_eq!(blob, read_buf, "mismatch for idx={idx} at offset={offset}"); + buf = result.buf; + } + Ok(()) + } + + #[tokio::test] + async fn test_really_big_array() -> Result<(), Error> { + let blobs = &[ + b"test".to_vec(), + random_array(10 * PAGE_SZ), + b"hello".to_vec(), + random_array(66 * PAGE_SZ), + vec![0xf3; 24 * PAGE_SZ], + b"foobar".to_vec(), + ]; + round_trip_test_compressed(blobs, false).await?; + round_trip_test_compressed(blobs, true).await?; + Ok(()) + } + + #[tokio::test] + async fn test_arrays_inc() -> Result<(), Error> { + let blobs = (0..PAGE_SZ / 8) + .map(|v| random_array(v * 16)) + .collect::>(); + round_trip_test_compressed(&blobs, false).await?; + round_trip_test_compressed(&blobs, true).await?; + Ok(()) + } } diff --git a/pageserver/src/trace.rs b/pageserver/src/trace.rs deleted file mode 100644 index 18ec269198..0000000000 --- a/pageserver/src/trace.rs +++ /dev/null @@ -1,36 +0,0 @@ -use bytes::Bytes; -use camino::Utf8PathBuf; -use std::{ - fs::{create_dir_all, File}, - io::{BufWriter, Write}, -}; - -pub struct Tracer { - writer: BufWriter, -} - -impl Drop for Tracer { - fn drop(&mut self) { - self.flush() - } -} - -impl Tracer { - pub fn new(path: Utf8PathBuf) -> Self { - let parent = path.parent().expect("failed to parse parent path"); - create_dir_all(parent).expect("failed to create trace dir"); - - let file = File::create(path).expect("failed to create trace file"); - Tracer { - writer: BufWriter::new(file), - } - } - - pub fn trace(&mut self, msg: &Bytes) { - self.writer.write_all(msg).expect("failed to write trace"); - } - - pub fn flush(&mut self) { - self.writer.flush().expect("failed to flush trace file"); - } -} diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 885a9221c5..8599d95cdf 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -33,6 +33,7 @@ pub struct BufferedWriter { /// invariant: always remains Some(buf) except /// - while IO is ongoing => goes back to Some() once the IO completed successfully /// - after an IO error => stays `None` forever + /// /// In these exceptional cases, it's `None`. buf: Option, } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 07c90385e6..dff3a8f52d 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1754,7 +1754,7 @@ mod tests { #[tokio::test] async fn test_relsize() -> Result<()> { - let (tenant, ctx) = TenantHarness::create("test_relsize")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_relsize").await?.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) .await?; @@ -1975,7 +1975,10 @@ mod tests { // and then created it again within the same layer. #[tokio::test] async fn test_drop_extend() -> Result<()> { - let (tenant, ctx) = TenantHarness::create("test_drop_extend")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_drop_extend") + .await? + .load() + .await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) .await?; @@ -2046,7 +2049,10 @@ mod tests { // and then extended it again within the same layer. #[tokio::test] async fn test_truncate_extend() -> Result<()> { - let (tenant, ctx) = TenantHarness::create("test_truncate_extend")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_truncate_extend") + .await? + .load() + .await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) .await?; @@ -2188,7 +2194,7 @@ mod tests { /// split into multiple 1 GB segments in Postgres. #[tokio::test] async fn test_large_rel() -> Result<()> { - let (tenant, ctx) = TenantHarness::create("test_large_rel")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_large_rel").await?.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) .await?; @@ -2296,7 +2302,7 @@ mod tests { let startpoint = Lsn::from_hex("14AEC08").unwrap(); let _endpoint = Lsn::from_hex("1FFFF98").unwrap(); - let harness = TenantHarness::create("test_ingest_real_wal").unwrap(); + let harness = TenantHarness::create("test_ingest_real_wal").await.unwrap(); let (tenant, ctx) = harness.load().await; let remote_initdb_path = diff --git a/patches/rum.patch b/patches/rum.patch new file mode 100644 index 0000000000..3041f8df81 --- /dev/null +++ b/patches/rum.patch @@ -0,0 +1,54 @@ +commit 68f3b3b0d594f08aacc4a082ee210749ed5677eb +Author: Anastasia Lubennikova +Date: Mon Jul 15 12:31:56 2024 +0100 + + Neon: fix unlogged index build patch + +diff --git a/src/ruminsert.c b/src/ruminsert.c +index e8b209d..e89bf2a 100644 +--- a/src/ruminsert.c ++++ b/src/ruminsert.c +@@ -628,6 +628,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo) + elog(ERROR, "index \"%s\" already contains data", + RelationGetRelationName(index)); + ++#ifdef NEON_SMGR ++ smgr_start_unlogged_build(index->rd_smgr); ++#endif ++ + initRumState(&buildstate.rumstate, index); + buildstate.rumstate.isBuild = true; + buildstate.indtuples = 0; +@@ -693,6 +697,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo) + buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index); + rumUpdateStats(index, &buildstate.buildStats, buildstate.rumstate.isBuild); + ++#ifdef NEON_SMGR ++ smgr_finish_unlogged_build_phase_1(index->rd_smgr); ++#endif ++ + /* + * Write index to xlog + */ +@@ -713,6 +721,21 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo) + UnlockReleaseBuffer(buffer); + } + ++#ifdef NEON_SMGR ++ { ++#if PG_VERSION_NUM >= 160000 ++ RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator; ++#else ++ RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node; ++#endif ++ ++ SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); ++ SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM); ++ ++ smgr_end_unlogged_build(index->rd_smgr); ++ } ++#endif ++ + /* + * Return statistics + */ diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index e4968bdf89..d107cdc1c2 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -46,6 +46,21 @@ void _PG_init(void); static int logical_replication_max_snap_files = 300; +static int running_xacts_overflow_policy; + +enum RunningXactsOverflowPolicies { + OP_IGNORE, + OP_SKIP, + OP_WAIT +}; + +static const struct config_enum_entry running_xacts_overflow_policies[] = { + {"ignore", OP_IGNORE, false}, + {"skip", OP_SKIP, false}, + {"wait", OP_WAIT, false}, + {NULL, 0, false} +}; + static void InitLogicalReplicationMonitor(void) { @@ -414,6 +429,7 @@ RestoreRunningXactsFromClog(CheckPoint *checkpoint, TransactionId **xids, int *n restored_xids = (TransactionId *) palloc(max_xcnt * sizeof(TransactionId)); n_restored_xids = 0; next_prepared_idx = 0; + for (TransactionId xid = from; xid != till;) { XLogRecPtr xidlsn; @@ -424,7 +440,7 @@ RestoreRunningXactsFromClog(CheckPoint *checkpoint, TransactionId **xids, int *n /* * "Merge" the prepared transactions into the restored_xids array as * we go. The prepared transactions array is sorted. This is mostly - * a sanity check to ensure that all the prpeared transactions are + * a sanity check to ensure that all the prepared transactions are * seen as in-progress. (There is a check after the loop that we didn't * miss any.) */ @@ -522,14 +538,23 @@ RestoreRunningXactsFromClog(CheckPoint *checkpoint, TransactionId **xids, int *n elog(LOG, "too many running xacts to restore from the CLOG; oldestXid=%u oldestActiveXid=%u nextXid %u", checkpoint->oldestXid, checkpoint->oldestActiveXid, XidFromFullTransactionId(checkpoint->nextXid)); - goto fail; + + switch (running_xacts_overflow_policy) + { + case OP_WAIT: + goto fail; + case OP_IGNORE: + goto success; + case OP_SKIP: + n_restored_xids = 0; + goto success; + } } restored_xids[n_restored_xids++] = xid; skip: TransactionIdAdvance(xid); - continue; } /* sanity check */ @@ -540,11 +565,13 @@ RestoreRunningXactsFromClog(CheckPoint *checkpoint, TransactionId **xids, int *n Assert(false); goto fail; } - + success: elog(LOG, "restored %d running xacts by scanning the CLOG; oldestXid=%u oldestActiveXid=%u nextXid %u", n_restored_xids, checkpoint->oldestXid, checkpoint->oldestActiveXid, XidFromFullTransactionId(checkpoint->nextXid)); *nxids = n_restored_xids; *xids = restored_xids; + if (prepared_xids) + pfree(prepared_xids); return true; fail: @@ -570,7 +597,7 @@ _PG_init(void) pg_init_libpagestore(); pg_init_walproposer(); - WalSender_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; + WalSender_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; LogicalFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; InitLogicalReplicationMonitor(); @@ -581,6 +608,18 @@ _PG_init(void) restore_running_xacts_callback = RestoreRunningXactsFromClog; + + DefineCustomEnumVariable( + "neon.running_xacts_overflow_policy", + "Action performed on snapshot overflow when restoring runnings xacts from CLOG", + NULL, + &running_xacts_overflow_policy, + OP_IGNORE, + running_xacts_overflow_policies, + PGC_POSTMASTER, + 0, + NULL, NULL, NULL); + /* * Important: This must happen after other parts of the extension are * loaded, otherwise any settings to GUCs that were set before the diff --git a/pgxn/neon/neon.control b/pgxn/neon/neon.control index cee2f336f2..03bdb9a0b4 100644 --- a/pgxn/neon/neon.control +++ b/pgxn/neon/neon.control @@ -1,6 +1,6 @@ # neon extension comment = 'cloud storage for PostgreSQL' -default_version = '1.3' +default_version = '1.4' module_pathname = '$libdir/neon' relocatable = true trusted = true diff --git a/pgxn/neon/neon_walreader.c b/pgxn/neon/neon_walreader.c index 60eb8e1fc9..b575712dbe 100644 --- a/pgxn/neon/neon_walreader.c +++ b/pgxn/neon/neon_walreader.c @@ -109,11 +109,12 @@ NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_ { NeonWALReader *reader; + /* + * Note: we allocate in TopMemoryContext, reusing the reader for all process + * reads. + */ reader = (NeonWALReader *) - palloc_extended(sizeof(NeonWALReader), - MCXT_ALLOC_NO_OOM | MCXT_ALLOC_ZERO); - if (!reader) - return NULL; + MemoryContextAllocZero(TopMemoryContext, sizeof(NeonWALReader)); reader->available_lsn = available_lsn; reader->seg.ws_file = -1; @@ -219,7 +220,8 @@ NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size cou return NEON_WALREAD_ERROR; } /* we'll poll immediately */ - state->rem_state = RS_CONNECTING_READ; + state->rem_state = RS_CONNECTING_WRITE; + return NEON_WALREAD_WOULDBLOCK; } if (state->rem_state == RS_CONNECTING_READ || state->rem_state == RS_CONNECTING_WRITE) diff --git a/pgxn/neon_test_utils/neon_test_utils--1.3.sql b/pgxn/neon_test_utils/neon_test_utils--1.3.sql index 3b8794a8cf..9a9b41c3a3 100644 --- a/pgxn/neon_test_utils/neon_test_utils--1.3.sql +++ b/pgxn/neon_test_utils/neon_test_utils--1.3.sql @@ -7,6 +7,12 @@ AS 'MODULE_PATHNAME', 'test_consume_xids' LANGUAGE C STRICT PARALLEL UNSAFE; +CREATE FUNCTION test_consume_oids(oid int) +RETURNS VOID +AS 'MODULE_PATHNAME', 'test_consume_oids' +LANGUAGE C STRICT +PARALLEL UNSAFE; + CREATE FUNCTION test_consume_cpu(seconds int) RETURNS VOID AS 'MODULE_PATHNAME', 'test_consume_cpu' diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index 650ef7405d..0b5499ca53 100644 --- a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -35,6 +35,7 @@ PG_MODULE_MAGIC; extern void _PG_init(void); PG_FUNCTION_INFO_V1(test_consume_xids); +PG_FUNCTION_INFO_V1(test_consume_oids); PG_FUNCTION_INFO_V1(test_consume_cpu); PG_FUNCTION_INFO_V1(test_consume_memory); PG_FUNCTION_INFO_V1(test_release_memory); @@ -74,6 +75,21 @@ _PG_init(void) #define neon_read_at_lsn neon_read_at_lsn_ptr +/* + * test_consume_oids(int4), for rapidly consuming OIDs, to test wraparound. + * Unlike test_consume_xids which is passed number of xids to be consumed, + * this function is given the target Oid. + */ +Datum +test_consume_oids(PG_FUNCTION_ARGS) +{ + int32 oid = PG_GETARG_INT32(0); + + while (oid != GetNewObjectId()); + + PG_RETURN_VOID(); +} + /* * test_consume_xids(int4), for rapidly consuming XIDs, to test wraparound. */ diff --git a/poetry.lock b/poetry.lock index 8091141411..5192a574cc 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2641,19 +2641,18 @@ pbr = "*" [[package]] name = "setuptools" -version = "65.5.1" +version = "70.0.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "setuptools-65.5.1-py3-none-any.whl", hash = "sha256:d0b9a8433464d5800cbe05094acf5c6d52a91bfac9b52bcfc4d41382be5d5d31"}, - {file = "setuptools-65.5.1.tar.gz", hash = "sha256:e197a19aa8ec9722928f2206f8de752def0e4c9fc6953527360d1c36d94ddb2f"}, + {file = "setuptools-70.0.0-py3-none-any.whl", hash = "sha256:54faa7f2e8d2d11bcd2c07bed282eef1046b5c080d1c32add737d7b5817b1ad4"}, + {file = "setuptools-70.0.0.tar.gz", hash = "sha256:f211a66637b8fa059bb28183da127d4e86396c991a942b028c6650d4319c3fd0"}, ] [package.extras] -docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"] -testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] -testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] +docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] +testing = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] [[package]] name = "six" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 288f7769fe..2f18b5fbc6 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -92,6 +92,7 @@ tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true tracing.workspace = true +typed-json.workspace = true url.workspace = true urlencoding.workspace = true utils.workspace = true diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index f757a15fbb..67c4dd019e 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -717,8 +717,10 @@ mod tests { _ => panic!("wrong message"), } }); - let endpoint_rate_limiter = - Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET)); + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( + EndpointRateLimiter::DEFAULT, + 64, + )); let _creds = auth_quirks( &mut ctx, @@ -767,8 +769,10 @@ mod tests { frontend::password_message(b"my-secret-password", &mut write).unwrap(); client.write_all(&write).await.unwrap(); }); - let endpoint_rate_limiter = - Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET)); + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( + EndpointRateLimiter::DEFAULT, + 64, + )); let _creds = auth_quirks( &mut ctx, @@ -818,8 +822,10 @@ mod tests { client.write_all(&write).await.unwrap(); }); - let endpoint_rate_limiter = - Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET)); + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( + EndpointRateLimiter::DEFAULT, + 64, + )); let creds = auth_quirks( &mut ctx, diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index ece1af1ac4..696940728a 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -22,7 +22,9 @@ use proxy::http; use proxy::http::health_server::AppMetrics; use proxy::metrics::Metrics; use proxy::rate_limiter::EndpointRateLimiter; +use proxy::rate_limiter::LeakyBucketConfig; use proxy::rate_limiter::RateBucketInfo; +use proxy::rate_limiter::WakeComputeRateLimiter; use proxy::redis::cancellation_publisher::RedisPublisherClient; use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; use proxy::redis::elasticache; @@ -176,6 +178,9 @@ struct ProxyCliArgs { /// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections) #[clap(long)] redis_notifications: Option, + /// what from the available authentications type to use for the regional redis we have. Supported are "irsa" and "plain". + #[clap(long, default_value = "irsa")] + redis_auth_type: String, /// redis host for streaming connections (might be different from the notifications host) #[clap(long)] redis_host: Option, @@ -319,24 +324,38 @@ async fn main() -> anyhow::Result<()> { ), aws_credentials_provider, )); - let regional_redis_client = match (args.redis_host, args.redis_port) { - (Some(host), Some(port)) => Some( - ConnectionWithCredentialsProvider::new_with_credentials_provider( - host, - port, - elasticache_credentials_provider.clone(), + let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) { + ("plain", redis_url) => match redis_url { + None => { + bail!("plain auth requires redis_notifications to be set"); + } + Some(url) => Some( + ConnectionWithCredentialsProvider::new_with_static_credentials(url.to_string()), ), - ), - (None, None) => { - warn!("Redis events from console are disabled"); - None - } + }, + ("irsa", _) => match (&args.redis_host, args.redis_port) { + (Some(host), Some(port)) => Some( + ConnectionWithCredentialsProvider::new_with_credentials_provider( + host.to_string(), + port, + elasticache_credentials_provider.clone(), + ), + ), + (None, None) => { + warn!("irsa auth requires redis-host and redis-port to be set, continuing without regional_redis_client"); + None + } + _ => { + bail!("redis-host and redis-port must be specified together"); + } + }, _ => { - bail!("redis-host and redis-port must be specified together"); + bail!("unknown auth type given"); } }; + let redis_notifications_client = if let Some(url) = args.redis_notifications { - Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url)) + Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url.to_string())) } else { regional_redis_client.clone() }; @@ -373,9 +392,24 @@ async fn main() -> anyhow::Result<()> { proxy::metrics::CancellationSource::FromClient, )); - let mut endpoint_rps_limit = args.endpoint_rps_limit.clone(); - RateBucketInfo::validate(&mut endpoint_rps_limit)?; - let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(endpoint_rps_limit)); + // bit of a hack - find the min rps and max rps supported and turn it into + // leaky bucket config instead + let max = args + .endpoint_rps_limit + .iter() + .map(|x| x.rps()) + .max_by(f64::total_cmp) + .unwrap_or(EndpointRateLimiter::DEFAULT.max); + let rps = args + .endpoint_rps_limit + .iter() + .map(|x| x.rps()) + .min_by(f64::total_cmp) + .unwrap_or(EndpointRateLimiter::DEFAULT.rps); + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( + LeakyBucketConfig { rps, max }, + 64, + )); // client facing tasks. these will exit on error or on cancellation // cancellation returns Ok(()) @@ -577,7 +611,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); RateBucketInfo::validate(&mut wake_compute_rps_limit)?; let wake_compute_endpoint_rate_limiter = - Arc::new(EndpointRateLimiter::new(wake_compute_rps_limit)); + Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); let api = console::provider::neon::Api::new( endpoint, caches, diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index feb09d5638..de04733eb9 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -307,7 +307,7 @@ impl ConnCfg { let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); let (client, connection) = self.0.connect_raw(stream, tls).await?; drop(pause); - tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id())); + tracing::Span::current().record("pid", tracing::field::display(client.get_process_id())); let stream = connection.stream.into_inner(); info!( diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index a6e67be22f..768cd2fdfa 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -12,7 +12,7 @@ use crate::{ console::messages::{ColdStartInfo, Reason}, http, metrics::{CacheOutcome, Metrics}, - rate_limiter::EndpointRateLimiter, + rate_limiter::WakeComputeRateLimiter, scram, EndpointCacheKey, }; use crate::{cache::Cached, context::RequestMonitoring}; @@ -26,7 +26,7 @@ pub struct Api { endpoint: http::Endpoint, pub caches: &'static ApiCaches, pub locks: &'static ApiLocks, - pub wake_compute_endpoint_rate_limiter: Arc, + pub wake_compute_endpoint_rate_limiter: Arc, jwt: String, } @@ -36,7 +36,7 @@ impl Api { endpoint: http::Endpoint, caches: &'static ApiCaches, locks: &'static ApiLocks, - wake_compute_endpoint_rate_limiter: Arc, + wake_compute_endpoint_rate_limiter: Arc, ) -> Self { let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") { Ok(v) => v, diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index cfc1f8e89e..543a458274 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -181,8 +181,9 @@ pub async fn worker( let rx = futures::stream::poll_fn(move |cx| rx.poll_recv(cx)); let rx = rx.map(RequestData::from); - let storage = - GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?; + let storage = GenericRemoteStorage::from_config(&remote_storage_config) + .await + .context("remote storage init")?; let properties = WriterProperties::builder() .set_data_page_size_limit(config.parquet_upload_page_size) @@ -217,6 +218,7 @@ pub async fn worker( let storage_disconnect = GenericRemoteStorage::from_config(&disconnect_events_storage_config) + .await .context("remote storage for disconnect events init")?; let parquet_config_disconnect = parquet_config.clone(); tokio::try_join!( @@ -545,7 +547,9 @@ mod tests { }, timeout: std::time::Duration::from_secs(120), }; - let storage = GenericRemoteStorage::from_config(&remote_storage_config).unwrap(); + let storage = GenericRemoteStorage::from_config(&remote_storage_config) + .await + .unwrap(); worker_inner(storage, rx, config).await.unwrap(); diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index be9072dd8c..222cd431d2 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -3,4 +3,8 @@ mod limiter; pub use limit_algorithm::{ aimd::Aimd, DynamicLimiter, Outcome, RateLimitAlgorithm, RateLimiterConfig, Token, }; -pub use limiter::{BucketRateLimiter, EndpointRateLimiter, GlobalRateLimiter, RateBucketInfo}; +pub use limiter::{BucketRateLimiter, GlobalRateLimiter, RateBucketInfo, WakeComputeRateLimiter}; +mod leaky_bucket; +pub use leaky_bucket::{ + EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter, LeakyBucketState, +}; diff --git a/proxy/src/rate_limiter/leaky_bucket.rs b/proxy/src/rate_limiter/leaky_bucket.rs new file mode 100644 index 0000000000..2d5e056540 --- /dev/null +++ b/proxy/src/rate_limiter/leaky_bucket.rs @@ -0,0 +1,171 @@ +use std::{ + hash::Hash, + sync::atomic::{AtomicUsize, Ordering}, +}; + +use ahash::RandomState; +use dashmap::DashMap; +use rand::{thread_rng, Rng}; +use tokio::time::Instant; +use tracing::info; + +use crate::intern::EndpointIdInt; + +// Simple per-endpoint rate limiter. +pub type EndpointRateLimiter = LeakyBucketRateLimiter; + +pub struct LeakyBucketRateLimiter { + map: DashMap, + config: LeakyBucketConfig, + access_count: AtomicUsize, +} + +impl LeakyBucketRateLimiter { + pub const DEFAULT: LeakyBucketConfig = LeakyBucketConfig { + rps: 600.0, + max: 1500.0, + }; + + pub fn new_with_shards(config: LeakyBucketConfig, shards: usize) -> Self { + Self { + map: DashMap::with_hasher_and_shard_amount(RandomState::new(), shards), + config, + access_count: AtomicUsize::new(0), + } + } + + /// Check that number of connections to the endpoint is below `max_rps` rps. + pub fn check(&self, key: K, n: u32) -> bool { + let now = Instant::now(); + + if self.access_count.fetch_add(1, Ordering::AcqRel) % 2048 == 0 { + self.do_gc(now); + } + + let mut entry = self.map.entry(key).or_insert_with(|| LeakyBucketState { + time: now, + filled: 0.0, + }); + + entry.check(&self.config, now, n as f64) + } + + fn do_gc(&self, now: Instant) { + info!( + "cleaning up bucket rate limiter, current size = {}", + self.map.len() + ); + let n = self.map.shards().len(); + let shard = thread_rng().gen_range(0..n); + self.map.shards()[shard] + .write() + .retain(|_, value| !value.get_mut().update(&self.config, now)); + } +} + +pub struct LeakyBucketConfig { + pub rps: f64, + pub max: f64, +} + +pub struct LeakyBucketState { + filled: f64, + time: Instant, +} + +impl LeakyBucketConfig { + pub fn new(rps: f64, max: f64) -> Self { + assert!(rps > 0.0, "rps must be positive"); + assert!(max > 0.0, "max must be positive"); + Self { rps, max } + } +} + +impl LeakyBucketState { + pub fn new() -> Self { + Self { + filled: 0.0, + time: Instant::now(), + } + } + + /// updates the timer and returns true if the bucket is empty + fn update(&mut self, info: &LeakyBucketConfig, now: Instant) -> bool { + let drain = now.duration_since(self.time); + let drain = drain.as_secs_f64() * info.rps; + + self.filled = (self.filled - drain).clamp(0.0, info.max); + self.time = now; + + self.filled == 0.0 + } + + pub fn check(&mut self, info: &LeakyBucketConfig, now: Instant, n: f64) -> bool { + self.update(info, now); + + if self.filled + n > info.max { + return false; + } + self.filled += n; + + true + } +} + +impl Default for LeakyBucketState { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use tokio::time::Instant; + + use super::{LeakyBucketConfig, LeakyBucketState}; + + #[tokio::test(start_paused = true)] + async fn check() { + let info = LeakyBucketConfig::new(500.0, 2000.0); + let mut bucket = LeakyBucketState::new(); + + // should work for 2000 requests this second + for _ in 0..2000 { + assert!(bucket.check(&info, Instant::now(), 1.0)); + } + assert!(!bucket.check(&info, Instant::now(), 1.0)); + assert_eq!(bucket.filled, 2000.0); + + // in 1ms we should drain 0.5 tokens. + // make sure we don't lose any tokens + tokio::time::advance(Duration::from_millis(1)).await; + assert!(!bucket.check(&info, Instant::now(), 1.0)); + tokio::time::advance(Duration::from_millis(1)).await; + assert!(bucket.check(&info, Instant::now(), 1.0)); + + // in 10ms we should drain 5 tokens + tokio::time::advance(Duration::from_millis(10)).await; + for _ in 0..5 { + assert!(bucket.check(&info, Instant::now(), 1.0)); + } + assert!(!bucket.check(&info, Instant::now(), 1.0)); + + // in 10s we should drain 5000 tokens + // but cap is only 2000 + tokio::time::advance(Duration::from_secs(10)).await; + for _ in 0..2000 { + assert!(bucket.check(&info, Instant::now(), 1.0)); + } + assert!(!bucket.check(&info, Instant::now(), 1.0)); + + // should sustain 500rps + for _ in 0..2000 { + tokio::time::advance(Duration::from_millis(10)).await; + for _ in 0..5 { + assert!(bucket.check(&info, Instant::now(), 1.0)); + } + } + } +} diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index b8c9490696..5db4efed37 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -61,7 +61,7 @@ impl GlobalRateLimiter { // Purposefully ignore user name and database name as clients can reconnect // with different names, so we'll end up sending some http requests to // the control plane. -pub type EndpointRateLimiter = BucketRateLimiter; +pub type WakeComputeRateLimiter = BucketRateLimiter; pub struct BucketRateLimiter { map: DashMap, Hasher>, @@ -103,7 +103,7 @@ pub struct RateBucketInfo { impl std::fmt::Display for RateBucketInfo { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let rps = (self.max_rpi as u64) * 1000 / self.interval.as_millis() as u64; + let rps = self.rps().floor() as u64; write!(f, "{rps}@{}", humantime::format_duration(self.interval)) } } @@ -140,6 +140,10 @@ impl RateBucketInfo { Self::new(200, Duration::from_secs(600)), ]; + pub fn rps(&self) -> f64 { + (self.max_rpi as f64) / self.interval.as_secs_f64() + } + pub fn validate(info: &mut [Self]) -> anyhow::Result<()> { info.sort_unstable_by_key(|info| info.interval); let invalid = info @@ -245,7 +249,7 @@ mod tests { use rustc_hash::FxHasher; use tokio::time; - use super::{BucketRateLimiter, EndpointRateLimiter}; + use super::{BucketRateLimiter, WakeComputeRateLimiter}; use crate::{intern::EndpointIdInt, rate_limiter::RateBucketInfo, EndpointId}; #[test] @@ -293,7 +297,7 @@ mod tests { .map(|s| s.parse().unwrap()) .collect(); RateBucketInfo::validate(&mut rates).unwrap(); - let limiter = EndpointRateLimiter::new(rates); + let limiter = WakeComputeRateLimiter::new(rates); let endpoint = EndpointId::from("ep-my-endpoint-1234"); let endpoint = EndpointIdInt::from(endpoint); diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs index 7baf104374..c9a946fa4a 100644 --- a/proxy/src/redis/cancellation_publisher.rs +++ b/proxy/src/redis/cancellation_publisher.rs @@ -106,7 +106,7 @@ impl RedisPublisherClient { cancel_key_data, session_id, }))?; - self.client.publish(PROXY_CHANNEL_NAME, payload).await?; + let _: () = self.client.publish(PROXY_CHANNEL_NAME, payload).await?; Ok(()) } pub async fn try_connect(&mut self) -> anyhow::Result<()> { diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs index 3a90d911c2..b02ce472c0 100644 --- a/proxy/src/redis/connection_with_credentials_provider.rs +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -178,7 +178,7 @@ impl ConnectionWithCredentialsProvider { credentials_provider: Arc, ) -> anyhow::Result<()> { let (user, password) = credentials_provider.provide_credentials().await?; - redis::cmd("AUTH") + let _: () = redis::cmd("AUTH") .arg(user) .arg(password) .query_async(con) diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 87d723d17e..efd7437d5d 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -127,7 +127,7 @@ impl MessageHandler { Cancel(cancel_session) => { tracing::Span::current().record( "session_id", - &tracing::field::display(cancel_session.session_id), + tracing::field::display(cancel_session.session_id), ); Metrics::get() .proxy diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index c73ea8646a..2e9bb7d1ed 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -241,7 +241,7 @@ impl ConnectMechanism for TokioMechanism { drop(pause); let (client, connection) = permit.release_result(res)?; - tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id())); + tracing::Span::current().record("pid", tracing::field::display(client.get_process_id())); Ok(poll_client( self.pool.clone(), ctx, diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 170bda062e..dbc58d48ec 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -403,7 +403,7 @@ impl GlobalConnPool { tracing::Span::current().record("conn_id", tracing::field::display(client.conn_id)); tracing::Span::current().record( "pid", - &tracing::field::display(client.inner.get_process_id()), + tracing::field::display(client.inner.get_process_id()), ); info!( cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 9c4c7a1a7a..c7fbfddf5e 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -18,7 +18,7 @@ use hyper1::Response; use hyper1::StatusCode; use hyper1::{HeaderMap, Request}; use pq_proto::StartupMessageParamsBuilder; -use serde_json::json; +use serde::Serialize; use serde_json::Value; use tokio::time; use tokio_postgres::error::DbError; @@ -32,6 +32,7 @@ use tokio_postgres::Transaction; use tokio_util::sync::CancellationToken; use tracing::error; use tracing::info; +use typed_json::json; use url::Url; use utils::http::error::ApiError; @@ -262,13 +263,8 @@ pub async fn handle( | SqlOverHttpError::Postgres(e) => e.as_db_error(), _ => None, }; - fn get<'a, T: serde::Serialize>( - db: Option<&'a DbError>, - x: impl FnOnce(&'a DbError) -> T, - ) -> Value { - db.map(x) - .and_then(|t| serde_json::to_value(t).ok()) - .unwrap_or_default() + fn get<'a, T: Default>(db: Option<&'a DbError>, x: impl FnOnce(&'a DbError) -> T) -> T { + db.map(x).unwrap_or_default() } if let Some(db_error) = db_error { @@ -277,17 +273,11 @@ pub async fn handle( let position = db_error.and_then(|db| db.position()); let (position, internal_position, internal_query) = match position { - Some(ErrorPosition::Original(position)) => ( - Value::String(position.to_string()), - Value::Null, - Value::Null, - ), - Some(ErrorPosition::Internal { position, query }) => ( - Value::Null, - Value::String(position.to_string()), - Value::String(query.clone()), - ), - None => (Value::Null, Value::Null, Value::Null), + Some(ErrorPosition::Original(position)) => (Some(position.to_string()), None, None), + Some(ErrorPosition::Internal { position, query }) => { + (None, Some(position.to_string()), Some(query.clone())) + } + None => (None, None, None), }; let code = get(db_error, |db| db.code().code()); @@ -577,10 +567,8 @@ async fn handle_inner( .status(StatusCode::OK) .header(header::CONTENT_TYPE, "application/json"); - // - // Now execute the query and return the result - // - let result = match payload { + // Now execute the query and return the result. + let json_output = match payload { Payload::Single(stmt) => stmt.process(cancel, &mut client, parsed_headers).await?, Payload::Batch(statements) => { if parsed_headers.txn_read_only { @@ -604,11 +592,9 @@ async fn handle_inner( let metrics = client.metrics(); - // how could this possibly fail - let body = serde_json::to_string(&result).expect("json serialization should not fail"); - let len = body.len(); + let len = json_output.len(); let response = response - .body(Full::new(Bytes::from(body))) + .body(Full::new(Bytes::from(json_output))) // only fails if invalid status code or invalid header/values are given. // these are not user configurable so it cannot fail dynamically .expect("building response payload should not fail"); @@ -630,7 +616,7 @@ impl QueryData { cancel: CancellationToken, client: &mut Client, parsed_headers: HttpHeaders, - ) -> Result { + ) -> Result { let (inner, mut discard) = client.inner(); let cancel_token = inner.cancel_token(); @@ -643,7 +629,10 @@ impl QueryData { // The query successfully completed. Either::Left((Ok((status, results)), __not_yet_cancelled)) => { discard.check_idle(status); - Ok(results) + + let json_output = + serde_json::to_string(&results).expect("json serialization should not fail"); + Ok(json_output) } // The query failed with an error Either::Left((Err(e), __not_yet_cancelled)) => { @@ -661,7 +650,10 @@ impl QueryData { // query successed before it was cancelled. Ok(Ok((status, results))) => { discard.check_idle(status); - Ok(results) + + let json_output = serde_json::to_string(&results) + .expect("json serialization should not fail"); + Ok(json_output) } // query failed or was cancelled. Ok(Err(error)) => { @@ -695,7 +687,7 @@ impl BatchQueryData { cancel: CancellationToken, client: &mut Client, parsed_headers: HttpHeaders, - ) -> Result { + ) -> Result { info!("starting transaction"); let (inner, mut discard) = client.inner(); let cancel_token = inner.cancel_token(); @@ -717,9 +709,9 @@ impl BatchQueryData { e })?; - let results = + let json_output = match query_batch(cancel.child_token(), &transaction, self, parsed_headers).await { - Ok(results) => { + Ok(json_output) => { info!("commit"); let status = transaction.commit().await.map_err(|e| { // if we cannot commit - for now don't return connection to pool @@ -728,7 +720,7 @@ impl BatchQueryData { e })?; discard.check_idle(status); - results + json_output } Err(SqlOverHttpError::Cancelled(_)) => { if let Err(err) = cancel_token.cancel_query(NoTls).await { @@ -752,7 +744,7 @@ impl BatchQueryData { } }; - Ok(json!({ "results": results })) + Ok(json_output) } } @@ -761,7 +753,7 @@ async fn query_batch( transaction: &Transaction<'_>, queries: BatchQueryData, parsed_headers: HttpHeaders, -) -> Result, SqlOverHttpError> { +) -> Result { let mut results = Vec::with_capacity(queries.queries.len()); let mut current_size = 0; for stmt in queries.queries { @@ -786,7 +778,11 @@ async fn query_batch( } } } - Ok(results) + + let results = json!({ "results": results }); + let json_output = serde_json::to_string(&results).expect("json serialization should not fail"); + + Ok(json_output) } async fn query_to_json( @@ -794,7 +790,7 @@ async fn query_to_json( data: QueryData, current_size: &mut usize, parsed_headers: HttpHeaders, -) -> Result<(ReadyForQueryStatus, Value), SqlOverHttpError> { +) -> Result<(ReadyForQueryStatus, impl Serialize), SqlOverHttpError> { info!("executing query"); let query_params = data.params; let mut row_stream = std::pin::pin!(client.query_raw_txt(&data.query, query_params).await?); @@ -843,8 +839,8 @@ async fn query_to_json( for c in row_stream.columns() { fields.push(json!({ - "name": Value::String(c.name().to_owned()), - "dataTypeID": Value::Number(c.type_().oid().into()), + "name": c.name().to_owned(), + "dataTypeID": c.type_().oid(), "tableID": c.table_oid(), "columnID": c.column_id(), "dataTypeSize": c.type_size(), @@ -862,15 +858,14 @@ async fn query_to_json( .map(|row| pg_text_row_to_json(row, &columns, parsed_headers.raw_output, array_mode)) .collect::, _>>()?; - // resulting JSON format is based on the format of node-postgres result - Ok(( - ready, - json!({ - "command": command_tag_name, - "rowCount": command_tag_count, - "rows": rows, - "fields": fields, - "rowAsArray": array_mode, - }), - )) + // Resulting JSON format is based on the format of node-postgres result. + let results = json!({ + "command": command_tag_name.to_string(), + "rowCount": command_tag_count, + "rows": rows, + "fields": fields, + "rowAsArray": array_mode, + }); + + Ok((ready, results)) } diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index 56ed2145dc..a8735fe0bb 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -357,11 +357,15 @@ pub async fn task_backup( info!("metrics backup has shut down"); } // Even if the remote storage is not configured, we still want to clear the metrics. - let storage = backup_config - .remote_storage_config - .as_ref() - .map(|config| GenericRemoteStorage::from_config(config).context("remote storage init")) - .transpose()?; + let storage = if let Some(config) = backup_config.remote_storage_config.as_ref() { + Some( + GenericRemoteStorage::from_config(config) + .await + .context("remote storage init")?, + ) + } else { + None + }; let mut ticker = tokio::time::interval(backup_config.interval); let mut prev = Utc::now(); let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned(); diff --git a/proxy/src/waiters.rs b/proxy/src/waiters.rs index bba5494cfe..888ad38048 100644 --- a/proxy/src/waiters.rs +++ b/proxy/src/waiters.rs @@ -111,7 +111,7 @@ mod tests { let waiters = Arc::clone(&waiters); let notifier = tokio::spawn(async move { - waiters.notify(key, Default::default())?; + waiters.notify(key, ())?; Ok(()) }); diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index a650d5e207..0fdb3147bf 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -23,7 +23,6 @@ clap = { workspace = true, features = ["derive"] } const_format.workspace = true crc32c.workspace = true fail.workspace = true -fs2.workspace = true git-version.workspace = true hex.workspace = true humantime.workspace = true @@ -41,6 +40,8 @@ serde.workspace = true serde_json.workspace = true serde_with.workspace = true signal-hook.workspace = true +strum.workspace = true +strum_macros.workspace = true thiserror.workspace = true tokio = { workspace = true, features = ["fs"] } tokio-util = { workspace = true } diff --git a/safekeeper/src/auth.rs b/safekeeper/src/auth.rs index dd9058c468..b8bc3f3e06 100644 --- a/safekeeper/src/auth.rs +++ b/safekeeper/src/auth.rs @@ -12,13 +12,15 @@ pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result< } Ok(()) } - (Scope::Admin | Scope::PageServerApi | Scope::GenerationsApi, _) => Err(AuthError( - format!( - "JWT scope '{:?}' is ineligible for Safekeeper auth", - claims.scope - ) - .into(), - )), + (Scope::Admin | Scope::PageServerApi | Scope::GenerationsApi | Scope::Scrubber, _) => { + Err(AuthError( + format!( + "JWT scope '{:?}' is ineligible for Safekeeper auth", + claims.scope + ) + .into(), + )) + } (Scope::SafekeeperData, _) => Ok(()), } } diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 9eb6546d6b..2365fd0587 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -418,7 +418,7 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { let timeline_collector = safekeeper::metrics::TimelineCollector::new(); metrics::register_internal(Box::new(timeline_collector))?; - wal_backup::init_remote_storage(&conf); + wal_backup::init_remote_storage(&conf).await; // Keep handles to main tasks to die if any of them disappears. let mut tasks_handles: FuturesUnordered> = diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs index 14bd3c03b8..220988c3ce 100644 --- a/safekeeper/src/copy_timeline.rs +++ b/safekeeper/src/copy_timeline.rs @@ -74,10 +74,16 @@ pub async fn handle_request(request: Request) -> Result<()> { assert!(flush_lsn >= start_lsn); if request.until_lsn > flush_lsn { - bail!("requested LSN is beyond the end of the timeline"); + bail!(format!( + "requested LSN {} is beyond the end of the timeline {}", + request.until_lsn, flush_lsn + )); } if request.until_lsn < start_lsn { - bail!("requested LSN is before the start of the timeline"); + bail!(format!( + "requested LSN {} is before the start of the timeline {}", + request.until_lsn, start_lsn + )); } if request.until_lsn > commit_lsn { diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index af83feb77f..8f2920ada3 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -173,15 +173,6 @@ pub static BROKER_RUNTIME: Lazy = Lazy::new(|| { .expect("Failed to create broker runtime") }); -pub static WAL_REMOVER_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("WAL remover") - .worker_threads(1) - .enable_all() - .build() - .expect("Failed to create broker runtime") -}); - pub static WAL_BACKUP_RUNTIME: Lazy = Lazy::new(|| { tokio::runtime::Builder::new_multi_thread() .thread_name("WAL backup worker") @@ -189,12 +180,3 @@ pub static WAL_BACKUP_RUNTIME: Lazy = Lazy::new(|| { .build() .expect("Failed to create WAL backup runtime") }); - -pub static METRICS_SHIFTER_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("metric shifter") - .worker_threads(1) - .enable_all() - .build() - .expect("Failed to create broker runtime") -}); diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 539ecf826b..aa2bafbe92 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -205,6 +205,32 @@ pub static WAL_BACKUP_TASKS: Lazy = Lazy::new(|| { .expect("Failed to register safekeeper_wal_backup_tasks_finished_total counter") }); +// Metrics collected on operations on the storage repository. +#[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)] +#[strum(serialize_all = "kebab_case")] +pub(crate) enum EvictionEvent { + Evict, + Restore, +} + +pub(crate) static EVICTION_EVENTS_STARTED: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "safekeeper_eviction_events_started_total", + "Number of eviction state changes, incremented when they start", + &["kind"] + ) + .expect("Failed to register metric") +}); + +pub(crate) static EVICTION_EVENTS_COMPLETED: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "safekeeper_eviction_events_completed_total", + "Number of eviction state changes, incremented when they complete", + &["kind"] + ) + .expect("Failed to register metric") +}); + pub const LABEL_UNKNOWN: &str = "unknown"; /// Labels for traffic metrics. diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index e4ab65290d..7947d83eb4 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -14,6 +14,7 @@ use tracing::{debug, info, instrument, warn}; use utils::crashsafe::durable_rename; use crate::{ + metrics::{EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED}, timeline_manager::{Manager, StateSnapshot}, wal_backup, wal_backup_partial::{self, PartialRemoteSegment}, @@ -66,6 +67,15 @@ impl Manager { info!("starting eviction, using {:?}", partial_backup_uploaded); + EVICTION_EVENTS_STARTED + .with_label_values(&[EvictionEvent::Evict.into()]) + .inc(); + let _guard = scopeguard::guard((), |_| { + EVICTION_EVENTS_COMPLETED + .with_label_values(&[EvictionEvent::Evict.into()]) + .inc(); + }); + if let Err(e) = do_eviction(self, &partial_backup_uploaded).await { warn!("failed to evict timeline: {:?}", e); return; @@ -88,6 +98,15 @@ impl Manager { info!("starting uneviction, using {:?}", partial_backup_uploaded); + EVICTION_EVENTS_STARTED + .with_label_values(&[EvictionEvent::Restore.into()]) + .inc(); + let _guard = scopeguard::guard((), |_| { + EVICTION_EVENTS_COMPLETED + .with_label_values(&[EvictionEvent::Restore.into()]) + .inc(); + }); + if let Err(e) = do_uneviction(self, &partial_backup_uploaded).await { warn!("failed to unevict timeline: {:?}", e); return; @@ -180,10 +199,7 @@ async fn redownload_partial_segment( file.flush().await?; let final_path = local_segment_path(mgr, partial); - info!( - "downloaded {} bytes, renaming to {}", - final_path, final_path, - ); + info!("downloaded {actual_len} bytes, renaming to {final_path}"); if let Err(e) = durable_rename(&tmp_file, &final_path, !mgr.conf.no_sync).await { // Probably rename succeeded, but fsync of it failed. Remove // the file then to avoid using it. diff --git a/safekeeper/src/timeline_guard.rs b/safekeeper/src/timeline_guard.rs index e249c859b4..dbdf46412d 100644 --- a/safekeeper/src/timeline_guard.rs +++ b/safekeeper/src/timeline_guard.rs @@ -4,7 +4,7 @@ use std::collections::HashSet; -use tracing::{debug, warn}; +use tracing::debug; use crate::timeline_manager::ManagerCtlMessage; @@ -23,7 +23,7 @@ impl Drop for ResidenceGuard { .manager_tx .send(ManagerCtlMessage::GuardDrop(self.guard_id)); if let Err(e) = res { - warn!("failed to send GuardDrop message: {:?}", e); + debug!("failed to send GuardDrop message: {:?}", e); } } } diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 9ea048a3c7..7ecee178f3 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -22,7 +22,7 @@ use tokio::fs::File; use tokio::select; use tokio::sync::mpsc::{self, Receiver, Sender}; -use tokio::sync::watch; +use tokio::sync::{watch, OnceCell}; use tokio::time::sleep; use tracing::*; @@ -33,8 +33,6 @@ use crate::timeline::{PeerInfo, WalResidentTimeline}; use crate::timeline_manager::{Manager, StateSnapshot}; use crate::{SafeKeeperConf, WAL_BACKUP_RUNTIME}; -use once_cell::sync::OnceCell; - const UPLOAD_FAILURE_RETRY_MIN_MS: u64 = 10; const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000; @@ -119,6 +117,7 @@ async fn shut_down_task(entry: &mut Option) { /// time we have several ones as they PUT the same files. Also, /// - frequently changing the offloader would be bad; /// - electing seriously lagging safekeeper is undesirable; +/// /// So we deterministically choose among the reasonably caught up candidates. /// TODO: take into account failed attempts to deal with hypothetical situation /// where s3 is unreachable only for some sks. @@ -166,7 +165,7 @@ fn determine_offloader( } } -static REMOTE_STORAGE: OnceCell> = OnceCell::new(); +static REMOTE_STORAGE: OnceCell> = OnceCell::const_new(); // Storage must be configured and initialized when this is called. fn get_configured_remote_storage() -> &'static GenericRemoteStorage { @@ -177,14 +176,22 @@ fn get_configured_remote_storage() -> &'static GenericRemoteStorage { .unwrap() } -pub fn init_remote_storage(conf: &SafeKeeperConf) { +pub async fn init_remote_storage(conf: &SafeKeeperConf) { // TODO: refactor REMOTE_STORAGE to avoid using global variables, and provide // dependencies to all tasks instead. - REMOTE_STORAGE.get_or_init(|| { - conf.remote_storage - .as_ref() - .map(|c| GenericRemoteStorage::from_config(c).expect("failed to create remote storage")) - }); + REMOTE_STORAGE + .get_or_init(|| async { + if let Some(conf) = conf.remote_storage.as_ref() { + Some( + GenericRemoteStorage::from_config(conf) + .await + .expect("failed to create remote storage"), + ) + } else { + None + } + }) + .await; } struct WalBackupTask { diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index 825851c97c..b1efa9749f 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -289,6 +289,18 @@ impl PartialBackup { }) .collect(); + if new_segments.len() == 1 { + // we have an uploaded segment, it must not be deleted from remote storage + segments_to_delete.retain(|name| name != &new_segments[0].name); + } else { + // there should always be zero or one uploaded segment + assert!( + new_segments.is_empty(), + "too many uploaded segments: {:?}", + new_segments + ); + } + info!("deleting objects: {:?}", segments_to_delete); let mut objects_to_delete = vec![]; for seg in segments_to_delete.iter() { diff --git a/storage_controller/client/Cargo.toml b/storage_controller/client/Cargo.toml new file mode 100644 index 0000000000..c3bfe2bfd2 --- /dev/null +++ b/storage_controller/client/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "storage_controller_client" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +pageserver_api.workspace = true +pageserver_client.workspace = true +thiserror.workspace = true +async-trait.workspace = true +reqwest.workspace = true +utils.workspace = true +serde.workspace = true +workspace_hack = { version = "0.1", path = "../../workspace_hack" } +tokio-postgres.workspace = true +tokio-stream.workspace = true +tokio.workspace = true +futures.workspace = true +tokio-util.workspace = true +anyhow.workspace = true +postgres.workspace = true +bytes.workspace = true diff --git a/storage_controller/client/src/control_api.rs b/storage_controller/client/src/control_api.rs new file mode 100644 index 0000000000..a981b5020e --- /dev/null +++ b/storage_controller/client/src/control_api.rs @@ -0,0 +1,62 @@ +use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt}; +use reqwest::{Method, Url}; +use serde::{de::DeserializeOwned, Serialize}; +use std::str::FromStr; + +pub struct Client { + base_url: Url, + jwt_token: Option, + client: reqwest::Client, +} + +impl Client { + pub fn new(base_url: Url, jwt_token: Option) -> Self { + Self { + base_url, + jwt_token, + client: reqwest::ClientBuilder::new() + .build() + .expect("Failed to construct http client"), + } + } + + /// Simple HTTP request wrapper for calling into storage controller + pub async fn dispatch( + &self, + method: Method, + path: String, + body: Option, + ) -> mgmt_api::Result + where + RQ: Serialize + Sized, + RS: DeserializeOwned + Sized, + { + // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out + // for general purpose API access. + let url = Url::from_str(&format!( + "http://{}:{}/{path}", + self.base_url.host_str().unwrap(), + self.base_url.port().unwrap() + )) + .unwrap(); + + let mut builder = self.client.request(method, url); + if let Some(body) = body { + builder = builder.json(&body) + } + if let Some(jwt_token) = &self.jwt_token { + builder = builder.header( + reqwest::header::AUTHORIZATION, + format!("Bearer {jwt_token}"), + ); + } + + let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?; + let response = response.error_from_body().await?; + + response + .json() + .await + .map_err(pageserver_client::mgmt_api::Error::ReceiveBody) + } +} diff --git a/storage_controller/client/src/lib.rs b/storage_controller/client/src/lib.rs new file mode 100644 index 0000000000..6d5e202942 --- /dev/null +++ b/storage_controller/client/src/lib.rs @@ -0,0 +1 @@ +pub mod control_api; diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 7446ad53a2..8fb4be93e0 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -330,6 +330,22 @@ async fn handle_tenant_timeline_delete( .await } +async fn handle_tenant_timeline_detach_ancestor( + service: Arc, + req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + + let res = service + .tenant_timeline_detach_ancestor(tenant_id, timeline_id) + .await?; + + json_response(StatusCode::OK, res) +} + async fn handle_tenant_timeline_passthrough( service: Arc, req: Request, @@ -414,7 +430,7 @@ async fn handle_tenant_describe( service: Arc, req: Request, ) -> Result, ApiError> { - check_permissions(&req, Scope::Admin)?; + check_permissions(&req, Scope::Scrubber)?; let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; json_response(StatusCode::OK, service.tenant_describe(tenant_id)?) @@ -456,6 +472,14 @@ async fn handle_node_drop(req: Request) -> Result, ApiError json_response(StatusCode::OK, state.service.node_drop(node_id).await?) } +async fn handle_node_delete(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + let node_id: NodeId = parse_request_param(&req, "node_id")?; + json_response(StatusCode::OK, state.service.node_delete(node_id).await?) +} + async fn handle_node_configure(mut req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; @@ -878,6 +902,9 @@ pub fn make_router( .post("/control/v1/node", |r| { named_request_span(r, handle_node_register, RequestName("control_v1_node")) }) + .delete("/control/v1/node/:node_id", |r| { + named_request_span(r, handle_node_delete, RequestName("control_v1_node_delete")) + }) .get("/control/v1/node", |r| { named_request_span(r, handle_node_list, RequestName("control_v1_node")) }) @@ -995,6 +1022,16 @@ pub fn make_router( RequestName("v1_tenant_timeline"), ) }) + .put( + "/v1/tenant/:tenant_id/timeline/:timeline_id/detach_ancestor", + |r| { + tenant_service_handler( + r, + handle_tenant_timeline_detach_ancestor, + RequestName("v1_tenant_timeline_detach_ancestor"), + ) + }, + ) // Tenant detail GET passthrough to shard zero: .get("/v1/tenant/:tenant_id", |r| { tenant_service_handler( diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index f1eb0b30fc..789f96beb3 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -1,11 +1,11 @@ use anyhow::{anyhow, Context}; -use camino::Utf8PathBuf; use clap::Parser; use diesel::Connection; use metrics::launch_timestamp::LaunchTimestamp; use metrics::BuildInfo; use std::path::PathBuf; use std::sync::Arc; +use std::time::Duration; use storage_controller::http::make_router; use storage_controller::metrics::preinitialize_metrics; use storage_controller::persistence::Persistence; @@ -51,10 +51,6 @@ struct Cli { #[arg(long)] compute_hook_url: Option, - /// Path to the .json file to store state (will be created if it doesn't exist) - #[arg(short, long)] - path: Option, - /// URL to connect to postgres, like postgresql://localhost:1234/storage_controller #[arg(long)] database_url: Option, @@ -206,11 +202,10 @@ async fn async_main() -> anyhow::Result<()> { let args = Cli::parse(); tracing::info!( - "version: {}, launch_timestamp: {}, build_tag {}, state at {}, listening on {}", + "version: {}, launch_timestamp: {}, build_tag {}, listening on {}", GIT_VERSION, launch_ts.to_string(), BUILD_TAG, - args.path.as_ref().unwrap_or(&Utf8PathBuf::from("")), args.listen ); @@ -277,8 +272,7 @@ async fn async_main() -> anyhow::Result<()> { .await .context("Running database migrations")?; - let json_path = args.path; - let persistence = Arc::new(Persistence::new(secrets.database_url, json_path.clone())); + let persistence = Arc::new(Persistence::new(secrets.database_url)); let service = Service::spawn(config, persistence.clone()).await?; @@ -316,22 +310,23 @@ async fn async_main() -> anyhow::Result<()> { } tracing::info!("Terminating on signal"); - if json_path.is_some() { - // Write out a JSON dump on shutdown: this is used in compat tests to avoid passing - // full postgres dumps around. - if let Err(e) = persistence.write_tenants_json().await { - tracing::error!("Failed to write JSON on shutdown: {e}") + // Stop HTTP server first, so that we don't have to service requests + // while shutting down Service. + server_shutdown.cancel(); + match tokio::time::timeout(Duration::from_secs(5), server_task).await { + Ok(Ok(_)) => { + tracing::info!("Joined HTTP server task"); + } + Ok(Err(e)) => { + tracing::error!("Error joining HTTP server task: {e}") + } + Err(_) => { + tracing::warn!("Timed out joining HTTP server task"); + // We will fall through and shut down the service anyway, any request handlers + // in flight will experience cancellation & their clients will see a torn connection. } } - // Stop HTTP server first, so that we don't have to service requests - // while shutting down Service - server_shutdown.cancel(); - if let Err(e) = server_task.await { - tracing::error!("Error joining HTTP server task: {e}") - } - tracing::info!("Joined HTTP server task"); - service.shutdown().await; tracing::info!("Service shutdown complete"); diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index 769aba80ca..8d64201cd9 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -1,8 +1,9 @@ use pageserver_api::{ models::{ - LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress, - TenantScanRemoteStorageResponse, TenantShardSplitRequest, TenantShardSplitResponse, - TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, TopTenantShardsResponse, + detach_ancestor::AncestorDetached, LocationConfig, LocationConfigListResponse, + PageserverUtilization, SecondaryProgress, TenantScanRemoteStorageResponse, + TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo, + TopTenantShardsRequest, TopTenantShardsResponse, }, shard::TenantShardId, }; @@ -226,6 +227,21 @@ impl PageserverClient { ) } + pub(crate) async fn timeline_detach_ancestor( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + ) -> Result { + measured_request!( + "timeline_detach_ancestor", + crate::metrics::Method::Put, + &self.node_id_label, + self.inner + .timeline_detach_ancestor(tenant_shard_id, timeline_id) + .await + ) + } + pub(crate) async fn get_utilization(&self) -> Result { measured_request!( "utilization", diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 47caf7ae81..d8f31e86e5 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -5,8 +5,6 @@ use std::time::Duration; use std::time::Instant; use self::split_state::SplitState; -use camino::Utf8Path; -use camino::Utf8PathBuf; use diesel::pg::PgConnection; use diesel::prelude::*; use diesel::Connection; @@ -55,11 +53,6 @@ use crate::node::Node; /// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline. pub struct Persistence { connection_pool: diesel::r2d2::Pool>, - - // In test environments, we support loading+saving a JSON file. This is temporary, for the benefit of - // test_compatibility.py, so that we don't have to commit to making the database contents fully backward/forward - // compatible just yet. - json_path: Option, } /// Legacy format, for use in JSON compat objects in test environment @@ -124,7 +117,7 @@ impl Persistence { const IDLE_CONNECTION_TIMEOUT: Duration = Duration::from_secs(10); const MAX_CONNECTION_LIFETIME: Duration = Duration::from_secs(60); - pub fn new(database_url: String, json_path: Option) -> Self { + pub fn new(database_url: String) -> Self { let manager = diesel::r2d2::ConnectionManager::::new(database_url); // We will use a connection pool: this is primarily to _limit_ our connection count, rather than to optimize time @@ -139,10 +132,7 @@ impl Persistence { .build(manager) .expect("Could not build connection pool"); - Self { - connection_pool, - json_path, - } + Self { connection_pool } } /// A helper for use during startup, where we would like to tolerate concurrent restarts of the @@ -302,85 +292,13 @@ impl Persistence { /// At startup, load the high level state for shards, such as their config + policy. This will /// be enriched at runtime with state discovered on pageservers. pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult> { - let loaded = self - .with_measured_conn( - DatabaseOperation::ListTenantShards, - move |conn| -> DatabaseResult<_> { - Ok(crate::schema::tenant_shards::table.load::(conn)?) - }, - ) - .await?; - - if loaded.is_empty() { - if let Some(path) = &self.json_path { - if tokio::fs::try_exists(path) - .await - .map_err(|e| DatabaseError::Logical(format!("Error stat'ing JSON file: {e}")))? - { - tracing::info!("Importing from legacy JSON format at {path}"); - return self.list_tenant_shards_json(path).await; - } - } - } - Ok(loaded) - } - - /// Shim for automated compatibility tests: load tenants from a JSON file instead of database - pub(crate) async fn list_tenant_shards_json( - &self, - path: &Utf8Path, - ) -> DatabaseResult> { - let bytes = tokio::fs::read(path) - .await - .map_err(|e| DatabaseError::Logical(format!("Failed to load JSON: {e}")))?; - - let mut decoded = serde_json::from_slice::(&bytes) - .map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?; - for shard in decoded.tenants.values_mut() { - if shard.placement_policy == "\"Single\"" { - // Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165 - shard.placement_policy = "{\"Attached\":0}".to_string(); - } - - if shard.scheduling_policy.is_empty() { - shard.scheduling_policy = - serde_json::to_string(&ShardSchedulingPolicy::default()).unwrap(); - } - } - - let tenants: Vec = decoded.tenants.into_values().collect(); - - // Synchronize database with what is in the JSON file - self.insert_tenant_shards(tenants.clone()).await?; - - Ok(tenants) - } - - /// For use in testing environments, where we dump out JSON on shutdown. - pub async fn write_tenants_json(&self) -> anyhow::Result<()> { - let Some(path) = &self.json_path else { - anyhow::bail!("Cannot write JSON if path isn't set (test environment bug)"); - }; - tracing::info!("Writing state to {path}..."); - let tenants = self.list_tenant_shards().await?; - let mut tenants_map = HashMap::new(); - for tsp in tenants { - let tenant_shard_id = TenantShardId { - tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?, - shard_number: ShardNumber(tsp.shard_number as u8), - shard_count: ShardCount::new(tsp.shard_count as u8), - }; - - tenants_map.insert(tenant_shard_id, tsp); - } - let json = serde_json::to_string(&JsonPersistence { - tenants: tenants_map, - })?; - - tokio::fs::write(path, &json).await?; - tracing::info!("Wrote {} bytes to {path}...", json.len()); - - Ok(()) + self.with_measured_conn( + DatabaseOperation::ListTenantShards, + move |conn| -> DatabaseResult<_> { + Ok(crate::schema::tenant_shards::table.load::(conn)?) + }, + ) + .await } /// Tenants must be persisted before we schedule them for the first time. This enables us @@ -542,6 +460,7 @@ impl Persistence { Ok(Generation::new(g as u32)) } + #[allow(non_local_definitions)] /// For use when updating a persistent property of a tenant, such as its config or placement_policy. /// /// Do not use this for settting generation, unless in the special onboarding code path (/location_config) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index aada1939ee..2a6d5d3578 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -2,6 +2,7 @@ use std::{ borrow::Cow, cmp::Ordering, collections::{BTreeMap, HashMap, HashSet}, + ops::Deref, path::PathBuf, str::FromStr, sync::Arc, @@ -115,12 +116,15 @@ enum TenantOperations { SecondaryDownload, TimelineCreate, TimelineDelete, + AttachHook, + TimelineDetachAncestor, } #[derive(Clone, strum_macros::Display)] enum NodeOperations { Register, Configure, + Delete, } pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128; @@ -824,6 +828,8 @@ impl Service { ); } Err(err) => { + // Transition to active involves reconciling: if a node responds to a heartbeat then + // becomes unavailable again, we may get an error here. tracing::error!( "Failed to update node {} after heartbeat round: {}", node_id, @@ -845,9 +851,10 @@ impl Service { tenant_id=%result.tenant_shard_id.tenant_id, shard_id=%result.tenant_shard_id.shard_slug(), sequence=%result.sequence ))] - fn process_result(&self, result: ReconcileResult) { + fn process_result(&self, mut result: ReconcileResult) { let mut locked = self.inner.write().unwrap(); - let Some(tenant) = locked.tenants.get_mut(&result.tenant_shard_id) else { + let (nodes, tenants, _scheduler) = locked.parts_mut(); + let Some(tenant) = tenants.get_mut(&result.tenant_shard_id) else { // A reconciliation result might race with removing a tenant: drop results for // tenants that aren't in our map. return; @@ -864,6 +871,13 @@ impl Service { // Let the TenantShard know it is idle. tenant.reconcile_complete(result.sequence); + // In case a node was deleted while this reconcile is in flight, filter it out of the update we will + // make to the tenant + result + .observed + .locations + .retain(|node_id, _loc| nodes.contains_key(node_id)); + match result.result { Ok(()) => { for (node_id, loc) in &result.observed.locations { @@ -873,6 +887,7 @@ impl Service { tracing::info!("Setting observed location {} to None", node_id,) } } + tenant.observed = result.observed; tenant.waiter.advance(result.sequence); } @@ -1109,8 +1124,16 @@ impl Service { // We will populate intent properly later in [`Self::startup_reconcile`], initially populate // it with what we can infer: the node for which a generation was most recently issued. let mut intent = IntentState::new(); - if let Some(generation_pageserver) = tsp.generation_pageserver { - intent.set_attached(&mut scheduler, Some(NodeId(generation_pageserver as u64))); + if let Some(generation_pageserver) = tsp.generation_pageserver.map(|n| NodeId(n as u64)) + { + if nodes.contains_key(&generation_pageserver) { + intent.set_attached(&mut scheduler, Some(generation_pageserver)); + } else { + // If a node was removed before being completely drained, it is legal for it to leave behind a `generation_pageserver` referring + // to a non-existent node, because node deletion doesn't block on completing the reconciliations that will issue new generations + // on different pageservers. + tracing::warn!("Tenant shard {tenant_shard_id} references non-existent node {generation_pageserver} in database, will be rescheduled"); + } } let new_tenant = TenantShard::from_persistent(tsp, intent)?; @@ -1237,7 +1260,7 @@ impl Service { let _tenant_lock = trace_exclusive_lock( &self.tenant_op_locks, attach_req.tenant_shard_id.tenant_id, - TenantOperations::ShardSplit, + TenantOperations::AttachHook, ) .await; @@ -2356,18 +2379,18 @@ impl Service { tracing::info!("Doing time travel recovery for shard {tenant_shard_id}",); client - .tenant_time_travel_remote_storage( - tenant_shard_id, - ×tamp, - &done_if_after, - ) - .await - .map_err(|e| { - ApiError::InternalServerError(anyhow::anyhow!( - "Error doing time travel recovery for shard {tenant_shard_id} on node {}: {e}", - node - )) - })?; + .tenant_time_travel_remote_storage( + tenant_shard_id, + ×tamp, + &done_if_after, + ) + .await + .map_err(|e| { + ApiError::InternalServerError(anyhow::anyhow!( + "Error doing time travel recovery for shard {tenant_shard_id} on node {}: {e}", + node + )) + })?; } } Ok(()) @@ -2737,7 +2760,7 @@ impl Service { // Create timeline on remaining shards with number >0 if !targets.is_empty() { // If we had multiple shards, issue requests for the remainder now. - let jwt = self.config.jwt_token.clone(); + let jwt = &self.config.jwt_token; self.tenant_for_shards(targets, |tenant_shard_id: TenantShardId, node: Node| { let create_req = create_req.clone(); Box::pin(create_one(tenant_shard_id, node, jwt.clone(), create_req)) @@ -2748,6 +2771,114 @@ impl Service { Ok(timeline_info) } + pub(crate) async fn tenant_timeline_detach_ancestor( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result { + tracing::info!("Detaching timeline {tenant_id}/{timeline_id}",); + + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimelineDetachAncestor, + ) + .await; + + self.ensure_attached_wait(tenant_id).await?; + + let targets = { + let locked = self.inner.read().unwrap(); + let mut targets = Vec::new(); + + for (tenant_shard_id, shard) in + locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + { + let node_id = shard.intent.get_attached().ok_or_else(|| { + ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled")) + })?; + let node = locked + .nodes + .get(&node_id) + .expect("Pageservers may not be deleted while referenced"); + + targets.push((*tenant_shard_id, node.clone())); + } + targets + }; + + if targets.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant not found").into(), + )); + } + + async fn detach_one( + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + node: Node, + jwt: Option, + ) -> Result<(ShardNumber, models::detach_ancestor::AncestorDetached), ApiError> { + tracing::info!( + "Detaching timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}", + ); + + let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); + client + .timeline_detach_ancestor(tenant_shard_id, timeline_id) + .await + .map_err(|e| { + use mgmt_api::Error; + + match e { + // no ancestor (ever) + Error::ApiError(StatusCode::CONFLICT, msg) => ApiError::Conflict(format!( + "{node}: {}", + msg.strip_prefix("Conflict: ").unwrap_or(&msg) + )), + // too many ancestors + Error::ApiError(StatusCode::BAD_REQUEST, msg) => { + ApiError::BadRequest(anyhow::anyhow!("{node}: {msg}")) + } + // rest can be mapped + other => passthrough_api_error(&node, other), + } + }) + .map(|res| (tenant_shard_id.shard_number, res)) + } + + // no shard needs to go first/last; the operation should be idempotent + // TODO: it would be great to ensure that all shards return the same error + let mut results = self + .tenant_for_shards(targets, |tenant_shard_id, node| { + futures::FutureExt::boxed(detach_one( + tenant_shard_id, + timeline_id, + node, + self.config.jwt_token.clone(), + )) + }) + .await?; + + let any = results.pop().expect("we must have at least one response"); + + let mismatching = results + .iter() + .filter(|(_, res)| res != &any.1) + .collect::>(); + if !mismatching.is_empty() { + let matching = results.len() - mismatching.len(); + tracing::error!( + matching, + compared_against=?any, + ?mismatching, + "shards returned different results" + ); + } + + Ok(any.1) + } + /// Helper for concurrently calling a pageserver API on a number of shards, such as timeline creation. /// /// On success, the returned vector contains exactly the same number of elements as the input `locations`. @@ -2874,8 +3005,8 @@ impl Service { .await .map_err(|e| { ApiError::InternalServerError(anyhow::anyhow!( - "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}", - )) + "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}", + )) }) } @@ -3827,6 +3958,8 @@ impl Service { "failpoint".to_string() ))); + failpoint_support::sleep_millis_async!("shard-split-post-remote-sleep", &self.cancel); + tracing::info!( "Split {} into {}", parent_id, @@ -4210,8 +4343,6 @@ impl Service { /// This is for debug/support only: we simply drop all state for a tenant, without /// detaching or deleting it on pageservers. We do not try and re-schedule any /// tenants that were on this node. - /// - /// TODO: proper node deletion API that unhooks things more gracefully pub(crate) async fn node_drop(&self, node_id: NodeId) -> Result<(), ApiError> { self.persistence.delete_node(node_id).await?; @@ -4219,6 +4350,7 @@ impl Service { for shard in locked.tenants.values_mut() { shard.deref_node(node_id); + shard.observed.locations.remove(&node_id); } let mut nodes = (*locked.nodes).clone(); @@ -4230,6 +4362,94 @@ impl Service { Ok(()) } + /// If a node has any work on it, it will be rescheduled: this is "clean" in the sense + /// that we don't leave any bad state behind in the storage controller, but unclean + /// in the sense that we are not carefully draining the node. + pub(crate) async fn node_delete(&self, node_id: NodeId) -> Result<(), ApiError> { + let _node_lock = + trace_exclusive_lock(&self.node_op_locks, node_id, NodeOperations::Delete).await; + + // 1. Atomically update in-memory state: + // - set the scheduling state to Pause to make subsequent scheduling ops skip it + // - update shards' intents to exclude the node, and reschedule any shards whose intents we modified. + // - drop the node from the main nodes map, so that when running reconciles complete they do not + // re-insert references to this node into the ObservedState of shards + // - drop the node from the scheduler + { + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + + { + let mut nodes_mut = (*nodes).deref().clone(); + match nodes_mut.get_mut(&node_id) { + Some(node) => { + // We do not bother setting this in the database, because we're about to delete the row anyway, and + // if we crash it would not be desirable to leave the node paused after a restart. + node.set_scheduling(NodeSchedulingPolicy::Pause); + } + None => { + tracing::info!( + "Node not found: presuming this is a retry and returning success" + ); + return Ok(()); + } + } + + *nodes = Arc::new(nodes_mut); + } + + for (tenant_shard_id, shard) in tenants { + if shard.deref_node(node_id) { + // FIXME: we need to build a ScheduleContext that reflects this shard's peers, otherwise + // it won't properly do anti-affinity. + let mut schedule_context = ScheduleContext::default(); + + if let Err(e) = shard.schedule(scheduler, &mut schedule_context) { + // TODO: implement force flag to remove a node even if we can't reschedule + // a tenant + tracing::error!("Refusing to delete node, shard {tenant_shard_id} can't be rescheduled: {e}"); + return Err(e.into()); + } else { + tracing::info!( + "Rescheduled shard {tenant_shard_id} away from node during deletion" + ) + } + + self.maybe_reconcile_shard(shard, nodes); + } + + // Here we remove an existing observed location for the node we're removing, and it will + // not be re-added by a reconciler's completion because we filter out removed nodes in + // process_result. + // + // Note that we update the shard's observed state _after_ calling maybe_reconcile_shard: that + // means any reconciles we spawned will know about the node we're deleting, enabling them + // to do live migrations if it's still online. + shard.observed.locations.remove(&node_id); + } + + scheduler.node_remove(node_id); + + { + let mut nodes_mut = (**nodes).clone(); + nodes_mut.remove(&node_id); + *nodes = Arc::new(nodes_mut); + } + } + + // Note: some `generation_pageserver` columns on tenant shards in the database may still refer to + // the removed node, as this column means "The pageserver to which this generation was issued", and + // their generations won't get updated until the reconcilers moving them away from this node complete. + // That is safe because in Service::spawn we only use generation_pageserver if it refers to a node + // that exists. + + // 2. Actually delete the node from the database and from in-memory state + tracing::info!("Deleting node from database"); + self.persistence.delete_node(node_id).await?; + + Ok(()) + } + pub(crate) async fn node_list(&self) -> Result, ApiError> { let nodes = { self.inner @@ -4963,7 +5183,7 @@ impl Service { /// we did the split, but are probably better placed elsewhere. /// - Creating new secondary locations if it improves the spreading of a sharded tenant /// * e.g. after a shard split, some locations will be on the same node (where the split - /// happened), and will probably be better placed elsewhere. + /// happened), and will probably be better placed elsewhere. /// /// To put it more briefly: whereas the scheduler respects soft constraints in a ScheduleContext at /// the time of scheduling, this function looks for cases where a better-scoring location is available @@ -5526,14 +5746,14 @@ impl Service { /// Create a node fill plan (pick secondaries to promote) that meets the following requirements: /// 1. The node should be filled until it reaches the expected cluster average of - /// attached shards. If there are not enough secondaries on the node, the plan stops early. + /// attached shards. If there are not enough secondaries on the node, the plan stops early. /// 2. Select tenant shards to promote such that the number of attached shards is balanced - /// throughout the cluster. We achieve this by picking tenant shards from each node, - /// starting from the ones with the largest number of attached shards, until the node - /// reaches the expected cluster average. + /// throughout the cluster. We achieve this by picking tenant shards from each node, + /// starting from the ones with the largest number of attached shards, until the node + /// reaches the expected cluster average. /// 3. Avoid promoting more shards of the same tenant than required. The upper bound - /// for the number of tenants from the same shard promoted to the node being filled is: - /// shard count for the tenant divided by the number of nodes in the cluster. + /// for the number of tenants from the same shard promoted to the node being filled is: + /// shard count for the tenant divided by the number of nodes in the cluster. fn fill_node_plan(&self, node_id: NodeId) -> Vec { let mut locked = self.inner.write().unwrap(); let fill_requirement = locked.scheduler.compute_fill_requirement(node_id); diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 3fcf31ac10..ee2ba6c4ee 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -124,6 +124,7 @@ pub(crate) struct TenantShard { /// - ReconcileWaiters need to Arc-clone the overall object to read it later /// - ReconcileWaitError needs to use an `Arc` because we can construct /// many waiters for one shard, and the underlying error types are not Clone. + /// /// TODO: generalize to an array of recent events /// TOOD: use a ArcSwap instead of mutex for faster reads? #[serde(serialize_with = "read_last_error")] @@ -383,9 +384,9 @@ impl ReconcilerWaiter { } pub(crate) fn get_status(&self) -> ReconcilerStatus { - if self.seq_wait.would_wait_for(self.seq).is_err() { + if self.seq_wait.would_wait_for(self.seq).is_ok() { ReconcilerStatus::Done - } else if self.error_seq_wait.would_wait_for(self.seq).is_err() { + } else if self.error_seq_wait.would_wait_for(self.seq).is_ok() { ReconcilerStatus::Failed } else { ReconcilerStatus::InProgress @@ -1229,18 +1230,27 @@ impl TenantShard { } } - // If we had any state at all referring to this node ID, drop it. Does not - // attempt to reschedule. - pub(crate) fn deref_node(&mut self, node_id: NodeId) { + /// If we had any state at all referring to this node ID, drop it. Does not + /// attempt to reschedule. + /// + /// Returns true if we modified the node's intent state. + pub(crate) fn deref_node(&mut self, node_id: NodeId) -> bool { + let mut intent_modified = false; + + // Drop if this node was our attached intent if self.intent.attached == Some(node_id) { self.intent.attached = None; + intent_modified = true; } + // Drop from the list of secondaries, and check if we modified it + let had_secondaries = self.intent.secondary.len(); self.intent.secondary.retain(|n| n != &node_id); - - self.observed.locations.remove(&node_id); + intent_modified |= self.intent.secondary.len() != had_secondaries; debug_assert!(!self.intent.all_pageservers().contains(&node_id)); + + intent_modified } pub(crate) fn set_scheduling_policy(&mut self, p: ShardSchedulingPolicy) { diff --git a/storage_scrubber/Cargo.toml b/storage_scrubber/Cargo.toml index 050be66483..7d5b7d10b9 100644 --- a/storage_scrubber/Cargo.toml +++ b/storage_scrubber/Cargo.toml @@ -34,6 +34,7 @@ camino.workspace = true rustls.workspace = true rustls-native-certs.workspace = true once_cell.workspace = true +storage_controller_client.workspace = true tokio = { workspace = true, features = ["macros", "rt-multi-thread"] } chrono = { workspace = true, default-features = false, features = ["clock", "serde"] } @@ -48,6 +49,5 @@ tracing.workspace = true tracing-subscriber.workspace = true clap.workspace = true tracing-appender = "0.2" -histogram = "0.7" futures.workspace = true diff --git a/storage_scrubber/README.md b/storage_scrubber/README.md index 0930f343ec..9fbd92feef 100644 --- a/storage_scrubber/README.md +++ b/storage_scrubber/README.md @@ -45,7 +45,11 @@ processing by the `purge-garbage` subcommand. Example: -`env AWS_PROFILE=dev REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- find-garbage --node-kind=pageserver --depth=tenant --output-path=eu-west-1-garbage.json` +`env AWS_PROFILE=dev REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=[client_key] CLOUD_ADMIN_API_URL=[url] cargo run --release -- find-garbage --node-kind=pageserver --depth=tenant --output-path=eu-west-1-garbage.json` + +Note that `CLOUD_ADMIN_API_TOKEN` can be obtained from https://console-stage.neon.build/app/settings/api-keys (for staging) or https://console.neon.tech/app/settings/api-keys for production. This is not the control plane admin JWT key. The env var name is confusing. Though anyone can generate that API key, you still need admin permission in order to access all projects in the region. + +And note that `CLOUD_ADMIN_API_URL` should include the region in the admin URL due to the control plane / console split. For example, `https://console-stage.neon.build/regions/aws-us-east-2/api/v1/admin` for the staging us-east-2 region. #### `purge-garbage` @@ -61,7 +65,7 @@ to pass them on the command line Example: -`env AWS_PROFILE=dev cargo run --release -- purge-garbage --node-kind=pageserver --depth=tenant --input-path=eu-west-1-garbage.json` +`env AWS_PROFILE=dev cargo run --release -- purge-garbage --input-path=eu-west-1-garbage.json` Add the `--delete` argument before `purge-garbage` to enable deletion. This is intentionally not provided inline in the example above to avoid accidents. Without the `--delete` flag diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index f687b24320..421a848f67 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -2,6 +2,7 @@ use std::collections::{HashMap, HashSet}; use anyhow::Context; use aws_sdk_s3::Client; +use pageserver::tenant::layer_map::LayerMap; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver_api::shard::ShardIndex; use tracing::{error, info, warn}; @@ -12,7 +13,7 @@ use crate::cloud_admin_api::BranchData; use crate::metadata_stream::stream_listing; use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId}; use futures_util::StreamExt; -use pageserver::tenant::remote_timeline_client::parse_remote_index_path; +use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path}; use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::IndexPart; use remote_storage::RemotePath; @@ -41,7 +42,9 @@ impl TimelineAnalysis { } } -pub(crate) fn branch_cleanup_and_check_errors( +pub(crate) async fn branch_cleanup_and_check_errors( + s3_client: &Client, + target: &RootTarget, id: &TenantShardTimelineId, tenant_objects: &mut TenantObjectListing, s3_active_branch: Option<&BranchData>, @@ -85,15 +88,17 @@ pub(crate) fn branch_cleanup_and_check_errors( } if &index_part.version() != IndexPart::KNOWN_VERSIONS.last().unwrap() { - result.warnings.push(format!( + info!( "index_part.json version is not latest: {}", index_part.version() - )) + ); } if index_part.metadata.disk_consistent_lsn() != index_part.duplicated_disk_consistent_lsn() { + // Tech debt: let's get rid of one of these, they are redundant + // https://github.com/neondatabase/neon/issues/8343 result.errors.push(format!( "Mismatching disk_consistent_lsn in TimelineMetadata ({}) and in the index_part ({})", index_part.metadata.disk_consistent_lsn(), @@ -102,8 +107,16 @@ pub(crate) fn branch_cleanup_and_check_errors( } if index_part.layer_metadata.is_empty() { - // not an error, can happen for branches with zero writes, but notice that - info!("index_part.json has no layers"); + if index_part.metadata.ancestor_timeline().is_none() { + // The initial timeline with no ancestor should ALWAYS have layers. + result.errors.push( + "index_part.json has no layers (ancestor_timeline=None)" + .to_string(), + ); + } else { + // Not an error, can happen for branches with zero writes, but notice that + info!("index_part.json has no layers (ancestor_timeline exists)"); + } } for (layer, metadata) in index_part.layer_metadata { @@ -114,16 +127,41 @@ pub(crate) fn branch_cleanup_and_check_errors( } if !tenant_objects.check_ref(id.timeline_id, &layer, &metadata) { - // FIXME: this will emit false positives if an index was - // uploaded concurrently with our scan. To make this check - // correct, we need to try sending a HEAD request for the - // layer we think is missing. - result.errors.push(format!( - "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage", - layer, - metadata.generation.get_suffix(), - metadata.shard - )) + let path = remote_layer_path( + &id.tenant_shard_id.tenant_id, + &id.timeline_id, + metadata.shard, + &layer, + metadata.generation, + ); + + // HEAD request used here to address a race condition when an index was uploaded concurrently + // with our scan. We check if the object is uploaded to S3 after taking the listing snapshot. + let response = s3_client + .head_object() + .bucket(target.bucket_name()) + .key(path.get_path().as_str()) + .send() + .await; + + if response.is_err() { + // Object is not present. + let is_l0 = LayerMap::is_l0(layer.key_range()); + + let msg = format!( + "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {})", + layer, + metadata.generation.get_suffix(), + metadata.shard, + is_l0, + ); + + if is_l0 { + result.warnings.push(msg); + } else { + result.errors.push(msg); + } + } } } } @@ -303,6 +341,9 @@ pub(crate) async fn list_timeline_blobs( tracing::debug!("initdb archive {key}"); initdb_archive = true; } + Some("initdb-preserved.tar.zst") => { + tracing::info!("initdb archive preserved {key}"); + } Some(maybe_layer_name) => match parse_layer_object_name(maybe_layer_name) { Ok((new_layer, gen)) => { tracing::debug!("Parsed layer key: {} {:?}", new_layer, gen); diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs index 0450851988..c7e21d7e26 100644 --- a/storage_scrubber/src/garbage.rs +++ b/storage_scrubber/src/garbage.rs @@ -8,21 +8,19 @@ use std::{ }; use anyhow::Context; -use aws_sdk_s3::{ - types::{Delete, ObjectIdentifier}, - Client, -}; use futures_util::TryStreamExt; use pageserver_api::shard::TenantShardId; +use remote_storage::{GenericRemoteStorage, ListingMode, RemotePath}; use serde::{Deserialize, Serialize}; use tokio_stream::StreamExt; +use tokio_util::sync::CancellationToken; use utils::id::TenantId; use crate::{ cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData}, - init_remote, - metadata_stream::{stream_listing, stream_tenant_timelines, stream_tenants}, - BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId, TraversingDepth, + init_remote, init_remote_generic, + metadata_stream::{stream_tenant_timelines, stream_tenants}, + BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth, }; #[derive(Serialize, Deserialize, Debug)] @@ -324,41 +322,45 @@ impl std::fmt::Display for PurgeMode { } pub async fn get_tenant_objects( - s3_client: &Arc, - target: RootTarget, + s3_client: &GenericRemoteStorage, tenant_shard_id: TenantShardId, -) -> anyhow::Result> { +) -> anyhow::Result> { tracing::debug!("Listing objects in tenant {tenant_shard_id}"); + let tenant_root = super::remote_tenant_path(&tenant_shard_id); + // TODO: apply extra validation based on object modification time. Don't purge // tenants where any timeline's index_part.json has been touched recently. - let mut tenant_root = target.tenant_root(&tenant_shard_id); - - // Remove delimiter, so that object listing lists all keys in the prefix and not just - // common prefixes. - tenant_root.delimiter = String::new(); - - let key_stream = stream_listing(s3_client, &tenant_root); - key_stream.try_collect().await + let list = s3_client + .list( + Some(&tenant_root), + ListingMode::NoDelimiter, + None, + &CancellationToken::new(), + ) + .await?; + Ok(list.keys) } pub async fn get_timeline_objects( - s3_client: &Arc, - target: RootTarget, + s3_client: &GenericRemoteStorage, ttid: TenantShardTimelineId, -) -> anyhow::Result> { +) -> anyhow::Result> { tracing::debug!("Listing objects in timeline {ttid}"); - let mut timeline_root = target.timeline_root(&ttid); + let timeline_root = super::remote_timeline_path_id(&ttid); // TODO: apply extra validation based on object modification time. Don't purge // timelines whose index_part.json has been touched recently. - // Remove delimiter, so that object listing lists all keys in the prefix and not just - // common prefixes. - timeline_root.delimiter = String::new(); - let key_stream = stream_listing(s3_client, &timeline_root); - - key_stream.try_collect().await + let list = s3_client + .list( + Some(&timeline_root), + ListingMode::NoDelimiter, + None, + &CancellationToken::new(), + ) + .await?; + Ok(list.keys) } const MAX_KEYS_PER_DELETE: usize = 1000; @@ -369,16 +371,17 @@ const MAX_KEYS_PER_DELETE: usize = 1000; /// MAX_KEYS_PER_DELETE keys are left. /// `num_deleted` returns number of deleted keys. async fn do_delete( - s3_client: &Arc, - bucket_name: &str, - keys: &mut Vec, + remote_client: &GenericRemoteStorage, + keys: &mut Vec, dry_run: bool, drain: bool, progress_tracker: &mut DeletionProgressTracker, ) -> anyhow::Result<()> { + let cancel = CancellationToken::new(); while (!keys.is_empty() && drain) || (keys.len() >= MAX_KEYS_PER_DELETE) { let request_keys = keys.split_off(keys.len() - (std::cmp::min(MAX_KEYS_PER_DELETE, keys.len()))); + let num_deleted = request_keys.len(); if dry_run { tracing::info!("Dry-run deletion of objects: "); @@ -386,14 +389,10 @@ async fn do_delete( tracing::info!(" {k:?}"); } } else { - let delete_request = s3_client - .delete_objects() - .bucket(bucket_name) - .delete(Delete::builder().set_objects(Some(request_keys)).build()?); - delete_request - .send() + remote_client + .delete_objects(&request_keys, &cancel) .await - .context("DeleteObjects request")?; + .context("deletetion request")?; progress_tracker.register(num_deleted); } } @@ -431,8 +430,13 @@ pub async fn purge_garbage( input_path ); - let (s3_client, target) = - init_remote(garbage_list.bucket_config.clone(), garbage_list.node_kind).await?; + let remote_client = + init_remote_generic(garbage_list.bucket_config.clone(), garbage_list.node_kind).await?; + + assert_eq!( + &garbage_list.bucket_config.bucket, + remote_client.bucket_name().unwrap() + ); // Sanity checks on the incoming list if garbage_list.active_tenant_count == 0 { @@ -464,16 +468,13 @@ pub async fn purge_garbage( let items = tokio_stream::iter(filtered_items.map(Ok)); let get_objects_results = items.map_ok(|i| { - let s3_client = s3_client.clone(); - let target = target.clone(); + let remote_client = remote_client.clone(); async move { match i.entity { GarbageEntity::Tenant(tenant_id) => { - get_tenant_objects(&s3_client, target, tenant_id).await - } - GarbageEntity::Timeline(ttid) => { - get_timeline_objects(&s3_client, target, ttid).await + get_tenant_objects(&remote_client, tenant_id).await } + GarbageEntity::Timeline(ttid) => get_timeline_objects(&remote_client, ttid).await, } } }); @@ -487,8 +488,7 @@ pub async fn purge_garbage( objects_to_delete.append(&mut object_list); if objects_to_delete.len() >= MAX_KEYS_PER_DELETE { do_delete( - &s3_client, - &garbage_list.bucket_config.bucket, + &remote_client, &mut objects_to_delete, dry_run, false, @@ -499,8 +499,7 @@ pub async fn purge_garbage( } do_delete( - &s3_client, - &garbage_list.bucket_config.bucket, + &remote_client, &mut objects_to_delete, dry_run, true, diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index 9102ad9906..5c64e7e459 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -22,8 +22,13 @@ use aws_sdk_s3::Client; use camino::{Utf8Path, Utf8PathBuf}; use clap::ValueEnum; +use pageserver::tenant::remote_timeline_client::{remote_tenant_path, remote_timeline_path}; use pageserver::tenant::TENANTS_SEGMENT_NAME; use pageserver_api::shard::TenantShardId; +use remote_storage::{ + GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config, + DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, +}; use reqwest::Url; use serde::{Deserialize, Serialize}; use tokio::io::AsyncReadExt; @@ -31,7 +36,7 @@ use tracing::error; use tracing_appender::non_blocking::WorkerGuard; use tracing_subscriber::{fmt, prelude::*, EnvFilter}; use utils::fs_ext; -use utils::id::{TenantId, TimelineId}; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; const MAX_RETRIES: usize = 20; const CLOUD_ADMIN_API_TOKEN_ENV_VAR: &str = "CLOUD_ADMIN_API_TOKEN"; @@ -54,7 +59,7 @@ pub struct S3Target { /// in the pageserver, as all timeline objects existing in the scope of a particular /// tenant: the scrubber is different in that it handles collections of data referring to many /// TenantShardTimelineIds in on place. -#[derive(Serialize, Deserialize, Debug, Clone, Copy, Hash, PartialEq, Eq)] +#[derive(Serialize, Deserialize, Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)] pub struct TenantShardTimelineId { tenant_shard_id: TenantShardId, timeline_id: TimelineId, @@ -67,6 +72,10 @@ impl TenantShardTimelineId { timeline_id, } } + + fn as_tenant_timeline_id(&self) -> TenantTimelineId { + TenantTimelineId::new(self.tenant_shard_id.tenant_id, self.timeline_id) + } } impl Display for TenantShardTimelineId { @@ -179,6 +188,22 @@ impl RootTarget { .with_sub_segment(&id.timeline_id.to_string()) } + /// Given RemotePath "tenants/foo/timelines/bar/layerxyz", prefix it to a literal + /// key in the S3 bucket. + pub fn absolute_key(&self, key: &RemotePath) -> String { + let root = match self { + Self::Pageserver(root) => root, + Self::Safekeeper(root) => root, + }; + + let prefix = &root.prefix_in_bucket; + if prefix.ends_with('/') { + format!("{prefix}{key}") + } else { + format!("{prefix}/{key}") + } + } + pub fn bucket_name(&self) -> &str { match self { Self::Pageserver(root) => &root.bucket_name, @@ -194,6 +219,10 @@ impl RootTarget { } } +pub fn remote_timeline_path_id(id: &TenantShardTimelineId) -> RemotePath { + remote_timeline_path(&id.tenant_shard_id, &id.timeline_id) +} + #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct BucketConfig { @@ -216,6 +245,14 @@ impl BucketConfig { } } +pub struct ControllerClientConfig { + /// URL to storage controller. e.g. http://127.0.0.1:1234 when using `neon_local` + pub controller_api: Url, + + /// JWT token for authenticating with storage controller. Requires scope 'scrubber' or 'admin'. + pub controller_jwt: String, +} + pub struct ConsoleConfig { pub token: String, pub base_url: Url, @@ -267,7 +304,7 @@ pub fn init_logging(file_name: &str) -> Option { } } -pub async fn init_s3_client(bucket_region: Region) -> Client { +async fn init_s3_client(bucket_region: Region) -> Client { let config = aws_config::defaults(aws_config::BehaviorVersion::v2024_03_28()) .region(bucket_region) .load() @@ -275,6 +312,13 @@ pub async fn init_s3_client(bucket_region: Region) -> Client { Client::new(&config) } +fn default_prefix_in_bucket(node_kind: NodeKind) -> &'static str { + match node_kind { + NodeKind::Pageserver => "pageserver/v1/", + NodeKind::Safekeeper => "wal/", + } +} + async fn init_remote( bucket_config: BucketConfig, node_kind: NodeKind, @@ -282,18 +326,17 @@ async fn init_remote( let bucket_region = Region::new(bucket_config.region); let delimiter = "/".to_string(); let s3_client = Arc::new(init_s3_client(bucket_region).await); + let default_prefix = default_prefix_in_bucket(node_kind).to_string(); let s3_root = match node_kind { NodeKind::Pageserver => RootTarget::Pageserver(S3Target { bucket_name: bucket_config.bucket, - prefix_in_bucket: bucket_config - .prefix_in_bucket - .unwrap_or("pageserver/v1".to_string()), + prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or(default_prefix), delimiter, }), NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target { bucket_name: bucket_config.bucket, - prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or("wal/".to_string()), + prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or(default_prefix), delimiter, }), }; @@ -301,6 +344,31 @@ async fn init_remote( Ok((s3_client, s3_root)) } +async fn init_remote_generic( + bucket_config: BucketConfig, + node_kind: NodeKind, +) -> anyhow::Result { + let endpoint = env::var("AWS_ENDPOINT_URL").ok(); + let default_prefix = default_prefix_in_bucket(node_kind).to_string(); + let prefix_in_bucket = Some(bucket_config.prefix_in_bucket.unwrap_or(default_prefix)); + let storage = S3Config { + bucket_name: bucket_config.bucket, + bucket_region: bucket_config.region, + prefix_in_bucket, + endpoint, + concurrency_limit: DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT + .try_into() + .unwrap(), + max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, + upload_storage_class: None, + }; + let storage_config = RemoteStorageConfig { + storage: RemoteStorageKind::AwsS3(storage), + timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, + }; + GenericRemoteStorage::from_config(&storage_config).await +} + async fn list_objects_with_retries( s3_client: &Client, s3_target: &S3Target, diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index d816121192..b3ed6f6451 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -1,11 +1,12 @@ -use anyhow::bail; +use anyhow::{anyhow, bail}; use camino::Utf8PathBuf; use pageserver_api::shard::TenantShardId; -use storage_scrubber::find_large_objects; +use reqwest::Url; use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; use storage_scrubber::pageserver_physical_gc::GcMode; use storage_scrubber::scan_pageserver_metadata::scan_metadata; use storage_scrubber::tenant_snapshot::SnapshotDownloader; +use storage_scrubber::{find_large_objects, ControllerClientConfig}; use storage_scrubber::{ init_logging, pageserver_physical_gc::pageserver_physical_gc, scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig, NodeKind, @@ -24,6 +25,14 @@ struct Cli { #[arg(short, long, default_value_t = false)] delete: bool, + + #[arg(long)] + /// URL to storage controller. e.g. http://127.0.0.1:1234 when using `neon_local` + controller_api: Option, + + #[arg(long)] + /// JWT token for authenticating with storage controller. Requires scope 'scrubber' or 'admin'. + controller_jwt: Option, } #[derive(Subcommand, Debug)] @@ -204,8 +213,37 @@ async fn main() -> anyhow::Result<()> { min_age, mode, } => { - let summary = - pageserver_physical_gc(bucket_config, tenant_ids, min_age.into(), mode).await?; + let controller_client_conf = cli.controller_api.map(|controller_api| { + ControllerClientConfig { + controller_api, + // Default to no key: this is a convenience when working in a development environment + controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()), + } + }); + + match (&controller_client_conf, mode) { + (Some(_), _) => { + // Any mode may run when controller API is set + } + (None, GcMode::Full) => { + // The part of physical GC where we erase ancestor layers cannot be done safely without + // confirming the most recent complete shard split with the controller. Refuse to run, rather + // than doing it unsafely. + return Err(anyhow!("Full physical GC requires `--controller-api` and `--controller-jwt` to run")); + } + (None, GcMode::DryRun | GcMode::IndicesOnly) => { + // These GcModes do not require the controller to run. + } + } + + let summary = pageserver_physical_gc( + bucket_config, + controller_client_conf, + tenant_ids, + min_age.into(), + mode, + ) + .await?; println!("{}", serde_json::to_string(&summary).unwrap()); Ok(()) } diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index fb8fbc1635..e977fd49f7 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -1,22 +1,50 @@ -use std::time::{Duration, UNIX_EPOCH}; +use std::collections::{BTreeMap, HashMap}; +use std::sync::Arc; +use std::time::{Duration, SystemTime}; use crate::checks::{list_timeline_blobs, BlobDataParseResult}; use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; -use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; +use crate::{ + init_remote, BucketConfig, ControllerClientConfig, NodeKind, RootTarget, TenantShardTimelineId, +}; use aws_sdk_s3::Client; use futures_util::{StreamExt, TryStreamExt}; -use pageserver::tenant::remote_timeline_client::parse_remote_index_path; +use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; +use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path}; +use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::IndexPart; -use pageserver_api::shard::TenantShardId; +use pageserver_api::controller_api::TenantDescribeResponse; +use pageserver_api::shard::{ShardIndex, TenantShardId}; use remote_storage::RemotePath; +use reqwest::Method; use serde::Serialize; +use storage_controller_client::control_api; use tracing::{info_span, Instrument}; use utils::generation::Generation; +use utils::id::{TenantId, TenantTimelineId}; #[derive(Serialize, Default)] pub struct GcSummary { indices_deleted: usize, remote_storage_errors: usize, + controller_api_errors: usize, + ancestor_layers_deleted: usize, +} + +impl GcSummary { + fn merge(&mut self, other: Self) { + let Self { + indices_deleted, + remote_storage_errors, + ancestor_layers_deleted, + controller_api_errors, + } = other; + + self.indices_deleted += indices_deleted; + self.remote_storage_errors += remote_storage_errors; + self.ancestor_layers_deleted += ancestor_layers_deleted; + self.controller_api_errors += controller_api_errors; + } } #[derive(clap::ValueEnum, Debug, Clone, Copy)] @@ -26,9 +54,9 @@ pub enum GcMode { // Enable only removing old-generation indices IndicesOnly, + // Enable all forms of GC - // TODO: this will be used when shard split ancestor layer deletion is added - // All, + Full, } impl std::fmt::Display for GcMode { @@ -36,10 +64,232 @@ impl std::fmt::Display for GcMode { match self { GcMode::DryRun => write!(f, "dry-run"), GcMode::IndicesOnly => write!(f, "indices-only"), + GcMode::Full => write!(f, "full"), } } } +mod refs { + use super::*; + // Map of cross-shard layer references, giving a refcount for each layer in each shard that is referenced by some other + // shard in the same tenant. This is sparse! The vast majority of timelines will have no cross-shard refs, and those that + // do have cross shard refs should eventually drop most of them via compaction. + // + // In our inner map type, the TTID in the key is shard-agnostic, and the ShardIndex in the value refers to the _ancestor + // which is is referenced_. + #[derive(Default)] + pub(super) struct AncestorRefs( + BTreeMap>, + ); + + impl AncestorRefs { + /// Insert references for layers discovered in a particular shard-timeline that refer to an ancestral shard-timeline. + pub(super) fn update( + &mut self, + ttid: TenantShardTimelineId, + layers: Vec<(LayerName, LayerFileMetadata)>, + ) { + let ttid_refs = self.0.entry(ttid.as_tenant_timeline_id()).or_default(); + for (layer_name, layer_metadata) in layers { + // Increment refcount of this layer in the ancestor shard + *(ttid_refs + .entry((layer_metadata.shard, layer_name)) + .or_default()) += 1; + } + } + + /// For a particular TTID, return the map of all ancestor layers referenced by a descendent to their refcount + /// + /// The `ShardIndex` in the result's key is the index of the _ancestor_, not the descendent. + pub(super) fn get_ttid_refcounts( + &self, + ttid: &TenantTimelineId, + ) -> Option<&HashMap<(ShardIndex, LayerName), usize>> { + self.0.get(ttid) + } + } +} + +use refs::AncestorRefs; + +// As we see shards for a tenant, acccumulate knowledge needed for cross-shard GC: +// - Are there any ancestor shards? +// - Are there any refs to ancestor shards' layers? +#[derive(Default)] +struct TenantRefAccumulator { + shards_seen: HashMap>, + + // For each shard that has refs to an ancestor's layers, the set of ancestor layers referred to + ancestor_ref_shards: AncestorRefs, +} + +impl TenantRefAccumulator { + fn update(&mut self, ttid: TenantShardTimelineId, index_part: &IndexPart) { + let this_shard_idx = ttid.tenant_shard_id.to_index(); + (*self + .shards_seen + .entry(ttid.tenant_shard_id.tenant_id) + .or_default()) + .push(this_shard_idx); + + let mut ancestor_refs = Vec::new(); + for (layer_name, layer_metadata) in &index_part.layer_metadata { + if layer_metadata.shard != this_shard_idx { + // This is a reference from this shard to a layer in an ancestor shard: we must track this + // as a marker to not GC this layer from the parent. + ancestor_refs.push((layer_name.clone(), layer_metadata.clone())); + } + } + + if !ancestor_refs.is_empty() { + tracing::info!(%ttid, "Found {} ancestor refs", ancestor_refs.len()); + self.ancestor_ref_shards.update(ttid, ancestor_refs); + } + } + + /// Consume Self and return a vector of ancestor tenant shards that should be GC'd, and map of referenced ancestor layers to preserve + async fn into_gc_ancestors( + self, + controller_client: &control_api::Client, + summary: &mut GcSummary, + ) -> (Vec, AncestorRefs) { + let mut ancestors_to_gc = Vec::new(); + for (tenant_id, mut shard_indices) in self.shards_seen { + // Find the highest shard count + let latest_count = shard_indices + .iter() + .map(|i| i.shard_count) + .max() + .expect("Always at least one shard"); + + let (mut latest_shards, ancestor_shards) = { + let at = + itertools::partition(&mut shard_indices, |i| i.shard_count == latest_count); + (shard_indices[0..at].to_owned(), &shard_indices[at..]) + }; + // Sort shards, as we will later compare them with a sorted list from the controller + latest_shards.sort(); + + // Check that we have a complete view of the latest shard count: this should always be the case unless we happened + // to scan the S3 bucket halfway through a shard split. + if latest_shards.len() != latest_count.count() as usize { + // This should be extremely rare, so we warn on it. + tracing::warn!(%tenant_id, "Missed some shards at count {:?}", latest_count); + continue; + } + + // Check if we have any non-latest-count shards + if ancestor_shards.is_empty() { + tracing::debug!(%tenant_id, "No ancestor shards to clean up"); + continue; + } + + // Based on S3 view, this tenant looks like it might have some ancestor shard work to do. We + // must only do this work if the tenant is not currently being split: otherwise, it is not safe + // to GC ancestors, because if the split fails then the controller will try to attach ancestor + // shards again. + match controller_client + .dispatch::<(), TenantDescribeResponse>( + Method::GET, + format!("control/v1/tenant/{tenant_id}"), + None, + ) + .await + { + Err(e) => { + // We were not able to learn the latest shard split state from the controller, so we will not + // do ancestor GC on this tenant. + tracing::warn!(%tenant_id, "Failed to query storage controller, will not do ancestor GC: {e}"); + summary.controller_api_errors += 1; + continue; + } + Ok(desc) => { + // We expect to see that the latest shard count matches the one we saw in S3, and that none + // of the shards indicate splitting in progress. + + let controller_indices: Vec = desc + .shards + .iter() + .map(|s| s.tenant_shard_id.to_index()) + .collect(); + if controller_indices != latest_shards { + tracing::info!(%tenant_id, "Latest shards seen in S3 ({latest_shards:?}) don't match controller state ({controller_indices:?})"); + continue; + } + + if desc.shards.iter().any(|s| s.is_splitting) { + tracing::info!(%tenant_id, "One or more shards is currently splitting"); + continue; + } + + // This shouldn't be too noisy, because we only log this for tenants that have some ancestral refs. + tracing::info!(%tenant_id, "Validated state with controller: {desc:?}"); + } + } + + // GC ancestor shards + for ancestor_shard in ancestor_shards.iter().map(|idx| TenantShardId { + tenant_id, + shard_count: idx.shard_count, + shard_number: idx.shard_number, + }) { + ancestors_to_gc.push(ancestor_shard); + } + } + + (ancestors_to_gc, self.ancestor_ref_shards) + } +} + +async fn is_old_enough( + s3_client: &Client, + bucket_config: &BucketConfig, + min_age: &Duration, + key: &str, + summary: &mut GcSummary, +) -> bool { + // Validation: we will only GC indices & layers after a time threshold (e.g. one week) so that during an incident + // it is easier to read old data for analysis, and easier to roll back shard splits without having to un-delete any objects. + let age: Duration = match s3_client + .head_object() + .bucket(&bucket_config.bucket) + .key(key) + .send() + .await + { + Ok(response) => match response.last_modified { + None => { + tracing::warn!("Missing last_modified"); + summary.remote_storage_errors += 1; + return false; + } + Some(last_modified) => match SystemTime::try_from(last_modified).map(|t| t.elapsed()) { + Ok(Ok(e)) => e, + Err(_) | Ok(Err(_)) => { + tracing::warn!("Bad last_modified time: {last_modified:?}"); + return false; + } + }, + }, + Err(e) => { + tracing::warn!("Failed to HEAD {key}: {e}"); + summary.remote_storage_errors += 1; + return false; + } + }; + let old_enough = &age > min_age; + + if !old_enough { + tracing::info!( + "Skipping young object {} < {}", + humantime::format_duration(age), + humantime::format_duration(*min_age) + ); + } + + old_enough +} + async fn maybe_delete_index( s3_client: &Client, bucket_config: &BucketConfig, @@ -79,45 +329,7 @@ async fn maybe_delete_index( return; } - // Validation: we will only delete indices after one week, so that during incidents we will have - // easy access to recent indices. - let age: Duration = match s3_client - .head_object() - .bucket(&bucket_config.bucket) - .key(key) - .send() - .await - { - Ok(response) => match response.last_modified { - None => { - tracing::warn!("Missing last_modified"); - summary.remote_storage_errors += 1; - return; - } - Some(last_modified) => { - let last_modified = - UNIX_EPOCH + Duration::from_secs_f64(last_modified.as_secs_f64()); - match last_modified.elapsed() { - Ok(e) => e, - Err(_) => { - tracing::warn!("Bad last_modified time: {last_modified:?}"); - return; - } - } - } - }, - Err(e) => { - tracing::warn!("Failed to HEAD {key}: {e}"); - summary.remote_storage_errors += 1; - return; - } - }; - if &age < min_age { - tracing::info!( - "Skipping young object {} < {}", - age.as_secs_f64(), - min_age.as_secs_f64() - ); + if !is_old_enough(s3_client, bucket_config, min_age, key, summary).await { return; } @@ -145,6 +357,108 @@ async fn maybe_delete_index( } } +#[allow(clippy::too_many_arguments)] +async fn gc_ancestor( + s3_client: &Client, + bucket_config: &BucketConfig, + root_target: &RootTarget, + min_age: &Duration, + ancestor: TenantShardId, + refs: &AncestorRefs, + mode: GcMode, + summary: &mut GcSummary, +) -> anyhow::Result<()> { + // Scan timelines in the ancestor + let timelines = stream_tenant_timelines(s3_client, root_target, ancestor).await?; + let mut timelines = std::pin::pin!(timelines); + + // Build a list of keys to retain + + while let Some(ttid) = timelines.next().await { + let ttid = ttid?; + + let data = list_timeline_blobs(s3_client, ttid, root_target).await?; + + let s3_layers = match data.blob_data { + BlobDataParseResult::Parsed { + index_part: _, + index_part_generation: _, + s3_layers, + } => s3_layers, + BlobDataParseResult::Relic => { + // Post-deletion tenant location: don't try and GC it. + continue; + } + BlobDataParseResult::Incorrect(reasons) => { + // Our primary purpose isn't to report on bad data, but log this rather than skipping silently + tracing::warn!( + "Skipping ancestor GC for timeline {ttid}, bad metadata: {reasons:?}" + ); + continue; + } + }; + + let ttid_refs = refs.get_ttid_refcounts(&ttid.as_tenant_timeline_id()); + let ancestor_shard_index = ttid.tenant_shard_id.to_index(); + + for (layer_name, layer_gen) in s3_layers { + let ref_count = ttid_refs + .and_then(|m| m.get(&(ancestor_shard_index, layer_name.clone()))) + .copied() + .unwrap_or(0); + + if ref_count > 0 { + tracing::debug!(%ttid, "Ancestor layer {layer_name} has {ref_count} refs"); + continue; + } + + tracing::info!(%ttid, "Ancestor layer {layer_name} is not referenced"); + + // Build the key for the layer we are considering deleting + let key = root_target.absolute_key(&remote_layer_path( + &ttid.tenant_shard_id.tenant_id, + &ttid.timeline_id, + ancestor_shard_index, + &layer_name, + layer_gen, + )); + + // We apply a time threshold to GCing objects that are un-referenced: this preserves our ability + // to roll back a shard split if we have to, by avoiding deleting ancestor layers right away + if !is_old_enough(s3_client, bucket_config, min_age, &key, summary).await { + continue; + } + + if !matches!(mode, GcMode::Full) { + tracing::info!("Dry run: would delete key {key}"); + continue; + } + + // All validations passed: erase the object + match s3_client + .delete_object() + .bucket(&bucket_config.bucket) + .key(&key) + .send() + .await + { + Ok(_) => { + tracing::info!("Successfully deleted unreferenced ancestor layer {key}"); + summary.ancestor_layers_deleted += 1; + } + Err(e) => { + tracing::warn!("Failed to delete layer {key}: {e}"); + summary.remote_storage_errors += 1; + } + } + } + + // TODO: if all the layers are gone, clean up the whole timeline dir (remove index) + } + + Ok(()) +} + /// Physical garbage collection: removing unused S3 objects. This is distinct from the garbage collection /// done inside the pageserver, which operates at a higher level (keys, layers). This type of garbage collection /// is about removing: @@ -156,22 +470,26 @@ async fn maybe_delete_index( /// make sure that object listings don't get slowed down by large numbers of garbage objects. pub async fn pageserver_physical_gc( bucket_config: BucketConfig, - tenant_ids: Vec, + controller_client_conf: Option, + tenant_shard_ids: Vec, min_age: Duration, mode: GcMode, ) -> anyhow::Result { let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?; - let tenants = if tenant_ids.is_empty() { + let tenants = if tenant_shard_ids.is_empty() { futures::future::Either::Left(stream_tenants(&s3_client, &target)) } else { - futures::future::Either::Right(futures::stream::iter(tenant_ids.into_iter().map(Ok))) + futures::future::Either::Right(futures::stream::iter(tenant_shard_ids.into_iter().map(Ok))) }; // How many tenants to process in parallel. We need to be mindful of pageservers // accessing the same per tenant prefixes, so use a lower setting than pageservers. const CONCURRENCY: usize = 32; + // Accumulate information about each tenant for cross-shard GC step we'll do at the end + let accumulator = Arc::new(std::sync::Mutex::new(TenantRefAccumulator::default())); + // Generate a stream of TenantTimelineId let timelines = tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, t)); let timelines = timelines.try_buffered(CONCURRENCY); @@ -185,16 +503,17 @@ pub async fn pageserver_physical_gc( target: &RootTarget, mode: GcMode, ttid: TenantShardTimelineId, + accumulator: &Arc>, ) -> anyhow::Result { let mut summary = GcSummary::default(); let data = list_timeline_blobs(s3_client, ttid, target).await?; - let (latest_gen, candidates) = match &data.blob_data { + let (index_part, latest_gen, candidates) = match &data.blob_data { BlobDataParseResult::Parsed { - index_part: _index_part, + index_part, index_part_generation, s3_layers: _s3_layers, - } => (*index_part_generation, data.unused_index_keys), + } => (index_part, *index_part_generation, data.unused_index_keys), BlobDataParseResult::Relic => { // Post-deletion tenant location: don't try and GC it. return Ok(summary); @@ -206,6 +525,8 @@ pub async fn pageserver_physical_gc( } }; + accumulator.lock().unwrap().update(ttid, index_part); + for key in candidates { maybe_delete_index( s3_client, @@ -222,17 +543,61 @@ pub async fn pageserver_physical_gc( Ok(summary) } - let timelines = timelines - .map_ok(|ttid| gc_timeline(&s3_client, &bucket_config, &min_age, &target, mode, ttid)); - let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); let mut summary = GcSummary::default(); - while let Some(i) = timelines.next().await { - let tl_summary = i?; + // Drain futures for per-shard GC, populating accumulator as a side effect + { + let timelines = timelines.map_ok(|ttid| { + gc_timeline( + &s3_client, + &bucket_config, + &min_age, + &target, + mode, + ttid, + &accumulator, + ) + }); + let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); - summary.indices_deleted += tl_summary.indices_deleted; - summary.remote_storage_errors += tl_summary.remote_storage_errors; + while let Some(i) = timelines.next().await { + summary.merge(i?); + } + } + + // Execute cross-shard GC, using the accumulator's full view of all the shards built in the per-shard GC + let Some(controller_client) = controller_client_conf.as_ref().map(|c| { + let ControllerClientConfig { + controller_api, + controller_jwt, + } = c; + control_api::Client::new(controller_api.clone(), Some(controller_jwt.clone())) + }) else { + tracing::info!("Skipping ancestor layer GC, because no `--controller-api` was specified"); + return Ok(summary); + }; + + let (ancestor_shards, ancestor_refs) = Arc::into_inner(accumulator) + .unwrap() + .into_inner() + .unwrap() + .into_gc_ancestors(&controller_client, &mut summary) + .await; + + for ancestor_shard in ancestor_shards { + gc_ancestor( + &s3_client, + &bucket_config, + &target, + &min_age, + ancestor_shard, + &ancestor_refs, + mode, + &mut summary, + ) + .instrument(info_span!("gc_ancestor", %ancestor_shard)) + .await?; } Ok(summary) diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs index df4f29acf7..fbd60f93bb 100644 --- a/storage_scrubber/src/scan_pageserver_metadata.rs +++ b/storage_scrubber/src/scan_pageserver_metadata.rs @@ -8,12 +8,11 @@ use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; use aws_sdk_s3::Client; use futures_util::{StreamExt, TryStreamExt}; -use histogram::Histogram; use pageserver::tenant::remote_timeline_client::remote_layer_path; -use pageserver::tenant::IndexPart; use pageserver_api::shard::TenantShardId; use serde::Serialize; use utils::id::TenantId; +use utils::shard::ShardCount; #[derive(Serialize)] pub struct MetadataSummary { @@ -24,66 +23,6 @@ pub struct MetadataSummary { with_warnings: HashSet, with_orphans: HashSet, indices_by_version: HashMap, - - layer_count: MinMaxHisto, - timeline_size_bytes: MinMaxHisto, - layer_size_bytes: MinMaxHisto, -} - -/// A histogram plus minimum and maximum tracking -#[derive(Serialize)] -struct MinMaxHisto { - #[serde(skip)] - histo: Histogram, - min: u64, - max: u64, -} - -impl MinMaxHisto { - fn new() -> Self { - Self { - histo: histogram::Histogram::builder() - .build() - .expect("Bad histogram params"), - min: u64::MAX, - max: 0, - } - } - - fn sample(&mut self, v: u64) -> Result<(), histogram::Error> { - self.min = std::cmp::min(self.min, v); - self.max = std::cmp::max(self.max, v); - let r = self.histo.increment(v, 1); - - if r.is_err() { - tracing::warn!("Bad histogram sample: {v}"); - } - - r - } - - fn oneline(&self) -> String { - let percentiles = match self.histo.percentiles(&[1.0, 10.0, 50.0, 90.0, 99.0]) { - Ok(p) => p, - Err(e) => return format!("No data: {}", e), - }; - - let percentiles: Vec = percentiles - .iter() - .map(|p| p.bucket().low() + p.bucket().high() / 2) - .collect(); - - format!( - "min {}, 1% {}, 10% {}, 50% {}, 90% {}, 99% {}, max {}", - self.min, - percentiles[0], - percentiles[1], - percentiles[2], - percentiles[3], - percentiles[4], - self.max, - ) - } } impl MetadataSummary { @@ -96,25 +35,9 @@ impl MetadataSummary { with_warnings: HashSet::new(), with_orphans: HashSet::new(), indices_by_version: HashMap::new(), - layer_count: MinMaxHisto::new(), - timeline_size_bytes: MinMaxHisto::new(), - layer_size_bytes: MinMaxHisto::new(), } } - fn update_histograms(&mut self, index_part: &IndexPart) -> Result<(), histogram::Error> { - self.layer_count - .sample(index_part.layer_metadata.len() as u64)?; - let mut total_size: u64 = 0; - for meta in index_part.layer_metadata.values() { - total_size += meta.file_size; - self.layer_size_bytes.sample(meta.file_size)?; - } - self.timeline_size_bytes.sample(total_size)?; - - Ok(()) - } - fn update_data(&mut self, data: &S3TimelineBlobData) { self.timeline_shard_count += 1; if let BlobDataParseResult::Parsed { @@ -127,14 +50,6 @@ impl MetadataSummary { .indices_by_version .entry(index_part.version()) .or_insert(0) += 1; - - if let Err(e) = self.update_histograms(index_part) { - // Value out of range? Warn that the results are untrustworthy - tracing::warn!( - "Error updating histograms, summary stats may be wrong: {}", - e - ); - } } } @@ -169,9 +84,6 @@ With errors: {} With warnings: {} With orphan layers: {} Index versions: {version_summary} -Timeline size bytes: {} -Layer size bytes: {} -Timeline layer count: {} ", self.tenant_count, self.timeline_count, @@ -179,9 +91,6 @@ Timeline layer count: {} self.with_errors.len(), self.with_warnings.len(), self.with_orphans.len(), - self.timeline_size_bytes.oneline(), - self.layer_size_bytes.oneline(), - self.layer_count.oneline(), ) } @@ -235,33 +144,60 @@ pub async fn scan_metadata( let mut tenant_objects = TenantObjectListing::default(); let mut tenant_timeline_results = Vec::new(); - fn analyze_tenant( + async fn analyze_tenant( + s3_client: &Client, + target: &RootTarget, tenant_id: TenantId, summary: &mut MetadataSummary, mut tenant_objects: TenantObjectListing, timelines: Vec<(TenantShardTimelineId, S3TimelineBlobData)>, + highest_shard_count: ShardCount, ) { summary.tenant_count += 1; let mut timeline_ids = HashSet::new(); let mut timeline_generations = HashMap::new(); for (ttid, data) in timelines { - timeline_ids.insert(ttid.timeline_id); - // Stash the generation of each timeline, for later use identifying orphan layers - if let BlobDataParseResult::Parsed { - index_part: _index_part, - index_part_generation, - s3_layers: _s3_layers, - } = &data.blob_data - { - timeline_generations.insert(ttid, *index_part_generation); - } + if ttid.tenant_shard_id.shard_count == highest_shard_count { + // Only analyze `TenantShardId`s with highest shard count. - // Apply checks to this timeline shard's metadata, and in the process update `tenant_objects` - // reference counts for layers across the tenant. - let analysis = - branch_cleanup_and_check_errors(&ttid, &mut tenant_objects, None, None, Some(data)); - summary.update_analysis(&ttid, &analysis); + // Stash the generation of each timeline, for later use identifying orphan layers + if let BlobDataParseResult::Parsed { + index_part, + index_part_generation, + s3_layers: _s3_layers, + } = &data.blob_data + { + if index_part.deleted_at.is_some() { + // skip deleted timeline. + tracing::info!("Skip analysis of {} b/c timeline is already deleted", ttid); + continue; + } + timeline_generations.insert(ttid, *index_part_generation); + } + + // Apply checks to this timeline shard's metadata, and in the process update `tenant_objects` + // reference counts for layers across the tenant. + let analysis = branch_cleanup_and_check_errors( + s3_client, + target, + &ttid, + &mut tenant_objects, + None, + None, + Some(data), + ) + .await; + summary.update_analysis(&ttid, &analysis); + + timeline_ids.insert(ttid.timeline_id); + } else { + tracing::info!( + "Skip analysis of {} b/c a lower shard count than {}", + ttid, + highest_shard_count.0, + ); + } } summary.timeline_count += timeline_ids.len(); @@ -309,18 +245,35 @@ pub async fn scan_metadata( // all results for the same tenant will be adjacent. We accumulate these, // and then call `analyze_tenant` to flush, when we see the next tenant ID. let mut summary = MetadataSummary::new(); + let mut highest_shard_count = ShardCount::MIN; while let Some(i) = timelines.next().await { let (ttid, data) = i?; summary.update_data(&data); match tenant_id { - None => tenant_id = Some(ttid.tenant_shard_id.tenant_id), + None => { + tenant_id = Some(ttid.tenant_shard_id.tenant_id); + highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count); + } Some(prev_tenant_id) => { if prev_tenant_id != ttid.tenant_shard_id.tenant_id { + // New tenant: analyze this tenant's timelines, clear accumulated tenant_timeline_results let tenant_objects = std::mem::take(&mut tenant_objects); let timelines = std::mem::take(&mut tenant_timeline_results); - analyze_tenant(prev_tenant_id, &mut summary, tenant_objects, timelines); + analyze_tenant( + &s3_client, + &target, + prev_tenant_id, + &mut summary, + tenant_objects, + timelines, + highest_shard_count, + ) + .await; tenant_id = Some(ttid.tenant_shard_id.tenant_id); + highest_shard_count = ttid.tenant_shard_id.shard_count; + } else { + highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count); } } } @@ -338,11 +291,15 @@ pub async fn scan_metadata( if !tenant_timeline_results.is_empty() { analyze_tenant( + &s3_client, + &target, tenant_id.expect("Must be set if results are present"), &mut summary, tenant_objects, tenant_timeline_results, - ); + highest_shard_count, + ) + .await; } Ok(summary) diff --git a/test_runner/fixtures/common_types.py b/test_runner/fixtures/common_types.py index 147264762c..b63dfd4e47 100644 --- a/test_runner/fixtures/common_types.py +++ b/test_runner/fixtures/common_types.py @@ -143,6 +143,9 @@ class TimelineId(Id): def __repr__(self) -> str: return f'TimelineId("{self.id.hex()}")' + def __str__(self) -> str: + return self.id.hex() + # Workaround for compat with python 3.9, which does not have `typing.Self` TTenantShardId = TypeVar("TTenantShardId", bound="TenantShardId") diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 429b6af548..08215438e1 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -109,8 +109,6 @@ class NeonCompare(PgCompare): # Create tenant tenant_conf: Dict[str, str] = {} - if False: # TODO add pytest setting for this - tenant_conf["trace_read_requests"] = "true" self.tenant, _ = self.env.neon_cli.create_tenant(conf=tenant_conf) # Create timeline diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index c019cbbc77..4836d42db5 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -146,6 +146,8 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_smgr_query_seconds_sum", "pageserver_archive_size", "pageserver_pitr_history_size", + "pageserver_layer_bytes", + "pageserver_layer_count", "pageserver_storage_operations_seconds_count_total", "pageserver_storage_operations_seconds_sum_total", "pageserver_evictions_total", diff --git a/test_runner/fixtures/neon_api.py b/test_runner/fixtures/neon_api.py index 39baf5fab6..658ed119a1 100644 --- a/test_runner/fixtures/neon_api.py +++ b/test_runner/fixtures/neon_api.py @@ -261,3 +261,47 @@ class NeonAPI: if op["status"] in {"scheduling", "running", "cancelling"}: has_running = True time.sleep(0.5) + + +class NeonApiEndpoint: + def __init__(self, neon_api: NeonAPI, pg_version: PgVersion, project_id: Optional[str]): + self.neon_api = neon_api + if project_id is None: + project = neon_api.create_project(pg_version) + neon_api.wait_for_operation_to_finish(project["project"]["id"]) + self.project_id = project["project"]["id"] + self.endpoint_id = project["endpoints"][0]["id"] + self.connstr = project["connection_uris"][0]["connection_uri"] + self.pgbench_env = connection_parameters_to_env( + project["connection_uris"][0]["connection_parameters"] + ) + self.is_new = True + else: + project = neon_api.get_project_details(project_id) + if int(project["project"]["pg_version"]) != int(pg_version): + raise Exception( + f"A project with the provided ID exists, but it's not of the specified version (expected {pg_version}, got {project['project']['pg_version']})" + ) + self.project_id = project_id + eps = neon_api.get_endpoints(project_id)["endpoints"] + self.endpoint_id = eps[0]["id"] + self.connstr = neon_api.get_connection_uri(project_id, endpoint_id=self.endpoint_id)[ + "uri" + ] + pw = self.connstr.split("@")[0].split(":")[-1] + self.pgbench_env = { + "PGHOST": eps[0]["host"], + "PGDATABASE": "neondb", + "PGUSER": "neondb_owner", + "PGPASSWORD": pw, + } + self.is_new = False + + def restart(self): + self.neon_api.restart_endpoint(self.project_id, self.endpoint_id) + self.neon_api.wait_for_operation_to_finish(self.project_id) + + def get_synthetic_storage_size(self) -> int: + return int( + self.neon_api.get_project_details(self.project_id)["project"]["synthetic_storage_size"] + ) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 5ca31644a9..9e39457c06 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -31,6 +31,7 @@ import backoff import httpx import jwt import psycopg2 +import psycopg2.sql import pytest import requests import toml @@ -87,7 +88,7 @@ from fixtures.utils import ( ) from fixtures.utils import AuxFileStore as AuxFileStore # reexport -from .neon_api import NeonAPI +from .neon_api import NeonAPI, NeonApiEndpoint """ This file contains pytest fixtures. A fixture is a test resource that can be @@ -727,8 +728,30 @@ class NeonEnvBuilder: self.repo_dir / "local_fs_remote_storage", ) - if (attachments_json := Path(repo_dir / "attachments.json")).exists(): - shutil.copyfile(attachments_json, self.repo_dir / attachments_json.name) + # restore storage controller (the db is small, don't bother with overlayfs) + storcon_db_from_dir = repo_dir / "storage_controller_db" + storcon_db_to_dir = self.repo_dir / "storage_controller_db" + log.info(f"Copying storage_controller_db from {storcon_db_from_dir} to {storcon_db_to_dir}") + assert storcon_db_from_dir.is_dir() + assert not storcon_db_to_dir.exists() + + def ignore_postgres_log(path: str, _names): + if Path(path) == storcon_db_from_dir: + return {"postgres.log"} + return set() + + shutil.copytree(storcon_db_from_dir, storcon_db_to_dir, ignore=ignore_postgres_log) + assert not (storcon_db_to_dir / "postgres.log").exists() + # NB: neon_local rewrites postgresql.conf on each start based on neon_local config. No need to patch it. + # However, in this new NeonEnv, the pageservers listen on different ports, and the storage controller + # will currently reject re-attach requests from them because the NodeMetadata isn't identical. + # So, from_repo_dir patches up the the storcon database. + patch_script_path = self.repo_dir / "storage_controller_db.startup.sql" + assert not patch_script_path.exists() + patch_script = "" + for ps in self.env.pageservers: + patch_script += f"UPDATE nodes SET listen_http_port={ps.service_port.http}, listen_pg_port={ps.service_port.pg} WHERE node_id = '{ps.id}';" + patch_script_path.write_text(patch_script) # Update the config with info about tenants and timelines with (self.repo_dir / "config").open("r") as f: @@ -974,7 +997,7 @@ class NeonEnvBuilder: if self.scrub_on_exit: try: - StorageScrubber(self).scan_metadata() + self.env.storage_scrubber.scan_metadata() except Exception as e: log.error(f"Error during remote storage scrub: {e}") cleanup_error = e @@ -1135,6 +1158,7 @@ class NeonEnv: "listen_http_addr": f"localhost:{pageserver_port.http}", "pg_auth_type": pg_auth_type, "http_auth_type": http_auth_type, + "image_compression": "zstd", } if self.pageserver_virtual_file_io_engine is not None: ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine @@ -1201,6 +1225,9 @@ class NeonEnv: ) cfg["safekeepers"].append(sk_cfg) + # Scrubber instance for tests that use it, and for use during teardown checks + self.storage_scrubber = StorageScrubber(self, log_dir=config.test_output_dir) + log.info(f"Config: {cfg}") self.neon_cli.init( cfg, @@ -2287,6 +2314,14 @@ class NeonStorageController(MetricsGetter, LogUtils): headers=self.headers(TokenScope.ADMIN), ) + def node_delete(self, node_id): + log.info(f"node_delete({node_id})") + self.request( + "DELETE", + f"{self.env.storage_controller_api}/control/v1/node/{node_id}", + headers=self.headers(TokenScope.ADMIN), + ) + def node_drain(self, node_id): log.info(f"node_drain({node_id})") self.request( @@ -2392,7 +2427,7 @@ class NeonStorageController(MetricsGetter, LogUtils): def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]: """ - :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int} + :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr": str, "listen_http_port": int} """ response = self.request( "GET", @@ -2778,8 +2813,8 @@ class NeonPageserver(PgProtocol, LogUtils): ) return client.tenant_attach( tenant_id, + generation, config, - generation=generation, ) def tenant_detach(self, tenant_id: TenantId): @@ -3150,6 +3185,18 @@ class RemotePostgres(PgProtocol): pass +@pytest.fixture(scope="function") +def benchmark_project_pub(neon_api: NeonAPI, pg_version: PgVersion) -> NeonApiEndpoint: + project_id = os.getenv("BENCHMARK_PROJECT_ID_PUB") + return NeonApiEndpoint(neon_api, pg_version, project_id) + + +@pytest.fixture(scope="function") +def benchmark_project_sub(neon_api: NeonAPI, pg_version: PgVersion) -> NeonApiEndpoint: + project_id = os.getenv("BENCHMARK_PROJECT_ID_SUB") + return NeonApiEndpoint(neon_api, pg_version, project_id) + + @pytest.fixture(scope="function") def remote_pg( test_output_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion @@ -3765,12 +3812,12 @@ class Endpoint(PgProtocol, LogUtils): self.endpoint_id, self.tenant_id, pageserver_id, self.active_safekeepers ) - def respec(self, **kwargs): + def respec(self, **kwargs: Any) -> None: """Update the endpoint.json file used by control_plane.""" # Read config config_path = os.path.join(self.endpoint_path(), "endpoint.json") with open(config_path, "r") as f: - data_dict = json.load(f) + data_dict: dict[str, Any] = json.load(f) # Write it back updated with open(config_path, "w") as file: @@ -3778,13 +3825,13 @@ class Endpoint(PgProtocol, LogUtils): json.dump(dict(data_dict, **kwargs), file, indent=4) # Please note: Migrations only run if pg_skip_catalog_updates is false - def wait_for_migrations(self): + def wait_for_migrations(self, num_migrations: int = 10): with self.cursor() as cur: def check_migrations_done(): cur.execute("SELECT id FROM neon_migration.migration_id") - migration_id = cur.fetchall()[0][0] - assert migration_id != 0 + migration_id: int = cur.fetchall()[0][0] + assert migration_id >= num_migrations wait_until(20, 0.5, check_migrations_done) @@ -4034,6 +4081,22 @@ class Safekeeper(LogUtils): self.id = id self.running = running self.logfile = Path(self.data_dir) / f"safekeeper-{id}.log" + + if extra_opts is None: + # Testing defaults: enable everything, and set short timeouts so that background + # work will happen during short tests. + # **Note**: Any test that explicitly sets extra_opts will not get these defaults. + extra_opts = [ + "--enable-offload", + "--delete-offloaded-wal", + "--partial-backup-timeout", + "10s", + "--control-file-save-interval", + "1s", + "--eviction-min-resident", + "10s", + ] + self.extra_opts = extra_opts def start( @@ -4205,9 +4268,9 @@ class Safekeeper(LogUtils): class StorageScrubber: - def __init__(self, env: NeonEnvBuilder, log_dir: Optional[Path] = None): + def __init__(self, env: NeonEnv, log_dir: Path): self.env = env - self.log_dir = log_dir or env.test_output_dir + self.log_dir = log_dir def scrubber_cli(self, args: list[str], timeout) -> str: assert isinstance(self.env.pageserver_remote_storage, S3Storage) @@ -4224,11 +4287,14 @@ class StorageScrubber: if s3_storage.endpoint is not None: env.update({"AWS_ENDPOINT_URL": s3_storage.endpoint}) - base_args = [str(self.env.neon_binpath / "storage_scrubber")] + base_args = [ + str(self.env.neon_binpath / "storage_scrubber"), + f"--controller-api={self.env.storage_controller_api}", + ] args = base_args + args (output_path, stdout, status_code) = subprocess_capture( - self.env.test_output_dir, + self.log_dir, args, echo_stderr=True, echo_stdout=True, @@ -4267,7 +4333,10 @@ class StorageScrubber: log.info(f"tenant-snapshot output: {stdout}") def pageserver_physical_gc( - self, min_age_secs: int, tenant_ids: Optional[list[TenantId]] = None + self, + min_age_secs: int, + tenant_ids: Optional[list[TenantId]] = None, + mode: Optional[str] = None, ): args = ["pageserver-physical-gc", "--min-age", f"{min_age_secs}s"] @@ -4277,6 +4346,9 @@ class StorageScrubber: for tenant_id in tenant_ids: args.extend(["--tenant-id", str(tenant_id)]) + if mode is not None: + args.extend(["--mode", mode]) + stdout = self.scrubber_cli( args, timeout=30, diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index c5b09e3608..dff002bd4b 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -102,6 +102,7 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [ # failing to connect to them. ".*Call to node.*management API.*failed.*receive body.*", ".*Call to node.*management API.*failed.*ReceiveBody.*", + ".*Failed to update node .+ after heartbeat round.*error sending request for url.*", # Many tests will start up with a node offline ".*startup_reconcile: Could not scan node.*", # Tests run in dev mode diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 03aee9e5c5..c6df6b5baf 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -117,6 +117,9 @@ class LayerMapInfo: def image_layers(self) -> List[HistoricLayerInfo]: return [x for x in self.historic_layers if x.kind == "Image"] + def delta_l0_layers(self) -> List[HistoricLayerInfo]: + return [x for x in self.historic_layers if x.kind == "Delta" and x.l0] + def historic_by_name(self) -> Set[str]: return set(x.layer_file_name for x in self.historic_layers) @@ -172,6 +175,21 @@ class PageserverHttpClient(requests.Session, MetricsGetter): if auth_token is not None: self.headers["Authorization"] = f"Bearer {auth_token}" + def without_status_retrying(self) -> PageserverHttpClient: + retries = Retry( + status=0, + connect=5, + read=False, + backoff_factor=0.2, + status_forcelist=[], + allowed_methods=None, + remove_headers_on_redirect=[], + ) + + return PageserverHttpClient( + self.port, self.is_testing_enabled_or_skip, self.auth_token, retries + ) + @property def base_url(self) -> str: return f"http://localhost:{self.port}" @@ -223,8 +241,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def tenant_attach( self, tenant_id: Union[TenantId, TenantShardId], + generation: int, config: None | Dict[str, Any] = None, - generation: Optional[int] = None, ): config = config or {} @@ -644,6 +662,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): force_repartition=False, force_image_layer_creation=False, wait_until_uploaded=False, + compact: Optional[bool] = None, ): self.is_testing_enabled_or_skip() query = {} @@ -654,6 +673,9 @@ class PageserverHttpClient(requests.Session, MetricsGetter): if wait_until_uploaded: query["wait_until_uploaded"] = "true" + if compact is not None: + query["compact"] = "true" if compact else "false" + log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}") res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint", @@ -814,17 +836,19 @@ class PageserverHttpClient(requests.Session, MetricsGetter): tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, batch_size: int | None = None, - ) -> Set[TimelineId]: + **kwargs, + ) -> List[TimelineId]: params = {} if batch_size is not None: params["batch_size"] = batch_size res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/detach_ancestor", params=params, + **kwargs, ) self.verbose_error(res) json = res.json() - return set(map(TimelineId, json["reparented_timelines"])) + return list(map(TimelineId, json["reparented_timelines"])) def evict_layer( self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py index c437258c6f..3e0ffabf74 100644 --- a/test_runner/fixtures/pageserver/many_tenants.py +++ b/test_runner/fixtures/pageserver/many_tenants.py @@ -1,5 +1,4 @@ import concurrent.futures -import time from typing import Any, Callable, Dict, Tuple import fixtures.pageserver.remote_storage @@ -9,9 +8,6 @@ from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, ) -from fixtures.pageserver.utils import ( - wait_until_tenant_state, -) from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind @@ -46,38 +42,33 @@ def single_timeline( log.info(f"duplicating template tenant {ncopies} times in S3") tenants = fixtures.pageserver.remote_storage.duplicate_tenant(env, template_tenant, ncopies) + # In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done. + # However, on-demand downloads are quite slow ATM. + # => do the on-demand downloads in Python. + log.info("python-side on-demand download the layer files into local tenant dir") + tenant_timelines = list(map(lambda tenant: (tenant, template_timeline), tenants)) + fixtures.pageserver.remote_storage.copy_all_remote_layer_files_to_local_tenant_dir( + env, tenant_timelines + ) + log.info("attach duplicated tenants to pageserver") # In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done. # However, on-demand downloads are quite slow ATM. # => do the on-demand downloads in Python. assert ps_http.tenant_list() == [] - # make the attach fail after it created enough on-disk state to retry loading - # the tenant next startup, but before it can start background loops that would start download - ps_http.configure_failpoints(("attach-before-activate", "return")) - env.pageserver.allowed_errors.append( - ".*attach failed, setting tenant state to Broken: attach-before-activate.*" - ) - def attach_broken(tenant): + def attach(tenant): env.pageserver.tenant_attach( tenant, config=template_config.copy(), generation=100, override_storage_controller_generation=True, ) - time.sleep(0.1) - wait_until_tenant_state(ps_http, tenant, "Broken", 10) with concurrent.futures.ThreadPoolExecutor(max_workers=22) as executor: - executor.map(attach_broken, tenants) + executor.map(attach, tenants) - env.pageserver.stop( - immediate=True - ) # clears the failpoint as a side-effect; immediate to avoid hitting neon_local's timeout - tenant_timelines = list(map(lambda tenant: (tenant, template_timeline), tenants)) - log.info("python-side on-demand download the layer files into local tenant dir") - fixtures.pageserver.remote_storage.copy_all_remote_layer_files_to_local_tenant_dir( - env, tenant_timelines - ) + # Benchmarks will start the pageserver explicitly themselves + env.pageserver.stop() return env diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index 6f6526d3fc..0f2a997b1e 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -12,8 +12,9 @@ import boto3 import toml from mypy_boto3_s3 import S3Client -from fixtures.common_types import TenantId, TimelineId +from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.log_helper import log +from fixtures.pageserver.common_types import IndexPartDump TIMELINE_INDEX_PART_FILE_NAME = "index_part.json" TENANT_HEATMAP_FILE_NAME = "heatmap-v1.json" @@ -265,9 +266,38 @@ class S3Storage: def tenants_path(self) -> str: return f"{self.prefix_in_bucket}/tenants" - def tenant_path(self, tenant_id: TenantId) -> str: + def tenant_path(self, tenant_id: Union[TenantShardId, TenantId]) -> str: return f"{self.tenants_path()}/{tenant_id}" + def timeline_path( + self, tenant_id: Union[TenantShardId, TenantId], timeline_id: TimelineId + ) -> str: + return f"{self.tenant_path(tenant_id)}/timelines/{timeline_id}" + + def get_latest_index_key(self, index_keys: List[str]) -> str: + """ + Gets the latest index file key. + + @param index_keys: A list of index keys of different generations. + """ + + def parse_gen(index_key: str) -> int: + parts = index_key.split("index_part.json-") + return int(parts[-1], base=16) if len(parts) == 2 else -1 + + return max(index_keys, key=parse_gen) + + def download_index_part(self, index_key: str) -> IndexPartDump: + """ + Downloads the index content from remote storage. + + @param index_key: index key in remote storage. + """ + response = self.client.get_object(Bucket=self.bucket_name, Key=index_key) + body = response["Body"].read().decode("utf-8") + log.info(f"index_part.json: {body}") + return IndexPartDump.from_json(json.loads(body)) + def heatmap_key(self, tenant_id: TenantId) -> str: return f"{self.tenant_path(tenant_id)}/{TENANT_HEATMAP_FILE_NAME}" diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index 11e6fef28f..a51b89744b 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -1,6 +1,5 @@ import json -import re -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple, Union import pytest @@ -8,6 +7,7 @@ import requests from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log +from fixtures.metrics import Metrics, MetricsGetter, parse_metrics # Walreceiver as returned by sk's timeline status endpoint. @@ -31,15 +31,26 @@ class SafekeeperTimelineStatus: walreceivers: List[Walreceiver] -@dataclass -class SafekeeperMetrics: +class SafekeeperMetrics(Metrics): + # Helpers to get metrics from tests without hardcoding the metric names there. # These are metrics from Prometheus which uses float64 internally. # As a consequence, values may differ from real original int64s. - flush_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict) - commit_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict) + + def __init__(self, m: Metrics): + self.metrics = m.metrics + + def flush_lsn_inexact(self, tenant_id: TenantId, timeline_id: TimelineId): + return self.query_one( + "safekeeper_flush_lsn", {"tenant_id": str(tenant_id), "timeline_id": str(timeline_id)} + ).value + + def commit_lsn_inexact(self, tenant_id: TenantId, timeline_id: TimelineId): + return self.query_one( + "safekeeper_commit_lsn", {"tenant_id": str(tenant_id), "timeline_id": str(timeline_id)} + ).value -class SafekeeperHttpClient(requests.Session): +class SafekeeperHttpClient(requests.Session, MetricsGetter): HTTPError = requests.HTTPError def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled=False): @@ -209,28 +220,11 @@ class SafekeeperHttpClient(requests.Session): return res_json def get_metrics_str(self) -> str: + """You probably want to use get_metrics() instead.""" request_result = self.get(f"http://localhost:{self.port}/metrics") request_result.raise_for_status() return request_result.text def get_metrics(self) -> SafekeeperMetrics: - all_metrics_text = self.get_metrics_str() - - metrics = SafekeeperMetrics() - for match in re.finditer( - r'^safekeeper_flush_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', - all_metrics_text, - re.MULTILINE, - ): - metrics.flush_lsn_inexact[(TenantId(match.group(1)), TimelineId(match.group(2)))] = int( - match.group(3) - ) - for match in re.finditer( - r'^safekeeper_commit_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', - all_metrics_text, - re.MULTILINE, - ): - metrics.commit_lsn_inexact[ - (TenantId(match.group(1)), TimelineId(match.group(2))) - ] = int(match.group(3)) - return metrics + res = self.get_metrics_str() + return SafekeeperMetrics(parse_metrics(res)) diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py index b41ae60197..3258d4dcfa 100644 --- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py +++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py @@ -17,13 +17,11 @@ from performance.pageserver.util import ( @pytest.mark.parametrize("duration", [30]) @pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)]) @pytest.mark.parametrize("n_tenants", [10]) -@pytest.mark.parametrize("get_vectored_impl", ["sequential", "vectored"]) @pytest.mark.timeout(1000) def test_basebackup_with_high_slru_count( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, pg_bin: PgBin, - get_vectored_impl: str, n_tenants: int, pgbench_scale: int, duration: int, @@ -47,7 +45,7 @@ def test_basebackup_with_high_slru_count( max_file_descriptors = 500000 neon_env_builder.pageserver_config_override = ( f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}; " - f"get_vectored_impl='{get_vectored_impl}'; validate_vectored_get=false" + f"get_vectored_impl='vectored'; validate_vectored_get=false" ) params.update( { diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py index 60861cf939..949813c984 100644 --- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -255,11 +255,3 @@ def run_pagebench_benchmark( unit="ms", report=MetricReport.LOWER_IS_BETTER, ) - - env.storage_controller.allowed_errors.append( - # The test setup swaps NeonEnv instances, hence different - # pg instances are used for the storage controller db. This means - # the storage controller doesn't know about the nodes mentioned - # in attachments.json at start-up. - ".* Scheduler missing node 1", - ) diff --git a/test_runner/performance/test_compaction.py b/test_runner/performance/test_compaction.py index 326c4f5c6f..3c6f0b0131 100644 --- a/test_runner/performance/test_compaction.py +++ b/test_runner/performance/test_compaction.py @@ -2,6 +2,7 @@ from contextlib import closing import pytest from fixtures.compare_fixtures import NeonCompare +from fixtures.log_helper import log from fixtures.neon_fixtures import wait_for_last_flush_lsn @@ -56,3 +57,98 @@ def test_compaction(neon_compare: NeonCompare): pageserver_http.timeline_compact(tenant_id, timeline_id) neon_compare.report_size() + + +def test_compaction_l0_memory(neon_compare: NeonCompare): + """ + Generate a large stack of L0s pending compaction into L1s, and + measure the pageserver's peak RSS while doing so + """ + + env = neon_compare.env + pageserver_http = env.pageserver.http_client() + + tenant_id, timeline_id = env.neon_cli.create_tenant( + conf={ + # Initially disable compaction so that we will build up a stack of L0s + "compaction_period": "0s", + "gc_period": "0s", + } + ) + neon_compare.tenant = tenant_id + neon_compare.timeline = timeline_id + + endpoint = env.endpoints.create_start( + "main", tenant_id=tenant_id, config_lines=["shared_buffers=512MB"] + ) + + # Read tenant effective config and assert on checkpoint_distance and compaction_threshold, + # as we do want to test with defaults (to be same as the field), but this test's workload size makes assumptions about them. + # + # If these assertions fail, it probably means we changed the default. + tenant_conf = pageserver_http.tenant_config(tenant_id) + assert tenant_conf.effective_config["checkpoint_distance"] == 256 * 1024 * 1024 + assert tenant_conf.effective_config["compaction_threshold"] == 10 + + # Aim to write about 20 L0s, so that we will hit the limit on how many + # to compact at once + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + for i in range(200): + cur.execute(f"create table tbl{i} (i int, j int);") + cur.execute(f"insert into tbl{i} values (generate_series(1, 1000), 0);") + for j in range(100): + cur.execute(f"update tbl{i} set j = {j};") + + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + endpoint.stop() + + # Check we have generated the L0 stack we expected + layers = pageserver_http.layer_map_info(tenant_id, timeline_id) + initial_l0s = len(layers.delta_l0_layers()) + initial_l0s_size = sum(x.layer_file_size for x in layers.delta_l0_layers()) + log.info(f"l0s before compaction {initial_l0s} ({initial_l0s_size})") + + def rss_hwm(): + v = pageserver_http.get_metric_value("libmetrics_maxrss_kb") + assert v is not None + assert v > 0 + return v * 1024 + + before = rss_hwm() + pageserver_http.timeline_compact(tenant_id, timeline_id) + after = rss_hwm() + + log.info(f"RSS across compaction: {before} -> {after} (grew {after - before})") + + layers = pageserver_http.layer_map_info(tenant_id, timeline_id) + final_l0s_size = sum(x.layer_file_size for x in layers.delta_l0_layers()) + log.info(f"l0s after compaction {len(layers.delta_l0_layers())} ({final_l0s_size})") + + assert after > before # If we didn't use some memory the test is probably buggy + compaction_mapped_rss = after - before + + # During L0 compaction, we require as much memory as the physical size of what we compacted, and then some, + # because the key->value mapping in L0s compaction is exhaustive, non-streaming, and does not de-duplicate + # repeated references to the same key. + # + # To be fixed in https://github.com/neondatabase/neon/issues/8184, after which + # this memory estimate can be revised far downwards to something that doesn't scale + # linearly with the layer sizes. + MEMORY_ESTIMATE = (initial_l0s_size - final_l0s_size) * 1.5 + + # If we find that compaction is using more memory, this may indicate a regression + assert compaction_mapped_rss < MEMORY_ESTIMATE + + # If we find that compaction is using <0.5 the expected memory then: + # - maybe we made a big efficiency improvement, in which case update the test + # - maybe something is functionally wrong with the test and it's not driving the system as expected + assert compaction_mapped_rss > MEMORY_ESTIMATE / 2 + + # We should have compacted some but not all of the l0s, based on the limit on how much + # l0 to compact in one go + assert len(layers.delta_l0_layers()) > 0 + assert len(layers.delta_l0_layers()) < initial_l0s + + # The pageserver should have logged when it hit the compaction size limit + env.pageserver.assert_log_contains(".*hit max delta layer size limit.*") diff --git a/test_runner/performance/test_hot_page.py b/test_runner/performance/test_hot_page.py index d9785dd87e..5e97c7cddf 100644 --- a/test_runner/performance/test_hot_page.py +++ b/test_runner/performance/test_hot_page.py @@ -16,20 +16,34 @@ from pytest_lazyfixture import lazy_fixture ) def test_hot_page(env: PgCompare): # Update the same page many times, then measure read performance - num_writes = 1000000 with closing(env.pg.connect()) as conn: with conn.cursor() as cur: cur.execute("drop table if exists t, f;") + num_writes = 1000000 - # Write many updates to the same row + # Use a PL/pgSQL block to perform many updates to the same row + # without depending on the latency between database client and postgres + # server + # - however a single staement should not run into a timeout so we increase it + cur.execute("SET statement_timeout = '4h';") with env.record_duration("write"): - cur.execute("create table t (i integer);") - cur.execute("insert into t values (0);") - for i in range(num_writes): - cur.execute(f"update t set i = {i};") + cur.execute( + f""" + DO $$ + BEGIN + create table t (i integer); + insert into t values (0); - # Write 3-4 MB to evict t from compute cache + FOR j IN 1..{num_writes} LOOP + update t set i = j; + END LOOP; + END $$; + """ + ) + + # Write ca 350 MB to evict t from compute shared buffers (128 MB) + # however it will still be in LFC, so I do not really understand the point of this test cur.execute("create table f (i integer);") cur.execute("insert into f values (generate_series(1,100000));") diff --git a/test_runner/performance/test_hot_table.py b/test_runner/performance/test_hot_table.py index 5fcffc8afb..9a78c92ec0 100644 --- a/test_runner/performance/test_hot_table.py +++ b/test_runner/performance/test_hot_table.py @@ -16,8 +16,8 @@ from pytest_lazyfixture import lazy_fixture ) def test_hot_table(env: PgCompare): # Update a small table many times, then measure read performance - num_rows = 100000 # Slightly larger than shared buffers size TODO validate - num_writes = 1000000 + num_rows = 100000 # initial table size only about 4 MB + num_writes = 10000000 # write approximately 349 MB blocks > 128 MB shared_buffers num_reads = 10 with closing(env.pg.connect()) as conn: @@ -28,8 +28,21 @@ def test_hot_table(env: PgCompare): with env.record_duration("write"): cur.execute("create table t (i integer primary key);") cur.execute(f"insert into t values (generate_series(1,{num_rows}));") - for i in range(num_writes): - cur.execute(f"update t set i = {i + num_rows} WHERE i = {i};") + # PL/pgSQL block to perform updates (and avoid latency between client and server) + # - however a single staement should not run into a timeout so we increase it + cur.execute("SET statement_timeout = '4h';") + cur.execute( + f""" + DO $$ + DECLARE + r integer := {num_rows}; + BEGIN + FOR j IN 1..{num_writes} LOOP + UPDATE t SET i = j + r WHERE i = j; + END LOOP; + END $$; + """ + ) # Read the table with env.record_duration("read"): diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index 5ab83dd31d..53bb29a659 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -1,7 +1,6 @@ from __future__ import annotations import time -import traceback from typing import TYPE_CHECKING import psycopg2 @@ -10,15 +9,12 @@ import pytest from fixtures.benchmark_fixture import MetricReport from fixtures.common_types import Lsn from fixtures.log_helper import log -from fixtures.neon_api import connection_parameters_to_env from fixtures.neon_fixtures import AuxFileStore, logical_replication_sync -from fixtures.pg_version import PgVersion if TYPE_CHECKING: from fixtures.benchmark_fixture import NeonBenchmarker - from fixtures.neon_api import NeonAPI + from fixtures.neon_api import NeonApiEndpoint from fixtures.neon_fixtures import NeonEnv, PgBin - from fixtures.pg_version import PgVersion @pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.V2]) @@ -86,8 +82,8 @@ def measure_logical_replication_lag(sub_cur, pub_cur, timeout_sec=600): @pytest.mark.timeout(2 * 60 * 60) def test_subscriber_lag( pg_bin: PgBin, - neon_api: NeonAPI, - pg_version: PgVersion, + benchmark_project_pub: NeonApiEndpoint, + benchmark_project_sub: NeonApiEndpoint, zenbenchmark: NeonBenchmarker, ): """ @@ -99,125 +95,82 @@ def test_subscriber_lag( sync_interval_min = 5 pgbench_duration = f"-T{test_duration_min * 60 * 2}" - pub_project = neon_api.create_project(pg_version) - pub_project_id = pub_project["project"]["id"] - neon_api.wait_for_operation_to_finish(pub_project_id) - error_occurred = False + pub_env = benchmark_project_pub.pgbench_env + sub_env = benchmark_project_sub.pgbench_env + pub_connstr = benchmark_project_pub.connstr + sub_connstr = benchmark_project_sub.connstr + + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) + + pub_conn = psycopg2.connect(pub_connstr) + sub_conn = psycopg2.connect(sub_connstr) + pub_conn.autocommit = True + sub_conn.autocommit = True + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + if benchmark_project_pub.is_new: + pub_cur.execute("create publication pub1 for table pgbench_accounts, pgbench_history") + + if benchmark_project_sub.is_new: + sub_cur.execute("truncate table pgbench_accounts") + sub_cur.execute("truncate table pgbench_history") + + sub_cur.execute(f"create subscription sub1 connection '{pub_connstr}' publication pub1") + + initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur) + pub_conn.close() + sub_conn.close() + + zenbenchmark.record("initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER) + + pub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env + ) try: - sub_project = neon_api.create_project(pg_version) - sub_project_id = sub_project["project"]["id"] - sub_endpoint_id = sub_project["endpoints"][0]["id"] - neon_api.wait_for_operation_to_finish(sub_project_id) + sub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-S"], + env=sub_env, + ) try: - pub_env = connection_parameters_to_env( - pub_project["connection_uris"][0]["connection_parameters"] - ) - sub_env = connection_parameters_to_env( - sub_project["connection_uris"][0]["connection_parameters"] - ) - pub_connstr = pub_project["connection_uris"][0]["connection_uri"] - sub_connstr = sub_project["connection_uris"][0]["connection_uri"] + start = time.time() + while time.time() - start < test_duration_min * 60: + time.sleep(sync_interval_min * 60) + check_pgbench_still_running(pub_workload, "pub") + check_pgbench_still_running(sub_workload, "sub") - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) + with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect( + sub_connstr + ) as sub_conn: + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + lag = measure_logical_replication_lag(sub_cur, pub_cur) - pub_conn = psycopg2.connect(pub_connstr) - sub_conn = psycopg2.connect(sub_connstr) - pub_conn.autocommit = True - sub_conn.autocommit = True - with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: - sub_cur.execute("truncate table pgbench_accounts") - sub_cur.execute("truncate table pgbench_history") + log.info(f"Replica lagged behind master by {lag} seconds") + zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) + sub_workload.terminate() + benchmark_project_sub.restart() - pub_cur.execute( - "create publication pub1 for table pgbench_accounts, pgbench_history" - ) - sub_cur.execute( - f"create subscription sub1 connection '{pub_connstr}' publication pub1" - ) - - initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur) - pub_conn.close() - sub_conn.close() - - zenbenchmark.record( - "initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER - ) - - pub_workload = pg_bin.run_nonblocking( - ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env - ) - try: sub_workload = pg_bin.run_nonblocking( ["pgbench", "-c10", pgbench_duration, "-S"], env=sub_env, ) - try: - start = time.time() - while time.time() - start < test_duration_min * 60: - time.sleep(sync_interval_min * 60) - check_pgbench_still_running(pub_workload, "pub") - check_pgbench_still_running(sub_workload, "sub") - with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect( - sub_connstr - ) as sub_conn: - with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: - lag = measure_logical_replication_lag(sub_cur, pub_cur) - - log.info(f"Replica lagged behind master by {lag} seconds") - zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) - sub_workload.terminate() - neon_api.restart_endpoint( - sub_project_id, - sub_endpoint_id, - ) - neon_api.wait_for_operation_to_finish(sub_project_id) - sub_workload = pg_bin.run_nonblocking( - ["pgbench", "-c10", pgbench_duration, "-S"], - env=sub_env, - ) - - # Measure storage to make sure replication information isn't bloating storage - sub_storage = neon_api.get_project_details(sub_project_id)["project"][ - "synthetic_storage_size" - ] - pub_storage = neon_api.get_project_details(pub_project_id)["project"][ - "synthetic_storage_size" - ] - zenbenchmark.record( - "sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER - ) - zenbenchmark.record( - "pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER - ) - - finally: - sub_workload.terminate() - finally: - pub_workload.terminate() - except Exception as e: - error_occurred = True - log.error(f"Caught exception {e}") - log.error(traceback.format_exc()) + # Measure storage to make sure replication information isn't bloating storage + sub_storage = benchmark_project_sub.get_synthetic_storage_size() + pub_storage = benchmark_project_pub.get_synthetic_storage_size() + zenbenchmark.record("sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER) + zenbenchmark.record("pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER) finally: - if not error_occurred: - neon_api.delete_project(sub_project_id) - except Exception as e: - error_occurred = True - log.error(f"Caught exception {e}") - log.error(traceback.format_exc()) + sub_workload.terminate() finally: - assert not error_occurred - neon_api.delete_project(pub_project_id) + pub_workload.terminate() @pytest.mark.remote_cluster @pytest.mark.timeout(2 * 60 * 60) def test_publisher_restart( pg_bin: PgBin, - neon_api: NeonAPI, - pg_version: PgVersion, + benchmark_project_pub: NeonApiEndpoint, + benchmark_project_sub: NeonApiEndpoint, zenbenchmark: NeonBenchmarker, ): """ @@ -229,114 +182,70 @@ def test_publisher_restart( sync_interval_min = 5 pgbench_duration = f"-T{test_duration_min * 60 * 2}" - pub_project = neon_api.create_project(pg_version) - pub_project_id = pub_project["project"]["id"] - pub_endpoint_id = pub_project["endpoints"][0]["id"] - neon_api.wait_for_operation_to_finish(pub_project_id) - error_occurred = False + pub_env = benchmark_project_pub.pgbench_env + sub_env = benchmark_project_sub.pgbench_env + pub_connstr = benchmark_project_pub.connstr + sub_connstr = benchmark_project_sub.connstr + + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) + + pub_conn = psycopg2.connect(pub_connstr) + sub_conn = psycopg2.connect(sub_connstr) + pub_conn.autocommit = True + sub_conn.autocommit = True + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + if benchmark_project_pub.is_new: + pub_cur.execute("create publication pub1 for table pgbench_accounts, pgbench_history") + + if benchmark_project_sub.is_new: + sub_cur.execute("truncate table pgbench_accounts") + sub_cur.execute("truncate table pgbench_history") + + sub_cur.execute(f"create subscription sub1 connection '{pub_connstr}' publication pub1") + + initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur) + pub_conn.close() + sub_conn.close() + + zenbenchmark.record("initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER) + + pub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env + ) try: - sub_project = neon_api.create_project(pg_version) - sub_project_id = sub_project["project"]["id"] - neon_api.wait_for_operation_to_finish(sub_project_id) + sub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-S"], + env=sub_env, + ) try: - pub_env = connection_parameters_to_env( - pub_project["connection_uris"][0]["connection_parameters"] - ) - sub_env = connection_parameters_to_env( - sub_project["connection_uris"][0]["connection_parameters"] - ) - pub_connstr = pub_project["connection_uris"][0]["connection_uri"] - sub_connstr = sub_project["connection_uris"][0]["connection_uri"] + start = time.time() + while time.time() - start < test_duration_min * 60: + time.sleep(sync_interval_min * 60) + check_pgbench_still_running(pub_workload, "pub") + check_pgbench_still_running(sub_workload, "sub") - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) - - pub_conn = psycopg2.connect(pub_connstr) - sub_conn = psycopg2.connect(sub_connstr) - pub_conn.autocommit = True - sub_conn.autocommit = True - with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: - sub_cur.execute("truncate table pgbench_accounts") - sub_cur.execute("truncate table pgbench_history") - - pub_cur.execute( - "create publication pub1 for table pgbench_accounts, pgbench_history" - ) - sub_cur.execute( - f"create subscription sub1 connection '{pub_connstr}' publication pub1" - ) - - initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur) - pub_conn.close() - sub_conn.close() - - zenbenchmark.record( - "initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER - ) - - pub_workload = pg_bin.run_nonblocking( - ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env - ) - try: - sub_workload = pg_bin.run_nonblocking( - ["pgbench", "-c10", pgbench_duration, "-S"], - env=sub_env, - ) - try: - start = time.time() - while time.time() - start < test_duration_min * 60: - time.sleep(sync_interval_min * 60) - check_pgbench_still_running(pub_workload, "pub") - check_pgbench_still_running(sub_workload, "sub") - - pub_workload.terminate() - neon_api.restart_endpoint( - pub_project_id, - pub_endpoint_id, - ) - neon_api.wait_for_operation_to_finish(pub_project_id) - pub_workload = pg_bin.run_nonblocking( - ["pgbench", "-c10", pgbench_duration, "-Mprepared"], - env=pub_env, - ) - with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect( - sub_connstr - ) as sub_conn: - with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: - lag = measure_logical_replication_lag(sub_cur, pub_cur) - - log.info(f"Replica lagged behind master by {lag} seconds") - zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) - - # Measure storage to make sure replication information isn't bloating storage - sub_storage = neon_api.get_project_details(sub_project_id)["project"][ - "synthetic_storage_size" - ] - pub_storage = neon_api.get_project_details(pub_project_id)["project"][ - "synthetic_storage_size" - ] - zenbenchmark.record( - "sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER - ) - zenbenchmark.record( - "pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER - ) - - finally: - sub_workload.terminate() - finally: pub_workload.terminate() - except Exception as e: - error_occurred = True - log.error(f"Caught exception {e}") - log.error(traceback.format_exc()) + benchmark_project_pub.restart() + pub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], + env=pub_env, + ) + with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect( + sub_connstr + ) as sub_conn: + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + lag = measure_logical_replication_lag(sub_cur, pub_cur) + + log.info(f"Replica lagged behind master by {lag} seconds") + zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) + + # Measure storage to make sure replication information isn't bloating storage + sub_storage = benchmark_project_sub.get_synthetic_storage_size() + pub_storage = benchmark_project_pub.get_synthetic_storage_size() + zenbenchmark.record("sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER) + zenbenchmark.record("pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER) finally: - if not error_occurred: - neon_api.delete_project(sub_project_id) - except Exception as e: - error_occurred = True - log.error(f"Caught exception {e}") - log.error(traceback.format_exc()) + sub_workload.terminate() finally: - assert not error_occurred - neon_api.delete_project(pub_project_id) + pub_workload.terminate() diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock index 32c1c52eea..354fc15745 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock @@ -421,9 +421,9 @@ checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "openssl" -version = "0.10.64" +version = "0.10.66" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95a0481286a310808298130d22dd1fef0fa571e05a8f44ec801801e84b216b1f" +checksum = "9529f4786b70a3e8c61e11179af17ab6188ad8d0ded78c5529441ed39d4bd9c1" dependencies = [ "bitflags 2.6.0", "cfg-if", @@ -453,9 +453,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.102" +version = "0.9.103" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c597637d56fbc83893a35eb0dd04b2b8e7a50c91e64e9493e398b5df4fb45fa2" +checksum = "7f9e8deee91df40a943c71b917e5874b951d32a802526c85721ce3b776c929d6" dependencies = [ "cc", "libc", diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index f2ee2b70aa..a7eda73d4c 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -168,7 +168,6 @@ def test_fully_custom_config(positive_env: NeonEnv): "refill_amount": 1000, "max": 1000, }, - "trace_read_requests": True, "walreceiver_connect_timeout": "13m", "image_layer_creation_check_threshold": 1, "switch_aux_file_policy": "cross-validation", diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index 922a21a999..7cb85e3dd1 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -211,7 +211,7 @@ def test_auth_failures(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): def check_pageserver(expect_success: bool, **conn_kwargs): check_connection( env.pageserver, - f"show {env.initial_tenant}", + f"pagestream {env.initial_tenant} {env.initial_timeline}", expect_success, **conn_kwargs, ) diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index eb503ddbfa..f2e3855c12 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -65,8 +65,8 @@ def test_branch_and_gc(neon_simple_env: NeonEnv, build_type: str): "compaction_period": "1 s", "compaction_threshold": "2", "image_creation_threshold": "1", - # set PITR interval to be small, so we can do GC - "pitr_interval": "1 s", + # Disable PITR, this test will set an explicit space-based GC limit + "pitr_interval": "0 s", } ) diff --git a/test_runner/regress/test_change_pageserver.py b/test_runner/regress/test_change_pageserver.py index 4d2cdb8e32..34791e5988 100644 --- a/test_runner/regress/test_change_pageserver.py +++ b/test_runner/regress/test_change_pageserver.py @@ -3,9 +3,16 @@ import asyncio from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.remote_storage import RemoteStorageKind +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response -def test_change_pageserver(neon_env_builder: NeonEnvBuilder): +def test_change_pageserver(neon_env_builder: NeonEnvBuilder, make_httpserver): + """ + A relatively low level test of reconfiguring a compute's pageserver at runtime. Usually this + is all done via the storage controller, but this test will disable the storage controller's compute + notifications, and instead update endpoints directly. + """ num_connections = 3 neon_env_builder.num_pageservers = 2 @@ -14,9 +21,24 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): ) env = neon_env_builder.init_start() + neon_env_builder.control_plane_compute_hook_api = ( + f"http://{make_httpserver.host}:{make_httpserver.port}/notify-attach" + ) + + def ignore_notify(request: Request): + # This test does direct updates to compute configuration: disable the storage controller's notification + log.info(f"Ignoring storage controller compute notification: {request.json}") + return Response(status=200) + + make_httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler( + ignore_notify + ) + env.neon_cli.create_branch("test_change_pageserver") endpoint = env.endpoints.create_start("test_change_pageserver") + # Put this tenant into a dual-attached state + assert env.get_tenant_pageserver(env.initial_tenant) == env.pageservers[0] alt_pageserver_id = env.pageservers[1].id env.pageservers[1].tenant_attach(env.initial_tenant) @@ -72,6 +94,7 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): env.pageservers[ 0 ].stop() # Stop the old pageserver just to make sure we're reading from the new one + env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"}) execute("SELECT count(*) FROM foo") assert fetchone() == (100000,) @@ -82,9 +105,10 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): # # Since we're dual-attached, need to tip-off storage controller to treat the one we're # about to start as the attached pageserver - env.storage_controller.attach_hook_issue(env.initial_tenant, env.pageservers[0].id) env.pageservers[0].start() env.pageservers[1].stop() + env.storage_controller.node_configure(env.pageservers[1].id, {"availability": "Offline"}) + env.storage_controller.reconcile_until_idle() endpoint.reconfigure(pageserver_id=env.pageservers[0].id) @@ -92,10 +116,9 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): assert fetchone() == (100000,) env.pageservers[0].stop() - # Since we're dual-attached, need to tip-off storage controller to treat the one we're - # about to start as the attached pageserver - env.storage_controller.attach_hook_issue(env.initial_tenant, env.pageservers[1].id) env.pageservers[1].start() + env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"}) + env.storage_controller.reconcile_until_idle() # Test a (former) bug where a child process spins without updating its connection string # by executing a query separately. This query will hang until we issue the reconfigure. diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 49dcb9b86a..be787e0642 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -1,12 +1,17 @@ import enum import json import os +import time from typing import Optional import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, generate_uploads_and_deletions +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + generate_uploads_and_deletions, +) from fixtures.pageserver.http import PageserverApiException +from fixtures.utils import wait_until from fixtures.workload import Workload AGGRESIVE_COMPACTION_TENANT_CONF = { @@ -140,6 +145,10 @@ def test_sharding_compaction( "image_layer_creation_check_threshold": 0, } + # Disable compression, as we can't estimate the size of layers with compression enabled + # TODO: implement eager layer cutting during compaction + neon_env_builder.pageserver_config_override = "image_compression='disabled'" + neon_env_builder.num_pageservers = 1 if shard_count is None else shard_count env = neon_env_builder.init_start( initial_tenant_conf=TENANT_CONF, @@ -257,3 +266,148 @@ def test_uploads_and_deletions( found_allowed_error = any(env.pageserver.log_contains(e) for e in allowed_errors) if not found_allowed_error: raise Exception("None of the allowed_errors occured in the log") + + +def test_pageserver_compaction_circuit_breaker(neon_env_builder: NeonEnvBuilder): + """ + Check that repeated failures in compaction result in a circuit breaker breaking + """ + TENANT_CONF = { + # Very frequent runs to rack up failures quickly + "compaction_period": "100ms", + # Small checkpoint distance to create many layers + "checkpoint_distance": 1024 * 128, + # Compact small layers + "compaction_target_size": 1024 * 128, + "image_creation_threshold": 1, + } + + FAILPOINT = "delta-layer-writer-fail-before-finish" + BROKEN_LOG = ".*Circuit breaker broken!.*" + + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) + + workload = Workload(env, env.initial_tenant, env.initial_timeline) + workload.init() + + # Set a failpoint that will prevent compaction succeeding + env.pageserver.http_client().configure_failpoints((FAILPOINT, "return")) + + # Write some data to trigger compaction + workload.write_rows(1024, upload=False) + workload.write_rows(1024, upload=False) + workload.write_rows(1024, upload=False) + + def assert_broken(): + env.pageserver.assert_log_contains(BROKEN_LOG) + assert ( + env.pageserver.http_client().get_metric_value("pageserver_circuit_breaker_broken_total") + or 0 + ) == 1 + assert ( + env.pageserver.http_client().get_metric_value( + "pageserver_circuit_breaker_unbroken_total" + ) + or 0 + ) == 0 + + # Wait for enough failures to break the circuit breaker + # This wait is fairly long because we back off on compaction failures, so 5 retries takes ~30s + wait_until(60, 1, assert_broken) + + # Sleep for a while, during which time we expect that compaction will _not_ be retried + time.sleep(10) + + assert ( + env.pageserver.http_client().get_metric_value("pageserver_circuit_breaker_broken_total") + or 0 + ) == 1 + assert ( + env.pageserver.http_client().get_metric_value("pageserver_circuit_breaker_unbroken_total") + or 0 + ) == 0 + assert not env.pageserver.log_contains(".*Circuit breaker failure ended.*") + + +@pytest.mark.parametrize("enabled", [True, False]) +def test_image_layer_compression(neon_env_builder: NeonEnvBuilder, enabled: bool): + tenant_conf = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{128 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{128 * 1024}", + # no PITR horizon, we specify the horizon when we request on-demand GC + "pitr_interval": "0s", + # disable background compaction and GC. We invoke it manually when we want it to happen. + "gc_period": "0s", + "compaction_period": "0s", + # create image layers as eagerly as possible + "image_creation_threshold": "1", + "image_layer_creation_check_threshold": "0", + } + + # Explicitly enable/disable compression, rather than using default + if enabled: + neon_env_builder.pageserver_config_override = "image_compression='zstd'" + else: + neon_env_builder.pageserver_config_override = "image_compression='disabled'" + + env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + pageserver = env.pageserver + ps_http = env.pageserver.http_client() + with env.endpoints.create_start( + "main", tenant_id=tenant_id, pageserver_id=pageserver.id + ) as endpoint: + endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)") + # Generate around 800k worth of easily compressible data to store + for v in range(100): + endpoint.safe_psql( + f"INSERT INTO foo (id, val) VALUES ({v}, repeat('abcde{v:0>3}', 500))" + ) + # run compaction to create image layers + ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True) + + layer_map = ps_http.layer_map_info(tenant_id, timeline_id) + image_layer_count = 0 + delta_layer_count = 0 + for layer in layer_map.historic_layers: + if layer.kind == "Image": + image_layer_count += 1 + elif layer.kind == "Delta": + delta_layer_count += 1 + assert image_layer_count > 0 + assert delta_layer_count > 0 + + log.info(f"images: {image_layer_count}, deltas: {delta_layer_count}") + + bytes_in = pageserver.http_client().get_metric_value( + "pageserver_compression_image_in_bytes_total" + ) + bytes_out = pageserver.http_client().get_metric_value( + "pageserver_compression_image_out_bytes_total" + ) + assert bytes_in is not None + assert bytes_out is not None + log.info(f"Compression ratio: {bytes_out/bytes_in} ({bytes_out} in, {bytes_out} out)") + + if enabled: + # We are writing high compressible repetitive plain text, expect excellent compression + EXPECT_RATIO = 0.2 + assert bytes_out / bytes_in < EXPECT_RATIO + else: + # Nothing should be compressed if we disabled it. + assert bytes_out >= bytes_in + + # Destroy the endpoint and create a new one to resetthe caches + with env.endpoints.create_start( + "main", tenant_id=tenant_id, pageserver_id=pageserver.id + ) as endpoint: + for v in range(100): + res = endpoint.safe_psql( + f"SELECT count(*) FROM foo WHERE id={v} and val=repeat('abcde{v:0>3}', 500)" + ) + assert res[0][0] == 1 diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 7722828c79..91c7b97fdd 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -21,6 +21,10 @@ from fixtures.utils import human_bytes, wait_until GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy" +# access times in the pageserver are stored at a very low resolution: to generate meaningfully different +# values, tests must inject sleeps +ATIME_RESOLUTION = 2 + @pytest.mark.parametrize("config_level_override", [None, 400]) def test_min_resident_size_override_handling( @@ -67,14 +71,11 @@ def test_min_resident_size_override_handling( @enum.unique class EvictionOrder(str, enum.Enum): - ABSOLUTE_ORDER = "absolute" RELATIVE_ORDER_EQUAL = "relative_equal" RELATIVE_ORDER_SPARE = "relative_spare" def config(self) -> Dict[str, Any]: - if self == EvictionOrder.ABSOLUTE_ORDER: - return {"type": "AbsoluteAccessed"} - elif self == EvictionOrder.RELATIVE_ORDER_EQUAL: + if self == EvictionOrder.RELATIVE_ORDER_EQUAL: return { "type": "RelativeAccessed", "args": {"highest_layer_count_loses_first": False}, @@ -230,6 +231,9 @@ def _eviction_env( neon_env_builder.num_pageservers = num_pageservers neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + # Disable compression support for EvictionEnv to get larger layer sizes + neon_env_builder.pageserver_config_override = "image_compression='disabled'" + # initial tenant will not be present on this pageserver env = neon_env_builder.init_configs() env.start() @@ -381,7 +385,7 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv): @pytest.mark.parametrize( "order", - [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL], + [EvictionOrder.RELATIVE_ORDER_EQUAL], ) def test_pageserver_evicts_until_pressure_is_relieved( eviction_env: EvictionEnv, order: EvictionOrder @@ -415,7 +419,7 @@ def test_pageserver_evicts_until_pressure_is_relieved( @pytest.mark.parametrize( "order", - [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL], + [EvictionOrder.RELATIVE_ORDER_EQUAL], ) def test_pageserver_respects_overridden_resident_size( eviction_env: EvictionEnv, order: EvictionOrder @@ -492,7 +496,7 @@ def test_pageserver_respects_overridden_resident_size( @pytest.mark.parametrize( "order", - [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL], + [EvictionOrder.RELATIVE_ORDER_EQUAL], ) def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: EvictionOrder): """ @@ -523,7 +527,6 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: E @pytest.mark.parametrize( "order", [ - EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL, EvictionOrder.RELATIVE_ORDER_SPARE, ], @@ -547,6 +550,7 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder): (tenant_id, timeline_id) = warm # make picked tenant more recently used than the other one + time.sleep(ATIME_RESOLUTION) env.warm_up_tenant(tenant_id) # Build up enough pressure to require evictions from both tenants, @@ -569,63 +573,38 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder): later_tenant_usage < du_by_timeline[tenant] ), "all tenants should have lost some layers" - warm_size = later_du_by_timeline[warm] - cold_size = later_du_by_timeline[cold] + # with relative order what matters is the amount of layers, with a + # fudge factor of whether the eviction bothers tenants with highest + # layer count the most. last accessed times between tenants does not + # matter. + assert order in [EvictionOrder.RELATIVE_ORDER_EQUAL, EvictionOrder.RELATIVE_ORDER_SPARE] + layers_now = env.count_layers_per_tenant(env.pageserver) - if order == EvictionOrder.ABSOLUTE_ORDER: - # bounds for warmed_size - warm_lower = 0.5 * du_by_timeline[warm] + expected_ratio = later_total_on_disk / total_on_disk + log.info( + f"freed up {100 * expected_ratio}%, expecting the layer counts to decrease in similar ratio" + ) - # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room. - # So, check for up to 3 here. - warm_upper = warm_lower + 3 * env.layer_size + for tenant_id, original_count in tenant_layers.items(): + count_now = layers_now[tenant_id] + ratio = count_now / original_count + abs_diff = abs(ratio - expected_ratio) + assert original_count > count_now - cold_upper = 2 * env.layer_size - log.info(f"tenants: warm={warm[0]}, cold={cold[0]}") + expectation = 0.06 log.info( - f"expecting for warm tenant: {human_bytes(warm_lower)} < {human_bytes(warm_size)} < {human_bytes(warm_upper)}" + f"tenant {tenant_id} layer count {original_count} -> {count_now}, ratio: {ratio}, expecting {abs_diff} < {expectation}" ) - log.info(f"expecting for cold tenant: {human_bytes(cold_size)} < {human_bytes(cold_upper)}") - - assert warm_size > warm_lower, "warmed up tenant should be at about half size (lower)" - assert warm_size < warm_upper, "warmed up tenant should be at about half size (upper)" - - assert ( - cold_size < cold_upper - ), "the cold tenant should be evicted to its min_resident_size, i.e., max layer file size" - else: - # with relative order what matters is the amount of layers, with a - # fudge factor of whether the eviction bothers tenants with highest - # layer count the most. last accessed times between tenants does not - # matter. - layers_now = env.count_layers_per_tenant(env.pageserver) - - expected_ratio = later_total_on_disk / total_on_disk - log.info( - f"freed up {100 * expected_ratio}%, expecting the layer counts to decrease in similar ratio" - ) - - for tenant_id, original_count in tenant_layers.items(): - count_now = layers_now[tenant_id] - ratio = count_now / original_count - abs_diff = abs(ratio - expected_ratio) - assert original_count > count_now - - expectation = 0.06 - log.info( - f"tenant {tenant_id} layer count {original_count} -> {count_now}, ratio: {ratio}, expecting {abs_diff} < {expectation}" - ) - # in this test case both relative_spare and relative_equal produce - # the same outcomes; this must be a quantization effect of similar - # sizes (-s4 and -s6) and small (5MB) layer size. - # for pg15 and pg16 the absdiff is < 0.01, for pg14 it is closer to 0.02 - assert abs_diff < expectation + # in this test case both relative_spare and relative_equal produce + # the same outcomes; this must be a quantization effect of similar + # sizes (-s4 and -s6) and small (5MB) layer size. + # for pg15 and pg16 the absdiff is < 0.01, for pg14 it is closer to 0.02 + assert abs_diff < expectation @pytest.mark.parametrize( "order", [ - EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL, EvictionOrder.RELATIVE_ORDER_SPARE, ], @@ -648,6 +627,10 @@ def test_fast_growing_tenant(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, or for scale in [1, 1, 1, 4]: timelines.append((pgbench_init_tenant(layer_size, scale, env, pg_bin), scale)) + # Eviction times are stored at a low resolution. We must ensure that the time between + # tenants is long enough for the pageserver to distinguish them. + time.sleep(ATIME_RESOLUTION) + env.neon_cli.safekeeper_stop() for (tenant_id, timeline_id), scale in timelines: @@ -677,14 +660,7 @@ def test_fast_growing_tenant(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, or ), "rest of the assertions expect 3 + 1 timelines, ratios, scales, all in order" log.info(f"{ratios}") - if order == EvictionOrder.ABSOLUTE_ORDER: - # first tenant loses most - assert ratios[0] <= ratios[1], "first should lose the most" - assert ratios[1] < ratios[2], "second should lose some" - assert ratios[1] < 1.0 - assert ratios[2] <= ratios[3], "third might not lose" - assert ratios[3] == 1.0, "tenant created last does not lose" - elif order == EvictionOrder.RELATIVE_ORDER_EQUAL: + if order == EvictionOrder.RELATIVE_ORDER_EQUAL: assert all([x for x in ratios if x < 1.0]), "all tenants lose layers" elif order == EvictionOrder.RELATIVE_ORDER_SPARE: # with different layer sizes and pg versions, there are different combinations @@ -747,7 +723,7 @@ def test_statvfs_error_handling(eviction_env: EvictionEnv): "type": "Failure", "mocked_error": "EIO", }, - eviction_order=EvictionOrder.ABSOLUTE_ORDER, + eviction_order=EvictionOrder.RELATIVE_ORDER_SPARE, ) env.neon_env.pageserver.assert_log_contains(".*statvfs failed.*EIO") @@ -781,7 +757,7 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv): # This avoids accounting for metadata files & tenant conf in the tests. "name_filter": ".*__.*", }, - eviction_order=EvictionOrder.ABSOLUTE_ORDER, + eviction_order=EvictionOrder.RELATIVE_ORDER_SPARE, ) wait_until( @@ -794,6 +770,16 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv): wait_until(2, 2, less_than_max_usage_pct) + # Disk usage candidate collection only takes into account active tenants. + # However, the statvfs call takes into account the entire tenants directory, + # which includes tenants which haven't become active yet. + # + # After re-starting the pageserver, disk usage eviction may kick in *before* + # both tenants have become active. Hence, the logic will try to satisfy the + # disk usage requirements by evicting everything belonging to the active tenant, + # and hence violating the tenant minimum resident size. + env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE) + def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): """ @@ -824,7 +810,7 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): # This avoids accounting for metadata files & tenant conf in the tests. "name_filter": ".*__.*", }, - eviction_order=EvictionOrder.ABSOLUTE_ORDER, + eviction_order=EvictionOrder.RELATIVE_ORDER_SPARE, ) wait_until( diff --git a/test_runner/regress/test_lfc_working_set_approximation.py b/test_runner/regress/test_lfc_working_set_approximation.py index 6465bdfd21..4c53e4e2fd 100644 --- a/test_runner/regress/test_lfc_working_set_approximation.py +++ b/test_runner/regress/test_lfc_working_set_approximation.py @@ -89,7 +89,7 @@ def test_sliding_working_set_approximation(neon_simple_env: NeonEnv): ) conn = endpoint.connect() cur = conn.cursor() - cur.execute("create extension neon version '1.4'") + cur.execute("create extension neon") cur.execute( "create table t(pk integer primary key, count integer default 0, payload text default repeat('?', 128))" ) diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index 41283e4d2c..66afe9ddfd 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -247,6 +247,12 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of cur.execute( "SELECT * FROM pg_logical_slot_peek_binary_changes('slotty_mcslotface', NULL, NULL, 'include-xids', '0')" ) + # do the peek second time: we've had a bug using wrong memory context + # for NeonWALReader leading to the crash in this case. + log.info("peek_changes again") + cur.execute( + "SELECT * FROM pg_logical_slot_peek_binary_changes('slotty_mcslotface', NULL, NULL, 'include-xids', '0')" + ) # Tests that walsender correctly blocks until WAL is downloaded from safekeepers diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py index 5637f160cf..bdc5ca907e 100644 --- a/test_runner/regress/test_migrations.py +++ b/test_runner/regress/test_migrations.py @@ -1,6 +1,10 @@ -import time +from __future__ import annotations -from fixtures.neon_fixtures import NeonEnv +import time +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv def test_migrations(neon_simple_env: NeonEnv): @@ -11,17 +15,14 @@ def test_migrations(neon_simple_env: NeonEnv): endpoint.respec(skip_pg_catalog_updates=False) endpoint.start() - endpoint.wait_for_migrations() - - num_migrations = 9 + num_migrations = 10 + endpoint.wait_for_migrations(num_migrations=num_migrations) with endpoint.cursor() as cur: cur.execute("SELECT id FROM neon_migration.migration_id") migration_id = cur.fetchall() assert migration_id[0][0] == num_migrations - endpoint.assert_log_contains(f"INFO handle_migrations: Ran {num_migrations} migrations") - endpoint.stop() endpoint.start() # We don't have a good way of knowing that the migrations code path finished executing @@ -31,5 +32,3 @@ def test_migrations(neon_simple_env: NeonEnv): cur.execute("SELECT id FROM neon_migration.migration_id") migration_id = cur.fetchall() assert migration_id[0][0] == num_migrations - - endpoint.assert_log_contains("INFO handle_migrations: Ran 0 migrations") diff --git a/test_runner/regress/test_neon_extension.py b/test_runner/regress/test_neon_extension.py index e83aaf91c6..bb844244e3 100644 --- a/test_runner/regress/test_neon_extension.py +++ b/test_runner/regress/test_neon_extension.py @@ -24,7 +24,7 @@ def test_neon_extension(neon_env_builder: NeonEnvBuilder): # IMPORTANT: # If the version has changed, the test should be updated. # Ensure that the default version is also updated in the neon.control file - assert cur.fetchone() == ("1.3",) + assert cur.fetchone() == ("1.4",) cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE") res = cur.fetchall() log.info(res) @@ -48,10 +48,10 @@ def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder): # IMPORTANT: # If the version has changed, the test should be updated. # Ensure that the default version is also updated in the neon.control file - assert cur.fetchone() == ("1.3",) + assert cur.fetchone() == ("1.4",) cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE") all_versions = ["1.4", "1.3", "1.2", "1.1", "1.0"] - current_version = "1.3" + current_version = "1.4" for idx, begin_version in enumerate(all_versions): for target_version in all_versions[idx + 1 :]: if current_version != begin_version: diff --git a/test_runner/regress/test_oid_overflow.py b/test_runner/regress/test_oid_overflow.py new file mode 100644 index 0000000000..a94ae99ed9 --- /dev/null +++ b/test_runner/regress/test_oid_overflow.py @@ -0,0 +1,45 @@ +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder + + +def test_oid_overflow(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + + endpoint = env.endpoints.create_start("main") + + conn = endpoint.connect() + cur = conn.cursor() + + cur.execute("CREATE EXTENSION neon_test_utils") + + cur.execute("CREATE TABLE t1(x integer)") + cur.execute("INSERT INTO t1 values (1)") + cur.execute("CREATE TABLE t2(x integer)") + cur.execute("INSERT INTO t2 values (2)") + + cur.execute("SELECT x from t1") + assert cur.fetchone() == (1,) + cur.execute("SELECT x from t2") + assert cur.fetchone() == (2,) + + cur.execute("VACUUM FULL t1") + cur.execute("VACUUM FULL t1") + cur.execute("vacuum pg_class") + cur.execute("SELECT relfilenode FROM pg_class where relname='t1'") + oid = cur.fetchall()[0][0] + log.info(f"t1.relfilenode={oid}") + + cur.execute("set statement_timeout=0") + cur.execute(f"select test_consume_oids({oid-1})") + cur.execute("VACUUM FULL t2") + + cur.execute("SELECT relfilenode FROM pg_class where relname='t2'") + oid = cur.fetchall()[0][0] + log.info(f"t2.relfilenode={oid}") + + cur.execute("SELECT clear_buffer_cache()") + + cur.execute("SELECT x from t1") + assert cur.fetchone() == (1,) + cur.execute("SELECT x from t2") + assert cur.fetchone() == (2,) diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 4a25dfd874..c8249bb2ce 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -764,7 +764,9 @@ def test_layer_download_timeouted(neon_env_builder: NeonEnvBuilder): """ Pause using a pausable_failpoint longer than the client timeout to simulate the timeout happening. """ - neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + # running this test is not reliable against REAL_S3, because operations can + # take longer than 1s we want to use as a timeout + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) assert isinstance(neon_env_builder.pageserver_remote_storage, S3Storage) neon_env_builder.pageserver_remote_storage.custom_timeout = "1s" diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index caeae7fd15..28dbf40bed 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -1,8 +1,5 @@ -import subprocess -from pathlib import Path from typing import Optional -import toml from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, @@ -13,67 +10,6 @@ from fixtures.pageserver.http import PageserverHttpClient from fixtures.utils import wait_until -def test_pageserver_init_node_id(neon_simple_env: NeonEnv, neon_binpath: Path): - """ - NB: The neon_local doesn't use `--init` mode anymore, but our production - deployment still does => https://github.com/neondatabase/aws/pull/1322 - """ - workdir = neon_simple_env.pageserver.workdir - pageserver_config = workdir / "pageserver.toml" - pageserver_bin = neon_binpath / "pageserver" - - def run_pageserver(args): - return subprocess.run( - [str(pageserver_bin), "-D", str(workdir), *args], - check=False, - universal_newlines=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - - neon_simple_env.pageserver.stop() - - with open(neon_simple_env.pageserver.config_toml_path, "r") as f: - ps_config = toml.load(f) - - required_config_keys = [ - "pg_distrib_dir", - "listen_pg_addr", - "listen_http_addr", - "pg_auth_type", - "http_auth_type", - # TODO: only needed for NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM in https://github.com/neondatabase/neon/pull/7748 - # "tenant_config", - ] - required_config_overrides = [ - f"--config-override={toml.dumps({k: ps_config[k]})}" for k in required_config_keys - ] - - pageserver_config.unlink() - - bad_init = run_pageserver(["--init", *required_config_overrides]) - assert ( - bad_init.returncode == 1 - ), "pageserver should not be able to init new config without the node id" - assert 'missing config value "id"' in bad_init.stderr - assert not pageserver_config.exists(), "config file should not be created after init error" - - good_init_cmd = [ - "--init", - f"--config-override=id={ps_config['id']}", - *required_config_overrides, - ] - completed_init = run_pageserver(good_init_cmd) - assert ( - completed_init.returncode == 0 - ), "pageserver should be able to create a new config with the node id given" - assert pageserver_config.exists(), "config file should be created successfully" - - bad_reinit = run_pageserver(good_init_cmd) - assert bad_reinit.returncode == 1, "pageserver refuses to init if already exists" - assert "config file already exists" in bad_reinit.stderr - - def check_client(env: NeonEnv, client: PageserverHttpClient): pg_version = env.pg_version initial_tenant = env.initial_tenant diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 7ce38c5c3c..041942cda3 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -22,7 +22,6 @@ from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, PgBin, - StorageScrubber, generate_uploads_and_deletions, ) from fixtures.pageserver.common_types import parse_layer_file_name @@ -215,7 +214,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): # Having written a mixture of generation-aware and legacy index_part.json, # ensure the scrubber handles the situation as expected. - metadata_summary = StorageScrubber(neon_env_builder).scan_metadata() + metadata_summary = env.storage_scrubber.scan_metadata() assert metadata_summary["tenant_count"] == 1 # Scrubber should have seen our timeline assert metadata_summary["timeline_count"] == 1 assert metadata_summary["timeline_shard_count"] == 1 diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py index cea35a6acb..24a37b04ec 100644 --- a/test_runner/regress/test_pageserver_metric_collection.py +++ b/test_runner/regress/test_pageserver_metric_collection.py @@ -58,7 +58,6 @@ def test_metric_collection( metric_collection_interval="1s" metric_collection_endpoint="{metric_collection_endpoint}" metric_collection_bucket={remote_storage_to_toml_inline_table(neon_env_builder.pageserver_remote_storage)} - cached_metric_collection_interval="0s" synthetic_size_calculation_interval="3s" """ @@ -216,7 +215,6 @@ def test_metric_collection_cleans_up_tempfile( neon_env_builder.pageserver_config_override = f""" metric_collection_interval="1s" metric_collection_endpoint="{metric_collection_endpoint}" - cached_metric_collection_interval="0s" synthetic_size_calculation_interval="3s" """ diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 0416078ebc..58d61eab0d 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -7,7 +7,7 @@ from typing import Any, Dict, Optional import pytest from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, StorageScrubber +from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.utils import ( assert_prefix_empty, @@ -234,7 +234,7 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver, # Having done a bunch of attach/detach cycles, we will have generated some index garbage: check # that the scrubber sees it and cleans it up. We do this before the final attach+validate pass, # to also validate that the scrubber isn't breaking anything. - gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=1) + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1) assert gc_summary["remote_storage_errors"] == 0 assert gc_summary["indices_deleted"] > 0 @@ -555,7 +555,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): # Scrub the remote storage # ======================== # This confirms that the scrubber isn't upset by the presence of the heatmap - StorageScrubber(neon_env_builder).scan_metadata() + env.storage_scrubber.scan_metadata() # Detach secondary and delete tenant # =================================== diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 54b493ec70..d5b5ac3f75 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -117,7 +117,7 @@ def post_checks(env: NeonEnv, test_output_dir: Path, db_name: str, endpoint: End # Run the main PostgreSQL regression tests, in src/test/regress. # -@pytest.mark.timeout(600) +@pytest.mark.timeout(900) # Contains many sub-tests, is slow in debug builds @pytest.mark.parametrize("shard_count", [None, 4]) def test_pg_regress( neon_env_builder: NeonEnvBuilder, @@ -186,6 +186,7 @@ def test_pg_regress( # Run the PostgreSQL "isolation" tests, in src/test/isolation. # +@pytest.mark.timeout(600) # Contains many sub-tests, is slow in debug builds @pytest.mark.parametrize("shard_count", [None, 4]) def test_isolation( neon_env_builder: NeonEnvBuilder, diff --git a/test_runner/regress/test_read_trace.py b/test_runner/regress/test_read_trace.py deleted file mode 100644 index cc5853b727..0000000000 --- a/test_runner/regress/test_read_trace.py +++ /dev/null @@ -1,39 +0,0 @@ -from contextlib import closing - -from fixtures.common_types import Lsn -from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.pageserver.utils import wait_for_last_record_lsn -from fixtures.utils import query_scalar - - -# This test demonstrates how to collect a read trace. It's useful until -# it gets replaced by a test that actually does stuff with the trace. -# -# Additionally, tests that pageserver is able to create tenants with custom configs. -def test_read_request_tracing(neon_env_builder: NeonEnvBuilder): - neon_env_builder.num_safekeepers = 1 - env = neon_env_builder.init_start( - initial_tenant_conf={ - "trace_read_requests": "true", - } - ) - - tenant_id = env.initial_tenant - timeline_id = env.initial_timeline - endpoint = env.endpoints.create_start("main") - - with closing(endpoint.connect()) as conn: - with conn.cursor() as cur: - cur.execute("create table t (i integer);") - cur.execute(f"insert into t values (generate_series(1,{10000}));") - cur.execute("select count(*) from t;") - current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) - # wait until pageserver receives that data - pageserver_http = env.pageserver.http_client() - wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) - - # Stop postgres so we drop the connection and flush the traces - endpoint.stop() - - trace_path = env.pageserver.workdir / "traces" / str(tenant_id) / str(timeline_id) - assert trace_path.exists() diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index fac7fe9dee..09f941f582 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -577,7 +577,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( > 0 ) - wait_until(20, 0.1, assert_compacted_and_uploads_queued) + wait_until(200, 0.1, assert_compacted_and_uploads_queued) # Regardless, give checkpoint some time to block for good. # Not strictly necessary, but might help uncover failure modes in the future. @@ -619,7 +619,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( ) # timeline deletion should be unblocking checkpoint ops - checkpoint_thread.join(2.0) + checkpoint_thread.join(20.0) assert not checkpoint_thread.is_alive() # Just to be sure, unblock ongoing uploads. If the previous assert was incorrect, or the prometheus metric broken, diff --git a/test_runner/regress/test_replica_start.py b/test_runner/regress/test_replica_start.py index 17d476a8a6..0d95109d6b 100644 --- a/test_runner/regress/test_replica_start.py +++ b/test_runner/regress/test_replica_start.py @@ -210,7 +210,11 @@ def test_replica_start_wait_subxids_finish(neon_simple_env: NeonEnv): # Start it in a separate thread, so that we can do other stuff while it's # blocked waiting for the startup to finish. wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) - secondary = env.endpoints.new_replica(origin=primary, endpoint_id="secondary") + secondary = env.endpoints.new_replica( + origin=primary, + endpoint_id="secondary", + config_lines=["neon.running_xacts_overflow_policy='wait'"], + ) start_secondary_thread = threading.Thread(target=secondary.start) start_secondary_thread.start() @@ -644,3 +648,43 @@ def test_replica_start_with_prepared_xacts_with_many_subxacts(neon_simple_env: N wait_replica_caughtup(primary, secondary) secondary_cur.execute("select count(*) from t") assert secondary_cur.fetchone() == (200001,) + + +def test_replica_start_with_too_many_unused_xids(neon_simple_env: NeonEnv): + """ + Test the CLOG-scanning mechanism at hot standby startup in the presence of + large number of unsued XIDs, caused by XID alignment and frequent primary restarts + """ + n_restarts = 50 + + # Initialize the primary and a test table + env = neon_simple_env + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + with primary.cursor() as primary_cur: + primary_cur.execute("create table t(pk serial primary key, payload integer)") + + for _ in range(n_restarts): + with primary.cursor() as primary_cur: + primary_cur.execute("insert into t (payload) values (0)") + # restart primary + primary.stop("immediate") + primary.start() + + # Wait for the WAL to be flushed + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + + # stop primary to check that we can start replica without it + primary.stop(mode="immediate") + + # Create a replica. It should start up normally, because of ignore policy + # mechanism. + secondary = env.endpoints.new_replica_start( + origin=primary, + endpoint_id="secondary", + config_lines=["neon.running_xacts_overflow_policy='ignore'"], + ) + + # Check that replica see all changes + with secondary.cursor() as secondary_cur: + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (n_restarts,) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 4471237900..90c6e26d01 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -12,7 +12,6 @@ from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, StorageControllerApiException, - StorageScrubber, last_flush_lsn_upload, tenant_get_shards, wait_for_last_flush_lsn, @@ -128,7 +127,7 @@ def test_sharding_smoke( # Check the scrubber isn't confused by sharded content, then disable # it during teardown because we'll have deleted by then - StorageScrubber(neon_env_builder).scan_metadata() + env.storage_scrubber.scan_metadata() neon_env_builder.scrub_on_exit = False env.storage_controller.pageserver_api().tenant_delete(tenant_id) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index d37f7aae3d..741f16685e 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -1611,3 +1611,91 @@ def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder): env.storage_controller.cancel_node_drain(ps_id_to_drain) env.storage_controller.poll_node_status(ps_id_to_drain, "Active", max_attempts=6, backoff=2) + + +@pytest.mark.parametrize("while_offline", [True, False]) +def test_storage_controller_node_deletion( + neon_env_builder: NeonEnvBuilder, + compute_reconfigure_listener: ComputeReconfigure, + while_offline: bool, +): + """ + Test that deleting a node works & properly reschedules everything that was on the node. + """ + neon_env_builder.num_pageservers = 3 + env = neon_env_builder.init_configs() + env.start() + + tenant_count = 10 + shard_count_per_tenant = 8 + tenant_ids = [] + for _ in range(0, tenant_count): + tid = TenantId.generate() + tenant_ids.append(tid) + env.neon_cli.create_tenant( + tid, placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant + ) + + victim = env.pageservers[-1] + + # The procedure a human would follow is: + # 1. Mark pageserver scheduling=pause + # 2. Mark pageserver availability=offline to trigger migrations away from it + # 3. Wait for attachments to all move elsewhere + # 4. Call deletion API + # 5. Stop the node. + + env.storage_controller.node_configure(victim.id, {"scheduling": "Pause"}) + + if while_offline: + victim.stop(immediate=True) + env.storage_controller.node_configure(victim.id, {"availability": "Offline"}) + + def assert_shards_migrated(): + counts = get_node_shard_counts(env, tenant_ids) + elsewhere = sum(v for (k, v) in counts.items() if k != victim.id) + log.info(f"Shards on nodes other than on victim: {elsewhere}") + assert elsewhere == tenant_count * shard_count_per_tenant + + wait_until(30, 1, assert_shards_migrated) + + log.info(f"Deleting pageserver {victim.id}") + env.storage_controller.node_delete(victim.id) + + if not while_offline: + + def assert_victim_evacuated(): + counts = get_node_shard_counts(env, tenant_ids) + count = counts[victim.id] + log.info(f"Shards on node {victim.id}: {count}") + assert count == 0 + + wait_until(30, 1, assert_victim_evacuated) + + # The node should be gone from the list API + assert victim.id not in [n["id"] for n in env.storage_controller.node_list()] + + # No tenants should refer to the node in their intent + for tenant_id in tenant_ids: + describe = env.storage_controller.tenant_describe(tenant_id) + for shard in describe["shards"]: + assert shard["node_attached"] != victim.id + assert victim.id not in shard["node_secondary"] + + # Reconciles running during deletion should all complete + # FIXME: this currently doesn't work because the deletion schedules shards without a proper ScheduleContext, resulting + # in states that background_reconcile wants to optimize, but can't proceed with migrations yet because this is a short3 + # test that hasn't uploaded any heatmaps for secondaries. + # In the interim, just do a reconcile_all to enable the consistency check. + # env.storage_controller.reconcile_until_idle() + env.storage_controller.reconcile_all() + + # Controller should pass its own consistency checks + env.storage_controller.consistency_check() + + # The node should stay gone across a restart + env.storage_controller.stop() + env.storage_controller.start() + assert victim.id not in [n["id"] for n in env.storage_controller.node_list()] + env.storage_controller.reconcile_all() # FIXME: workaround for optimizations happening on startup, see FIXME above. + env.storage_controller.consistency_check() diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 35ae61c380..7c411a6b84 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -1,14 +1,20 @@ import os +import pprint import shutil +import threading +import time +from concurrent.futures import ThreadPoolExecutor from typing import Optional import pytest from fixtures.common_types import TenantId, TenantShardId, TimelineId +from fixtures.log_helper import log from fixtures.neon_fixtures import ( + NeonEnv, NeonEnvBuilder, - StorageScrubber, ) from fixtures.remote_storage import S3Storage, s3_storage +from fixtures.utils import wait_until from fixtures.workload import Workload @@ -60,8 +66,7 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: output_path = neon_env_builder.test_output_dir / "snapshot" os.makedirs(output_path) - scrubber = StorageScrubber(neon_env_builder) - scrubber.tenant_snapshot(tenant_id, output_path) + env.storage_scrubber.tenant_snapshot(tenant_id, output_path) assert len(os.listdir(output_path)) > 0 @@ -111,6 +116,14 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: workload.validate() +def drop_local_state(env: NeonEnv, tenant_id: TenantId): + env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"}) + env.storage_controller.reconcile_until_idle() + + env.storage_controller.tenant_policy_update(tenant_id, {"placement": {"Attached": 0}}) + env.storage_controller.reconcile_until_idle() + + @pytest.mark.parametrize("shard_count", [None, 4]) def test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): neon_env_builder.enable_pageserver_remote_storage(s3_storage()) @@ -133,28 +146,312 @@ def test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: Opt # For each cycle, detach and attach the tenant to bump the generation, and do some writes to generate uploads for _i in range(0, n_cycles): - env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"}) - env.storage_controller.reconcile_until_idle() - - env.storage_controller.tenant_policy_update(tenant_id, {"placement": {"Attached": 0}}) - env.storage_controller.reconcile_until_idle() + drop_local_state(env, tenant_id) # This write includes remote upload, will generate an index in this generation workload.write_rows(1) # With a high min_age, the scrubber should decline to delete anything - gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=3600) + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=3600) assert gc_summary["remote_storage_errors"] == 0 assert gc_summary["indices_deleted"] == 0 # If targeting a different tenant, the scrubber shouldn't do anything - gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc( + gc_summary = env.storage_scrubber.pageserver_physical_gc( min_age_secs=1, tenant_ids=[TenantId.generate()] ) assert gc_summary["remote_storage_errors"] == 0 assert gc_summary["indices_deleted"] == 0 # With a low min_age, the scrubber should go ahead and clean up all but the latest 2 generations - gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=1) + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1) assert gc_summary["remote_storage_errors"] == 0 assert gc_summary["indices_deleted"] == (expect_indices_per_shard - 2) * shard_count + + +@pytest.mark.parametrize("shard_count", [None, 2]) +def test_scrubber_physical_gc_ancestors( + neon_env_builder: NeonEnvBuilder, shard_count: Optional[int] +): + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.num_pageservers = 2 + + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant( + tenant_id, + timeline_id, + shard_count=shard_count, + conf={ + # Small layers and low compaction thresholds, so that when we split we can expect some to + # be dropped by child shards + "checkpoint_distance": f"{1024 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{1024 * 1024}", + "image_creation_threshold": "2", + "image_layer_creation_check_threshold": "0", + # Disable background compaction, we will do it explicitly + "compaction_period": "0s", + # No PITR, so that as soon as child shards generate an image layer, it covers ancestor deltas + # and makes them GC'able + "pitr_interval": "0s", + }, + ) + + # Make sure the original shard has some layers + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(100) + + new_shard_count = 4 + assert shard_count is None or new_shard_count > shard_count + shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count) + env.storage_controller.reconcile_until_idle() # Move shards to their final locations immediately + + # Make sure child shards have some layers. Do not force upload, because the test helper calls checkpoint, which + # compacts, and we only want to do tha explicitly later in the test. + workload.write_rows(100, upload=False) + for shard in shards: + ps = env.get_tenant_pageserver(shard) + log.info(f"Waiting for shard {shard} on pageserver {ps.id}") + ps.http_client().timeline_checkpoint( + shard, timeline_id, compact=False, wait_until_uploaded=True + ) + + # Flush deletion queue so that we don't leave any orphan layers in the parent that will confuse subsequent checks: once + # a shard is split, any layers in its prefix that aren't referenced by a child will be considered GC'able, even + # if they were logically deleted before the shard split, just not physically deleted yet because of the queue. + for ps in env.pageservers: + ps.http_client().deletion_queue_flush(execute=True) + + # Before compacting, all the layers in the ancestor should still be referenced by the children: the scrubber + # should not erase any ancestor layers + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + assert gc_summary["ancestor_layers_deleted"] == 0 + + # Write some data and compact: compacting, some ancestor layers should no longer be needed by children + # (the compaction is part of the checkpoint that Workload does for us) + workload.churn_rows(100) + workload.churn_rows(100) + workload.churn_rows(100) + for shard in shards: + ps = env.get_tenant_pageserver(shard) + ps.http_client().timeline_compact(shard, timeline_id) + ps.http_client().timeline_gc(shard, timeline_id, 0) + + # We will use a min_age_secs=1 threshold for deletion, let it pass + time.sleep(2) + + # Our time threshold should be respected: check that with a high threshold we delete nothing + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=3600, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + assert gc_summary["ancestor_layers_deleted"] == 0 + + # Now run with a low time threshold: deletions of ancestor layers should be executed + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + assert gc_summary["ancestor_layers_deleted"] > 0 + + # We deleted some layers: now check we didn't corrupt the tenant by doing so. Detach and + # attach it, to drop any local state, then check it's still readable. + workload.stop() + drop_local_state(env, tenant_id) + + workload.validate() + + +def test_scrubber_physical_gc_ancestors_split(neon_env_builder: NeonEnvBuilder): + """ + Exercise ancestor GC while a tenant is partly split: this test ensures that if we have some child shards + which don't reference an ancestor, but some child shards that don't exist yet, then we do not incorrectly + GC any ancestor layers. + """ + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.num_pageservers = 2 + + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + initial_shard_count = 2 + env.neon_cli.create_tenant( + tenant_id, + timeline_id, + shard_count=initial_shard_count, + conf={ + # Small layers and low compaction thresholds, so that when we split we can expect some to + # be dropped by child shards + "checkpoint_distance": f"{1024 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{1024 * 1024}", + "image_creation_threshold": "2", + "image_layer_creation_check_threshold": "0", + # Disable background compaction, we will do it explicitly + "compaction_period": "0s", + # No PITR, so that as soon as child shards generate an image layer, it covers ancestor deltas + # and makes them GC'able + "pitr_interval": "0s", + }, + ) + + unstuck = threading.Event() + + def stuck_split(): + # Pause our shard split after the first shard but before the second, such that when we run + # the scrub, the S3 bucket contains shards 0002, 0101, 0004, 0204 (but not 0104, 0304). + env.storage_controller.configure_failpoints( + ("shard-split-post-remote-sleep", "return(3600000)") + ) + try: + split_response = env.storage_controller.tenant_shard_split(tenant_id, shard_count=4) + except Exception as e: + log.info(f"Split failed with {e}") + else: + if not unstuck.is_set(): + raise RuntimeError(f"Split succeeded unexpectedly ({split_response})") + + with ThreadPoolExecutor(max_workers=1) as threads: + log.info("Starting hung shard split") + stuck_split_fut = threads.submit(stuck_split) + + # Let the controller reach the failpoint + wait_until( + 10, + 1, + lambda: env.storage_controller.assert_log_contains( + 'failpoint "shard-split-post-remote-sleep": sleeping' + ), + ) + + # Run compaction on the new child shards, so that they drop some refs to their parent + child_shards = [ + TenantShardId(tenant_id, 0, 4), + TenantShardId(tenant_id, 2, 4), + ] + log.info("Compacting first two children") + for child in child_shards: + env.get_tenant_pageserver( + TenantShardId(tenant_id, 0, initial_shard_count) + ).http_client().timeline_compact(child, timeline_id) + + # Check that the other child shards weren't created + assert env.get_tenant_pageserver(TenantShardId(tenant_id, 1, 4)) is None + assert env.get_tenant_pageserver(TenantShardId(tenant_id, 3, 4)) is None + + # Run scrubber: it should not incorrectly interpret the **04 shards' lack of refs to all + # ancestor layers as a reason to GC them, because it should realize that a split is in progress. + # (GC requires that controller does not indicate split in progress, and that if we see the highest + # shard count N, then there are N shards present with that shard count). + gc_output = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full") + log.info(f"Ran physical GC partway through split: {gc_output}") + assert gc_output["ancestor_layers_deleted"] == 0 + assert gc_output["remote_storage_errors"] == 0 + assert gc_output["controller_api_errors"] == 0 + + # Storage controller shutdown lets our split request client complete + log.info("Stopping storage controller") + unstuck.set() + env.storage_controller.allowed_errors.append(".*Timed out joining HTTP server task.*") + env.storage_controller.stop() + stuck_split_fut.result() + + # Restart the controller and retry the split with the failpoint disabled, this should + # complete successfully and result in an S3 state that allows the scrubber to proceed with removing ancestor layers + log.info("Starting & retrying split") + env.storage_controller.start() + env.storage_controller.tenant_shard_split(tenant_id, shard_count=4) + + # The other child shards exist now, we can compact them to drop refs to ancestor + log.info("Compacting second two children") + for child in [ + TenantShardId(tenant_id, 1, 4), + TenantShardId(tenant_id, 3, 4), + ]: + env.get_tenant_pageserver(child).http_client().timeline_compact(child, timeline_id) + + gc_output = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full") + log.info(f"Ran physical GC after split completed: {gc_output}") + assert gc_output["ancestor_layers_deleted"] > 0 + assert gc_output["remote_storage_errors"] == 0 + assert gc_output["controller_api_errors"] == 0 + + +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_scrubber_scan_pageserver_metadata( + neon_env_builder: NeonEnvBuilder, shard_count: Optional[int] +): + """ + Create some layers. Delete an object listed in index. Run scrubber and see if it detects the defect. + """ + + # Use s3_storage so we could test out scrubber. + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.num_pageservers = shard_count if shard_count is not None else 1 + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + + # Create some layers. + + workload = Workload(env, env.initial_tenant, env.initial_timeline) + workload.init() + + for _ in range(3): + workload.write_rows(128) + + for pageserver in env.pageservers: + pageserver.stop() + pageserver.start() + + for _ in range(3): + workload.write_rows(128) + + # Get the latest index for a particular timeline. + + tenant_shard_id = TenantShardId(env.initial_tenant, 0, shard_count if shard_count else 0) + + assert isinstance(env.pageserver_remote_storage, S3Storage) + timeline_path = env.pageserver_remote_storage.timeline_path( + tenant_shard_id, env.initial_timeline + ) + + client = env.pageserver_remote_storage.client + bucket = env.pageserver_remote_storage.bucket_name + objects = client.list_objects_v2(Bucket=bucket, Prefix=f"{timeline_path}/", Delimiter="").get( + "Contents", [] + ) + keys = [obj["Key"] for obj in objects] + index_keys = list(filter(lambda s: s.startswith(f"{timeline_path}/index_part"), keys)) + assert len(index_keys) > 0 + + latest_index_key = env.pageserver_remote_storage.get_latest_index_key(index_keys) + log.info(f"{latest_index_key=}") + + index = env.pageserver_remote_storage.download_index_part(latest_index_key) + + assert len(index.layer_metadata) > 0 + it = iter(index.layer_metadata.items()) + + scan_summary = env.storage_scrubber.scan_metadata() + assert not scan_summary["with_warnings"] + assert not scan_summary["with_errors"] + + # Delete a layer file that is listed in the index. + layer, metadata = next(it) + log.info(f"Deleting {timeline_path}/{layer.to_str()}") + delete_response = client.delete_object( + Bucket=bucket, + Key=f"{timeline_path}/{layer.to_str()}-{metadata.generation:08x}", + ) + log.info(f"delete response: {delete_response}") + + # Check scan summary. Expect it to be a L0 layer so only emit warnings. + scan_summary = env.storage_scrubber.scan_metadata() + log.info(f"{pprint.pformat(scan_summary)}") + assert len(scan_summary["with_warnings"]) > 0 diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 80fb2b55b8..9fb7324fa1 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -1,16 +1,14 @@ import json -from contextlib import closing from typing import Any, Dict -import psycopg2.extras from fixtures.common_types import Lsn -from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, ) from fixtures.pageserver.utils import assert_tenant_state, wait_for_upload from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind from fixtures.utils import wait_until +from fixtures.workload import Workload def test_tenant_config(neon_env_builder: NeonEnvBuilder): @@ -63,25 +61,6 @@ def test_tenant_config(neon_env_builder: NeonEnvBuilder): # check the configuration of the default tenant # it should match global configuration - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - log.info(f"show {env.initial_tenant}") - pscur.execute(f"show {env.initial_tenant}") - res = pscur.fetchone() - assert res is not None - assert all( - i in res.items() - for i in { - "checkpoint_distance": 10000, - "compaction_target_size": 1048576, - "compaction_period": 20, - "compaction_threshold": 10, - "gc_horizon": 67108864, - "gc_period": 60 * 60, - "image_creation_threshold": 3, - "pitr_interval": 604800, # 7 days - }.items() - ), f"Unexpected res: {res}" default_tenant_config = http_client.tenant_config(tenant_id=env.initial_tenant) assert ( not default_tenant_config.tenant_specific_overrides @@ -103,25 +82,6 @@ def test_tenant_config(neon_env_builder: NeonEnvBuilder): } # check the configuration of the new tenant - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - pscur.execute(f"show {tenant}") - res = pscur.fetchone() - log.info(f"res: {res}") - assert res is not None - assert all( - i in res.items() - for i in { - "checkpoint_distance": 20000, - "compaction_target_size": 1048576, - "compaction_period": 20, - "compaction_threshold": 10, - "gc_horizon": 67108864, - "gc_period": 30, - "image_creation_threshold": 3, - "pitr_interval": 604800, - }.items() - ), f"Unexpected res: {res}" new_tenant_config = http_client.tenant_config(tenant_id=tenant) new_specific_config = new_tenant_config.tenant_specific_overrides assert new_specific_config["checkpoint_distance"] == 20000 @@ -166,25 +126,6 @@ def test_tenant_config(neon_env_builder: NeonEnvBuilder): conf=conf_update, ) - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - pscur.execute(f"show {tenant}") - res = pscur.fetchone() - log.info(f"after config res: {res}") - assert res is not None - assert all( - i in res.items() - for i in { - "checkpoint_distance": 15000, - "compaction_target_size": 1048576, - "compaction_period": 80, - "compaction_threshold": 10, - "gc_horizon": 67108864, - "gc_period": 80, - "image_creation_threshold": 2, - "pitr_interval": 604800, - }.items() - ), f"Unexpected res: {res}" updated_tenant_config = http_client.tenant_config(tenant_id=tenant) updated_specific_config = updated_tenant_config.tenant_specific_overrides assert updated_specific_config["checkpoint_distance"] == 15000 @@ -222,25 +163,6 @@ def test_tenant_config(neon_env_builder: NeonEnvBuilder): env.pageserver.stop() env.pageserver.start() - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - pscur.execute(f"show {tenant}") - res = pscur.fetchone() - log.info(f"after restart res: {res}") - assert res is not None - assert all( - i in res.items() - for i in { - "checkpoint_distance": 15000, - "compaction_target_size": 1048576, - "compaction_period": 80, - "compaction_threshold": 10, - "gc_horizon": 67108864, - "gc_period": 80, - "image_creation_threshold": 2, - "pitr_interval": 604800, - }.items() - ), f"Unexpected res: {res}" restarted_tenant_config = http_client.tenant_config(tenant_id=tenant) assert ( restarted_tenant_config == updated_tenant_config @@ -283,19 +205,10 @@ def test_tenant_config(neon_env_builder: NeonEnvBuilder): env.pageserver.stop() env.pageserver.start() - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - pscur.execute(f"show {tenant}") - res = pscur.fetchone() - log.info(f"after restart res: {res}") - assert res is not None - assert all( - i in res.items() - for i in { - "compaction_period": 20, - "pitr_interval": 60, - }.items() - ), f"Unexpected res: {res}" + restarted_final_tenant_config = http_client.tenant_config(tenant_id=tenant) + assert ( + restarted_final_tenant_config == final_tenant_config + ), "Updated config should not change after the restart" def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder): @@ -353,6 +266,13 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold( (tenant_id, timeline_id) = env.initial_tenant, env.initial_timeline ps_http = env.pageserver.http_client() + # When we evict/download layers, we will use this Workload to generate getpage requests + # that touch some layers, as otherwise the pageserver doesn't report totally unused layers + # as problems when they have short residence duration. + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(100) + def get_metric(): metrics = ps_http.get_metrics() metric = metrics.query_one( @@ -373,6 +293,7 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold( assert default_value == "1day" ps_http.download_all_layers(tenant_id, timeline_id) + workload.validate() ps_http.evict_all_layers(tenant_id, timeline_id) metric = get_metric() assert int(metric.value) > 0, "metric is updated" @@ -393,6 +314,7 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold( assert int(metric.value) == 0 ps_http.download_all_layers(tenant_id, timeline_id) + workload.validate() ps_http.evict_all_layers(tenant_id, timeline_id) metric = get_metric() assert int(metric.labels["low_threshold_secs"]) == 2 * 24 * 60 * 60 @@ -406,6 +328,7 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold( assert int(metric.value) == 0, "value resets if label changes" ps_http.download_all_layers(tenant_id, timeline_id) + workload.validate() ps_http.evict_all_layers(tenant_id, timeline_id) metric = get_metric() assert int(metric.labels["low_threshold_secs"]) == 2 * 60 * 60 diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 1d7c8b8e31..6d20b3d0de 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -5,7 +5,6 @@ from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, - StorageScrubber, wait_for_last_flush_lsn, ) from fixtures.pageserver.http import PageserverApiException @@ -325,7 +324,6 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder) remote_storage_kind = RemoteStorageKind.MOCK_S3 neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) - scrubber = StorageScrubber(neon_env_builder) env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) ps_http = env.pageserver.http_client() @@ -340,7 +338,7 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder) wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn) env.stop() - result = scrubber.scan_metadata() + result = env.storage_scrubber.scan_metadata() assert result["with_warnings"] == [] env.start() @@ -348,5 +346,5 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder) ps_http.tenant_delete(tenant_id) env.stop() - scrubber.scan_metadata() + env.storage_scrubber.scan_metadata() assert result["with_warnings"] == [] diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 04b3fdd80f..0ebf714de0 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -45,17 +45,10 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv): # Failure to write a config to local disk makes the pageserver assume that local disk is bad and abort the process pageserver_http.configure_failpoints(("tenant-config-before-write", "return")) - # Storage controller will see a torn TCP connection when the crash point is reached, and follow an unclean 500 error path - neon_simple_env.storage_controller.allowed_errors.extend( - [ - ".*Reconcile not done yet while creating tenant.*", - ".*Reconcile error: receive body: error sending request.*", - ".*Error processing HTTP request: InternalServerError.*", - ] - ) + tenant_id = TenantId.generate() - with pytest.raises(Exception, match="error sending request"): - _ = neon_simple_env.neon_cli.create_tenant() + with pytest.raises(requests.exceptions.ConnectionError, match="Connection aborted"): + neon_simple_env.pageserver.http_client().tenant_attach(tenant_id=tenant_id, generation=1) # Any files left behind on disk during failed creation do not prevent # a retry from succeeding. Restart pageserver with no failpoints. diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py index 7bf49a0874..b62398d427 100644 --- a/test_runner/regress/test_threshold_based_eviction.py +++ b/test_runner/regress/test_threshold_based_eviction.py @@ -52,8 +52,8 @@ def test_threshold_based_eviction( "kind": "NoEviction" } - eviction_threshold = 5 - eviction_period = 1 + eviction_threshold = 10 + eviction_period = 2 ps_http.set_tenant_config( tenant_id, { diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 606ce203cd..38f8dfa885 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -1,5 +1,7 @@ import datetime import enum +import threading +import time from concurrent.futures import ThreadPoolExecutor from queue import Empty, Queue from threading import Barrier @@ -9,14 +11,17 @@ import pytest from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( + LogCursor, NeonEnvBuilder, PgBin, + flush_ep_to_pageserver, wait_for_last_flush_lsn, ) from fixtures.pageserver.http import HistoricLayerInfo, PageserverApiException -from fixtures.pageserver.utils import wait_timeline_detail_404 -from fixtures.remote_storage import LocalFsStorage -from fixtures.utils import assert_pageserver_backups_equal +from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_timeline_detail_404 +from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind +from fixtures.utils import assert_pageserver_backups_equal, wait_until +from requests import ReadTimeout def by_end_lsn(info: HistoricLayerInfo) -> Lsn: @@ -160,7 +165,7 @@ def test_ancestor_detach_branched_from( ) all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id) - assert all_reparented == set() + assert all_reparented == [] if restart_after: env.pageserver.stop() @@ -269,7 +274,7 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder): after = env.neon_cli.create_branch("after", "main", env.initial_tenant, ancestor_start_lsn=None) all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id) - assert all_reparented == {reparented, same_branchpoint} + assert set(all_reparented) == {reparented, same_branchpoint} env.pageserver.quiesce_tenants() @@ -529,7 +534,7 @@ def test_compaction_induced_by_detaches_in_history( for _, timeline_id in skip_main: reparented = client.detach_ancestor(env.initial_tenant, timeline_id) - assert reparented == set(), "we have no earlier branches at any level" + assert reparented == [], "we have no earlier branches at any level" post_detach_l0s = list(filter(lambda x: x.l0, delta_layers(branch_timeline_id))) assert len(post_detach_l0s) == 5, "should had inherited 4 L0s, have 5 in total" @@ -559,23 +564,49 @@ def test_compaction_induced_by_detaches_in_history( assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, set()) -def test_timeline_ancestor_errors(neon_env_builder: NeonEnvBuilder): - env = neon_env_builder.init_start() - env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) +@pytest.mark.parametrize("sharded", [True, False]) +def test_timeline_ancestor_detach_idempotent_success( + neon_env_builder: NeonEnvBuilder, sharded: bool +): + shards = 2 if sharded else 1 - client = env.pageserver.http_client() + neon_env_builder.num_pageservers = shards + env = neon_env_builder.init_start(initial_tenant_shard_count=shards if sharded else None) - with pytest.raises(PageserverApiException, match=".* no ancestors") as info: - client.detach_ancestor(env.initial_tenant, env.initial_timeline) - assert info.value.status_code == 409 + pageservers = dict((int(p.id), p) for p in env.pageservers) + + for ps in pageservers.values(): + ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + if sharded: + # FIXME: should this be in the neon_env_builder.init_start? + env.storage_controller.reconcile_until_idle() + client = env.storage_controller.pageserver_api() + else: + client = env.pageserver.http_client() first_branch = env.neon_cli.create_branch("first_branch") - second_branch = env.neon_cli.create_branch("second_branch", ancestor_branch_name="first_branch") - # funnily enough this does not have a prefix - with pytest.raises(PageserverApiException, match="too many ancestors") as info: - client.detach_ancestor(env.initial_tenant, second_branch) - assert info.value.status_code == 400 + _ = env.neon_cli.create_branch("second_branch", ancestor_branch_name="first_branch") + + # these two will be reparented, and they should be returned in stable order + # from pageservers OR otherwise there will be an `error!` logging from + # storage controller + reparented1 = env.neon_cli.create_branch("first_reparented", ancestor_branch_name="main") + reparented2 = env.neon_cli.create_branch("second_reparented", ancestor_branch_name="main") + + first_reparenting_response = client.detach_ancestor(env.initial_tenant, first_branch) + assert set(first_reparenting_response) == {reparented1, reparented2} + + # FIXME: this should be done by the http req handler + for ps in pageservers.values(): + ps.quiesce_tenants() + + for _ in range(5): + # once completed, we can retry this how many times + assert ( + client.detach_ancestor(env.initial_tenant, first_branch) == first_reparenting_response + ) client.tenant_delete(env.initial_tenant) @@ -584,10 +615,446 @@ def test_timeline_ancestor_errors(neon_env_builder: NeonEnvBuilder): assert e.value.status_code == 404 +@pytest.mark.parametrize("sharded", [True, False]) +def test_timeline_ancestor_detach_errors(neon_env_builder: NeonEnvBuilder, sharded: bool): + # the test is split from test_timeline_ancestor_detach_idempotent_success as only these error cases should create "request was dropped before completing", + # given the current first error handling + shards = 2 if sharded else 1 + + neon_env_builder.num_pageservers = shards + env = neon_env_builder.init_start(initial_tenant_shard_count=shards if sharded else None) + + pageservers = dict((int(p.id), p) for p in env.pageservers) + + for ps in pageservers.values(): + ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + ps.allowed_errors.append( + ".* WARN .* path=/v1/tenant/.*/timeline/.*/detach_ancestor request_id=.*: request was dropped before completing" + ) + + client = ( + env.pageserver.http_client() if not sharded else env.storage_controller.pageserver_api() + ) + + with pytest.raises(PageserverApiException, match=".* no ancestors") as info: + client.detach_ancestor(env.initial_tenant, env.initial_timeline) + assert info.value.status_code == 409 + + _ = env.neon_cli.create_branch("first_branch") + + second_branch = env.neon_cli.create_branch("second_branch", ancestor_branch_name="first_branch") + + # funnily enough this does not have a prefix + with pytest.raises(PageserverApiException, match="too many ancestors") as info: + client.detach_ancestor(env.initial_tenant, second_branch) + assert info.value.status_code == 400 + + +def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): + """ + Sharded timeline detach ancestor; 4 nodes: 1 stuck, 1 restarted, 2 normal. + + Stuck node gets stuck on a pause failpoint for first storage controller request. + Restarted node remains stuck until explicit restart from test code. + + We retry the request until storage controller gets 200 OK from all nodes. + """ + branch_name = "soon_detached" + shard_count = 4 + neon_env_builder.num_pageservers = shard_count + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) + + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + for ps in env.pageservers: + ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + # FIXME: should this be in the neon_env_builder.init_start? + env.storage_controller.reconcile_until_idle() + # as we will stop a node, make sure there is no clever rebalancing + env.storage_controller.tenant_policy_update(env.initial_tenant, body={"scheduling": "Stop"}) + env.storage_controller.allowed_errors.append(".*: Scheduling is disabled by policy Stop .*") + + shards = env.storage_controller.locate(env.initial_tenant) + + utilized_pageservers = {x["node_id"] for x in shards} + assert len(utilized_pageservers) > 1, "all shards got placed on single pageserver?" + + branch_timeline_id = env.neon_cli.create_branch(branch_name, tenant_id=env.initial_tenant) + + with env.endpoints.create_start(branch_name, tenant_id=env.initial_tenant) as ep: + ep.safe_psql( + "create table foo as select 1::bigint, i::bigint from generate_series(1, 10000) v(i)" + ) + lsn = flush_ep_to_pageserver(env, ep, env.initial_tenant, branch_timeline_id) + + pageservers = dict((int(p.id), p) for p in env.pageservers) + + for shard_info in shards: + node_id = int(shard_info["node_id"]) + shard_id = shard_info["shard_id"] + detail = pageservers[node_id].http_client().timeline_detail(shard_id, branch_timeline_id) + + assert Lsn(detail["last_record_lsn"]) >= lsn + assert Lsn(detail["initdb_lsn"]) < lsn + assert TimelineId(detail["ancestor_timeline_id"]) == env.initial_timeline + + # make one of the nodes get stuck, but continue the initial operation + # make another of the nodes get stuck, then restart + + stuck = pageservers[int(shards[0]["node_id"])] + log.info(f"stuck pageserver is id={stuck.id}") + stuck_http = stuck.http_client() + stuck_http.configure_failpoints( + ("timeline-detach-ancestor::before_starting_after_locking_pausable", "pause") + ) + + restarted = pageservers[int(shards[1]["node_id"])] + log.info(f"restarted pageserver is id={restarted.id}") + # this might be hit; see `restart_restarted` + restarted.allowed_errors.append(".*: Cancelled request finished with an error: ShuttingDown") + assert restarted.id != stuck.id + restarted_http = restarted.http_client() + restarted_http.configure_failpoints( + [ + ("timeline-detach-ancestor::before_starting_after_locking_pausable", "pause"), + ] + ) + + for info in shards: + pageserver = pageservers[int(info["node_id"])] + # the first request can cause these, but does not repeatedly + pageserver.allowed_errors.append(".*: request was dropped before completing") + + # first request again + env.storage_controller.allowed_errors.append(".*: request was dropped before completing") + + target = env.storage_controller.pageserver_api() + + with pytest.raises(ReadTimeout): + target.detach_ancestor(env.initial_tenant, branch_timeline_id, timeout=1) + + stuck_http.configure_failpoints( + ("timeline-detach-ancestor::before_starting_after_locking_pausable", "off") + ) + + barrier = threading.Barrier(2) + + def restart_restarted(): + barrier.wait() + # graceful shutdown should just work, because simultaneously unpaused + restarted.stop() + # this does not happen always, depends how fast we exit after unpausing + # restarted.assert_log_contains("Cancelled request finished with an error: ShuttingDown") + restarted.start() + + with ThreadPoolExecutor(max_workers=1) as pool: + fut = pool.submit(restart_restarted) + barrier.wait() + # we have 10s, lets use 1/2 of that to help the shutdown start + time.sleep(5) + restarted_http.configure_failpoints( + ("timeline-detach-ancestor::before_starting_after_locking_pausable", "off") + ) + fut.result() + + # detach ancestor request handling is not sensitive to http cancellation. + # this means that the "stuck" is on its way to complete the detach, but the restarted is off + # now it can either be complete on all nodes, or still in progress with + # one. + without_retrying = target.without_status_retrying() + + # this retry loop will be long enough that the tenant can always activate + reparented = None + for _ in range(10): + try: + reparented = without_retrying.detach_ancestor(env.initial_tenant, branch_timeline_id) + except PageserverApiException as info: + assert info.status_code == 503 + time.sleep(2) + else: + break + + assert reparented == [], "too many retries (None) or unexpected reparentings" + + for shard_info in shards: + node_id = int(shard_info["node_id"]) + shard_id = shard_info["shard_id"] + + # TODO: ensure quescing is done on pageserver? + pageservers[node_id].quiesce_tenants() + detail = pageservers[node_id].http_client().timeline_detail(shard_id, branch_timeline_id) + wait_for_last_record_lsn( + pageservers[node_id].http_client(), shard_id, branch_timeline_id, lsn + ) + assert detail.get("ancestor_timeline_id") is None + + with env.endpoints.create_start(branch_name, tenant_id=env.initial_tenant) as ep: + count = int(ep.safe_psql("select count(*) from foo")[0][0]) + assert count == 10000 + + +@pytest.mark.parametrize("mode", ["delete_timeline", "delete_tenant"]) +@pytest.mark.parametrize("sharded", [False, True]) +def test_timeline_detach_ancestor_interrupted_by_deletion( + neon_env_builder: NeonEnvBuilder, mode: str, sharded: bool +): + """ + Timeline ancestor detach interrupted by deleting either: + - the detached timeline + - the whole tenant + + after starting the detach. + + What remains not tested by this: + - shutdown winning over complete + + Shutdown winning over complete needs gc blocking and reparenting any left-overs on retry. + """ + + if sharded and mode == "delete_tenant": + # the shared/exclusive lock for tenant is blocking this: + # timeline detach ancestor takes shared, delete tenant takes exclusive + pytest.skip( + "tenant deletion while timeline ancestor detach is underway is not supported yet" + ) + + shard_count = 2 if sharded else 1 + + neon_env_builder.num_pageservers = shard_count + + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count if sharded else None) + + for ps in env.pageservers: + ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + pageservers = dict((int(p.id), p) for p in env.pageservers) + + detached_timeline = env.neon_cli.create_branch("detached soon", "main") + + failpoint = "timeline-detach-ancestor::before_starting_after_locking_pausable" + + env.storage_controller.reconcile_until_idle() + shards = env.storage_controller.locate(env.initial_tenant) + + assert len(set(info["node_id"] for info in shards)) == shard_count + + target = env.storage_controller.pageserver_api() if sharded else env.pageserver.http_client() + target = target.without_status_retrying() + + victim = pageservers[int(shards[-1]["node_id"])] + victim_http = victim.http_client() + victim_http.configure_failpoints((failpoint, "pause")) + + def detach_ancestor(): + target.detach_ancestor(env.initial_tenant, detached_timeline) + + def at_failpoint() -> Tuple[str, LogCursor]: + return victim.assert_log_contains(f"at failpoint {failpoint}") + + def start_delete(): + if mode == "delete_timeline": + target.timeline_delete(env.initial_tenant, detached_timeline) + elif mode == "delete_tenant": + target.tenant_delete(env.initial_tenant) + else: + raise RuntimeError(f"unimplemented mode {mode}") + + def at_waiting_on_gate_close(start_offset: LogCursor) -> LogCursor: + _, offset = victim.assert_log_contains( + "closing is taking longer than expected", offset=start_offset + ) + return offset + + def is_deleted(): + try: + if mode == "delete_timeline": + target.timeline_detail(env.initial_tenant, detached_timeline) + elif mode == "delete_tenant": + target.tenant_status(env.initial_tenant) + else: + return False + except PageserverApiException as e: + assert e.status_code == 404 + return True + else: + raise RuntimeError("waiting for 404") + + with ThreadPoolExecutor(max_workers=2) as pool: + try: + fut = pool.submit(detach_ancestor) + _, offset = wait_until(10, 1.0, at_failpoint) + + delete = pool.submit(start_delete) + + wait_until(10, 1.0, lambda: at_waiting_on_gate_close(offset)) + + victim_http.configure_failpoints((failpoint, "off")) + + delete.result() + + assert wait_until(10, 1.0, is_deleted), f"unimplemented mode {mode}" + + with pytest.raises(PageserverApiException) as exc: + fut.result() + assert exc.value.status_code == 503 + finally: + victim_http.configure_failpoints((failpoint, "off")) + + +@pytest.mark.parametrize("mode", ["delete_reparentable_timeline"]) +def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnvBuilder, mode: str): + """ + Technically possible storage controller concurrent interleaving timeline + deletion with timeline detach. + + Deletion is fine, as any sharded pageservers reach the same end state, but + creating reparentable timeline would create an issue as the two nodes would + never agree. There is a solution though: the created reparentable timeline + must be detached. + """ + + assert ( + mode == "delete_reparentable_timeline" + ), "only one now, but we could have the create just as well, need gc blocking" + + shard_count = 2 + neon_env_builder.num_pageservers = shard_count + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + + for ps in env.pageservers: + ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + pageservers = dict((int(p.id), p) for p in env.pageservers) + + env.storage_controller.reconcile_until_idle() + shards = env.storage_controller.locate(env.initial_tenant) + assert len(set(x["node_id"] for x in shards)) == shard_count + + with env.endpoints.create_start("main") as ep: + ep.safe_psql("create table foo as select i::bigint from generate_series(1, 1000) t(i)") + + # as the interleaved operation, we will delete this timeline, which was reparenting candidate + first_branch_lsn = wait_for_last_flush_lsn( + env, ep, env.initial_tenant, env.initial_timeline + ) + for ps, shard_id in [(pageservers[int(x["node_id"])], x["shard_id"]) for x in shards]: + ps.http_client().timeline_checkpoint(shard_id, env.initial_timeline) + + ep.safe_psql("create table bar as select i::bigint from generate_series(1, 2000) t(i)") + detached_branch_lsn = flush_ep_to_pageserver( + env, ep, env.initial_tenant, env.initial_timeline + ) + + for ps, shard_id in [(pageservers[int(x["node_id"])], x["shard_id"]) for x in shards]: + ps.http_client().timeline_checkpoint(shard_id, env.initial_timeline) + + first_branch = env.neon_cli.create_branch( + "first_branch", ancestor_branch_name="main", ancestor_start_lsn=first_branch_lsn + ) + detached_branch = env.neon_cli.create_branch( + "detached_branch", ancestor_branch_name="main", ancestor_start_lsn=detached_branch_lsn + ) + + pausepoint = "timeline-detach-ancestor::before_starting_after_locking_pausable" + + stuck = pageservers[int(shards[0]["node_id"])] + stuck_http = stuck.http_client().without_status_retrying() + stuck_http.configure_failpoints((pausepoint, "pause")) + + victim = pageservers[int(shards[-1]["node_id"])] + victim_http = victim.http_client().without_status_retrying() + victim_http.configure_failpoints( + (pausepoint, "pause"), + ) + + # noticed a surprising 409 if the other one would fail instead + # victim_http.configure_failpoints([ + # (pausepoint, "pause"), + # ("timeline-detach-ancestor::before_starting_after_locking", "return"), + # ]) + + # interleaving a create_timeline which could be reparented will produce two + # permanently different reparentings: one node has reparented, other has + # not + # + # with deletion there is no such problem + def detach_timeline(): + env.storage_controller.pageserver_api().detach_ancestor(env.initial_tenant, detached_branch) + + def paused_at_failpoint(): + stuck.assert_log_contains(f"at failpoint {pausepoint}") + victim.assert_log_contains(f"at failpoint {pausepoint}") + + def first_completed(): + detail = stuck_http.timeline_detail(shards[0]["shard_id"], detached_branch) + log.info(detail) + assert detail.get("ancestor_lsn") is None + + def first_branch_gone(): + try: + env.storage_controller.pageserver_api().timeline_detail( + env.initial_tenant, first_branch + ) + except PageserverApiException as e: + log.info(f"error {e}") + assert e.status_code == 404 + else: + log.info("still ok") + raise RuntimeError("not done yet") + + with ThreadPoolExecutor(max_workers=1) as pool: + try: + fut = pool.submit(detach_timeline) + wait_until(10, 1.0, paused_at_failpoint) + + # let stuck complete + stuck_http.configure_failpoints((pausepoint, "off")) + wait_until(10, 1.0, first_completed) + + # if we would let victim fail, for some reason there'd be a 409 response instead of 500 + # victim_http.configure_failpoints((pausepoint, "off")) + # with pytest.raises(PageserverApiException, match=".* 500 Internal Server Error failpoint: timeline-detach-ancestor::before_starting_after_locking") as exc: + # fut.result() + # assert exc.value.status_code == 409 + + env.storage_controller.pageserver_api().timeline_delete( + env.initial_tenant, first_branch + ) + victim_http.configure_failpoints((pausepoint, "off")) + wait_until(10, 1.0, first_branch_gone) + + # it now passes, and we should get an error messages about mixed reparenting as the stuck still had something to reparent + fut.result() + + msg, offset = env.storage_controller.assert_log_contains( + ".*/timeline/\\S+/detach_ancestor.*: shards returned different results matching=0 .*" + ) + log.info(f"expected error message: {msg}") + env.storage_controller.allowed_errors.append( + ".*: shards returned different results matching=0 .*" + ) + + detach_timeline() + + # FIXME: perhaps the above should be automatically retried, if we get mixed results? + not_found = env.storage_controller.log_contains( + ".*/timeline/\\S+/detach_ancestor.*: shards returned different results matching=0 .*", + offset=offset, + ) + + assert not_found is None + finally: + stuck_http.configure_failpoints((pausepoint, "off")) + victim_http.configure_failpoints((pausepoint, "off")) + + # TODO: -# - after starting the operation, tenant is deleted # - after starting the operation, pageserver is shutdown, restarted # - after starting the operation, bottom-most timeline is deleted, pageserver is restarted, gc is inhibited # - deletion of reparented while reparenting should fail once, then succeed (?) # - branch near existing L1 boundary, image layers? # - investigate: why are layers started at uneven lsn? not just after branching, but in general. +# +# TEST: 1. tad which partially succeeds, one returns 500 +# 2. create branch below timeline? ~or delete reparented timeline~ (done) +# 3. on retry all should report the same reparented timelines diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 7efd86e349..f02f19c588 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -147,8 +147,8 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): last_record_lsn=Lsn(timeline_detail["last_record_lsn"]), ) for sk_m in sk_metrics: - m.flush_lsns.append(Lsn(sk_m.flush_lsn_inexact[(tenant_id, timeline_id)])) - m.commit_lsns.append(Lsn(sk_m.commit_lsn_inexact[(tenant_id, timeline_id)])) + m.flush_lsns.append(Lsn(int(sk_m.flush_lsn_inexact(tenant_id, timeline_id)))) + m.commit_lsns.append(Lsn(int(sk_m.commit_lsn_inexact(tenant_id, timeline_id)))) for flush_lsn, commit_lsn in zip(m.flush_lsns, m.commit_lsns): # Invariant. May be < when transaction is in progress. @@ -2065,6 +2065,11 @@ def test_timeline_copy(neon_env_builder: NeonEnvBuilder, insert_rows: int): log.info(f"Original digest: {orig_digest}") for sk in env.safekeepers: + wait( + partial(is_flush_lsn_caught_up, sk, tenant_id, timeline_id, lsn), + f"sk_id={sk.id} to flush {lsn}", + ) + sk.http_client().copy_timeline( tenant_id, timeline_id, @@ -2237,6 +2242,8 @@ def test_s3_eviction( check_values = [0] * n_timelines + event_metrics_seen = False + n_iters = 20 for _ in range(n_iters): if log.isEnabledFor(logging.DEBUG): @@ -2261,6 +2268,27 @@ def test_s3_eviction( # update remote_consistent_lsn on pageserver ps_client.timeline_checkpoint(env.initial_tenant, timelines[i], wait_until_uploaded=True) + # Do metrics check before restarts, since these will reset to zero across a restart + event_metrics_seen |= any( + sk.http_client().get_metric_value( + "safekeeper_eviction_events_started_total", {"kind": "evict"} + ) + or 0 > 0 + and sk.http_client().get_metric_value( + "safekeeper_eviction_events_completed_total", {"kind": "evict"} + ) + or 0 > 0 + and sk.http_client().get_metric_value( + "safekeeper_eviction_events_started_total", {"kind": "restore"} + ) + or 0 > 0 + and sk.http_client().get_metric_value( + "safekeeper_eviction_events_completed_total", {"kind": "restore"} + ) + or 0 > 0 + for sk in env.safekeepers + ) + # restarting random safekeepers for sk in env.safekeepers: if random.random() < restart_chance: @@ -2274,3 +2302,5 @@ def test_s3_eviction( and sk.log_contains("successfully restored evicted timeline") for sk in env.safekeepers ) + + assert event_metrics_seen diff --git a/trace/Cargo.toml b/trace/Cargo.toml deleted file mode 100644 index d6eed3f49c..0000000000 --- a/trace/Cargo.toml +++ /dev/null @@ -1,13 +0,0 @@ -[package] -name = "trace" -version = "0.1.0" -edition.workspace = true -license.workspace = true - -[dependencies] -clap.workspace = true -anyhow.workspace = true - -pageserver_api.workspace = true -utils.workspace = true -workspace_hack.workspace = true diff --git a/trace/src/main.rs b/trace/src/main.rs deleted file mode 100644 index 79e1df988d..0000000000 --- a/trace/src/main.rs +++ /dev/null @@ -1,167 +0,0 @@ -//! A tool for working with read traces generated by the pageserver. -use std::collections::HashMap; -use std::path::PathBuf; -use std::str::FromStr; -use std::{ - fs::{read_dir, File}, - io::BufReader, -}; - -use pageserver_api::models::{ - PagestreamFeMessage, PagestreamGetPageRequest, PagestreamProtocolVersion, -}; -use utils::id::{ConnectionId, TenantId, TimelineId}; - -use clap::{Parser, Subcommand}; - -/// Utils for working with pageserver read traces. For generating -/// traces, see the `trace_read_requests` tenant config option. -#[derive(Parser, Debug)] -#[command(author, version, about, long_about = None)] -struct Args { - /// Path of trace directory - #[arg(short, long)] - path: PathBuf, - - #[command(subcommand)] - command: Command, -} - -/// What to do with the read trace -#[derive(Subcommand, Debug)] -enum Command { - /// List traces in the directory - List, - - /// Print the traces in text format - Dump, - - /// Print stats and anomalies about the traces - Analyze, -} - -// HACK This function will change and improve as we see what kind of analysis is useful. -// Currently it collects the difference in blkno of consecutive GetPage requests, -// and counts the frequency of each value. This information is useful in order to: -// - see how sequential a workload is by seeing how often the delta is 1 -// - detect any prefetching anomalies by looking for negative deltas during seqscan -fn analyze_trace(mut reader: R) { - let mut total = 0; // Total requests traced - let mut cross_rel = 0; // Requests that ask for different rel than previous request - let mut deltas = HashMap::::new(); // Consecutive blkno differences - let mut prev: Option = None; - - // Compute stats - while let Ok(msg) = PagestreamFeMessage::parse(&mut reader, PagestreamProtocolVersion::V2) { - match msg { - PagestreamFeMessage::Exists(_) => {} - PagestreamFeMessage::Nblocks(_) => {} - PagestreamFeMessage::GetSlruSegment(_) => {} - PagestreamFeMessage::GetPage(req) => { - total += 1; - - if let Some(prev) = prev { - if prev.rel == req.rel { - let delta = (req.blkno as i32) - (prev.blkno as i32); - deltas.entry(delta).and_modify(|c| *c += 1).or_insert(1); - } else { - cross_rel += 1; - } - } - prev = Some(req); - } - PagestreamFeMessage::DbSize(_) => {} - }; - } - - // Print stats. - let mut other = deltas.len(); - deltas.retain(|_, count| *count > 300); - other -= deltas.len(); - dbg!(total); - dbg!(cross_rel); - dbg!(other); - dbg!(deltas); -} - -fn dump_trace(mut reader: R) { - while let Ok(msg) = PagestreamFeMessage::parse(&mut reader, PagestreamProtocolVersion::V2) { - println!("{msg:?}"); - } -} - -#[derive(Debug)] -struct TraceFile { - #[allow(dead_code)] - pub tenant_id: TenantId, - - #[allow(dead_code)] - pub timeline_id: TimelineId, - - #[allow(dead_code)] - pub connection_id: ConnectionId, - - pub path: PathBuf, -} - -fn get_trace_files(traces_dir: &PathBuf) -> anyhow::Result> { - let mut trace_files = Vec::::new(); - - // Trace files are organized as {tenant_id}/{timeline_id}/{connection_id} - for tenant_dir in read_dir(traces_dir)? { - let entry = tenant_dir?; - let path = entry.path(); - let tenant_id = TenantId::from_str(path.file_name().unwrap().to_str().unwrap())?; - - for timeline_dir in read_dir(path)? { - let entry = timeline_dir?; - let path = entry.path(); - let timeline_id = TimelineId::from_str(path.file_name().unwrap().to_str().unwrap())?; - - for trace_dir in read_dir(path)? { - let entry = trace_dir?; - let path = entry.path(); - let connection_id = - ConnectionId::from_str(path.file_name().unwrap().to_str().unwrap())?; - - trace_files.push(TraceFile { - tenant_id, - timeline_id, - connection_id, - path, - }); - } - } - } - - Ok(trace_files) -} - -fn main() -> anyhow::Result<()> { - let args = Args::parse(); - - match args.command { - Command::List => { - for trace_file in get_trace_files(&args.path)? { - println!("{trace_file:?}"); - } - } - Command::Dump => { - for trace_file in get_trace_files(&args.path)? { - let file = File::open(trace_file.path.clone())?; - let reader = BufReader::new(file); - dump_trace(reader); - } - } - Command::Analyze => { - for trace_file in get_trace_files(&args.path)? { - println!("analyzing {trace_file:?}"); - let file = File::open(trace_file.path.clone())?; - let reader = BufReader::new(file); - analyze_trace(reader); - } - } - } - - Ok(()) -} diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index ad73770c44..dbd0e6428b 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit ad73770c446ea361f43e4f0404798b7e5e7a62d8 +Subproject commit dbd0e6428b9274d72a10ac29bd3e3162faf109d4 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 4874c8e52e..035b73a9c5 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 4874c8e52ed349a9f8290bbdcd91eb92677a5d24 +Subproject commit 035b73a9c5998f9a0ef35cc8df1bae680bf770fc diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index b810fdfcbb..b39f316137 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit b810fdfcbb59afea7ea7bbe0cf94eaccb55a2ea2 +Subproject commit b39f316137fdd29e2da15d2af2fdd1cfd18163be diff --git a/vendor/revisions.json b/vendor/revisions.json index da49ff19c3..eeebd646f5 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "v16": ["16.3", "b810fdfcbb59afea7ea7bbe0cf94eaccb55a2ea2"], - "v15": ["15.7", "4874c8e52ed349a9f8290bbdcd91eb92677a5d24"], - "v14": ["14.12", "ad73770c446ea361f43e4f0404798b7e5e7a62d8"] + "v16": ["16.3", "b39f316137fdd29e2da15d2af2fdd1cfd18163be"], + "v15": ["15.7", "035b73a9c5998f9a0ef35cc8df1bae680bf770fc"], + "v14": ["14.12", "dbd0e6428b9274d72a10ac29bd3e3162faf109d4"] } diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 3c446ecdea..224e9847f3 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -236,6 +236,7 @@ files: query: | select sum(pg_database_size(datname)) as total from pg_database; + # DEPRECATED - metric_name: lfc_approximate_working_set_size type: gauge help: 'Approximate working set size in pages of 8192 bytes' @@ -244,6 +245,20 @@ files: query: | select neon.approximate_working_set_size(false) as approximate_working_set_size; + - metric_name: lfc_approximate_working_set_size_windows + type: gauge + help: 'Approximate working set size in pages of 8192 bytes' + key_labels: [duration] + values: [size] + # NOTE: This is the "public" / "human-readable" version. Here, we supply a small selection + # of durations in a pretty-printed form. + query: | + select + x as duration, + neon.approximate_working_set_size_seconds(extract('epoch' from x::interval)::int) as size + from + (values ('5m'),('15m'),('1h')) as t (x); + - metric_name: current_lsn type: gauge help: 'Current LSN of the database' @@ -377,13 +392,19 @@ files: query: | select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit; - - metric_name: lfc_approximate_working_set_size + - metric_name: lfc_approximate_working_set_size_windows type: gauge help: 'Approximate working set size in pages of 8192 bytes' - key_labels: - values: [approximate_working_set_size] + key_labels: [duration_seconds] + values: [size] + # NOTE: This is the "internal" / "machine-readable" version. This outputs the working set + # size looking back 1..60 minutes, labeled with the number of minutes. query: | - select neon.approximate_working_set_size(false) as approximate_working_set_size; + select + x::text as duration_seconds, + neon.approximate_working_set_size_seconds(x) as size + from + (select generate_series * 60 as x from generate_series(1, 60)); build: | # Build cgroup-tools #