diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 37983798b7..d27fa01efa 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -8,6 +8,8 @@ self-hosted-runner: - small-arm64 - us-east-2 config-variables: + - BENCHMARK_PROJECT_ID_PUB + - BENCHMARK_PROJECT_ID_SUB - REMOTE_STORAGE_AZURE_CONTAINER - REMOTE_STORAGE_AZURE_REGION - SLACK_UPCOMING_RELEASE_CHANNEL_ID diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index d4029bd37c..f4a194639f 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -14,11 +14,8 @@ inputs: api_host: description: 'Neon API host' default: console-stage.neon.build - provisioner: - description: 'k8s-pod or k8s-neonvm' - default: 'k8s-pod' compute_units: - description: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal' + description: '[Min, Max] compute units' default: '[1, 1]' outputs: @@ -37,10 +34,6 @@ runs: # A shell without `set -x` to not to expose password/dsn in logs shell: bash -euo pipefail {0} run: | - if [ "${PROVISIONER}" == "k8s-pod" ] && [ "${MIN_CU}" != "${MAX_CU}" ]; then - echo >&2 "For k8s-pod provisioner MIN_CU should be equal to MAX_CU" - fi - project=$(curl \ "https://${API_HOST}/api/v2/projects" \ --fail \ @@ -52,7 +45,7 @@ runs: \"name\": \"Created by actions/neon-project-create; GITHUB_RUN_ID=${GITHUB_RUN_ID}\", \"pg_version\": ${POSTGRES_VERSION}, \"region_id\": \"${REGION_ID}\", - \"provisioner\": \"${PROVISIONER}\", + \"provisioner\": \"k8s-neonvm\", \"autoscaling_limit_min_cu\": ${MIN_CU}, \"autoscaling_limit_max_cu\": ${MAX_CU}, \"settings\": { } @@ -75,6 +68,5 @@ runs: API_KEY: ${{ inputs.api_key }} REGION_ID: ${{ inputs.region_id }} POSTGRES_VERSION: ${{ inputs.postgres_version }} - PROVISIONER: ${{ inputs.provisioner }} MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }} MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }} diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index daaedf6d11..9d39ab6ad7 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -131,8 +131,8 @@ runs: exit 1 fi if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then - # -n16 uses sixteen processes to run tests via pytest-xdist - EXTRA_PARAMS="-n16 $EXTRA_PARAMS" + # -n sets the number of parallel processes that pytest-xdist will run + EXTRA_PARAMS="-n12 $EXTRA_PARAMS" # --dist=loadgroup points tests marked with @pytest.mark.xdist_group # to the same worker to make @pytest.mark.order work with xdist diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 35c6251304..a0ed169024 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -19,6 +19,10 @@ on: description: 'debug or release' required: true type: string + pg-versions: + description: 'a json array of postgres versions to run regression tests on' + required: true + type: string defaults: run: @@ -254,7 +258,7 @@ jobs: strategy: fail-fast: false matrix: - pg_version: [ v14, v15, v16 ] + pg_version: ${{ fromJson(inputs.pg-versions) }} steps: - uses: actions/checkout@v4 with: @@ -278,14 +282,11 @@ jobs: CHECK_ONDISK_DATA_COMPATIBILITY: nonempty BUILD_TAG: ${{ inputs.build-tag }} PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring - PAGESERVER_GET_VECTORED_IMPL: vectored - PAGESERVER_GET_IMPL: vectored - PAGESERVER_VALIDATE_VEC_GET: true # Temporary disable this step until we figure out why it's so flaky # Ref https://github.com/neondatabase/neon/issues/4540 - name: Merge and upload coverage data if: | false && - inputs.build-type == 'debug' && matrix.pg_version == 'v14' + inputs.build-type == 'debug' && matrix.pg_version == 'v16' uses: ./.github/actions/save-coverage-data diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index c132b5b513..0f4dac841e 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -63,11 +63,9 @@ jobs: - DEFAULT_PG_VERSION: 16 PLATFORM: "neon-staging" region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} - provisioner: 'k8s-pod' - DEFAULT_PG_VERSION: 16 PLATFORM: "azure-staging" region_id: 'azure-eastus2' - provisioner: 'k8s-neonvm' env: TEST_PG_BENCH_DURATIONS_MATRIX: "300" TEST_PG_BENCH_SCALES_MATRIX: "10,100" @@ -100,7 +98,6 @@ jobs: region_id: ${{ matrix.region_id }} postgres_version: ${{ env.DEFAULT_PG_VERSION }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} - provisioner: ${{ matrix.provisioner }} - name: Run benchmark uses: ./.github/actions/run-python-test-set @@ -150,7 +147,7 @@ jobs: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 14 + DEFAULT_PG_VERSION: 16 TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} @@ -171,7 +168,7 @@ jobs: path: /tmp/neon/ prefix: latest - - name: Run benchmark + - name: Run Logical Replication benchmarks uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} @@ -179,12 +176,15 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 5400 + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} + BENCHMARK_PROJECT_ID_PUB: ${{ vars.BENCHMARK_PROJECT_ID_PUB }} + BENCHMARK_PROJECT_ID_SUB: ${{ vars.BENCHMARK_PROJECT_ID_SUB }} - - name: Run benchmark + - name: Run Physical Replication benchmarks uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} @@ -216,11 +216,11 @@ jobs: # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday) # # Available platforms: - # - neon-captest-new: Freshly created project (1 CU) - # - neon-captest-freetier: Use freetier-sized compute (0.25 CU) + # - neonvm-captest-new: Freshly created project (1 CU) + # - neonvm-captest-freetier: Use freetier-sized compute (0.25 CU) # - neonvm-captest-azure-new: Freshly created project (1 CU) in azure region # - neonvm-captest-azure-freetier: Use freetier-sized compute (0.25 CU) in azure region - # - neon-captest-reuse: Reusing existing project + # - neonvm-captest-reuse: Reusing existing project # - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs # - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage env: @@ -245,24 +245,21 @@ jobs: "'"$region_id_default"'" ], "platform": [ - "neon-captest-new", - "neon-captest-reuse", + "neonvm-captest-new", + "neonvm-captest-reuse", "neonvm-captest-new" ], "db_size": [ "10gb" ], - "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-freetier", "db_size": "3gb" }, - { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-new", "db_size": "50gb" }, - { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" }, + "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" }, { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb" }, - { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" }, - { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb" }, - { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb" }, { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }] }' if [ "$(date +%A)" = "Saturday" ]; then - matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb"}, - { "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-aurora", "db_size": "50gb"}]') + matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb"}]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -272,7 +269,7 @@ jobs: run: | matrix='{ "platform": [ - "neon-captest-reuse" + "neonvm-captest-reuse" ] }' @@ -288,7 +285,7 @@ jobs: run: | matrix='{ "platform": [ - "neon-captest-reuse" + "neonvm-captest-reuse" ], "scale": [ "10" @@ -339,7 +336,7 @@ jobs: prefix: latest - name: Create Neon Project - if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform) + if: contains(fromJson('["neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform) id: create-neon-project uses: ./.github/actions/neon-project-create with: @@ -347,19 +344,18 @@ jobs: postgres_version: ${{ env.DEFAULT_PG_VERSION }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }} - provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }} - name: Set up Connection String id: set-up-connstr run: | case "${PLATFORM}" in - neon-captest-reuse) + neonvm-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} ;; neonvm-captest-sharding-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }} ;; - neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier) + neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier) CONNSTR=${{ steps.create-neon-project.outputs.dsn }} ;; rds-aurora) @@ -443,9 +439,9 @@ jobs: fail-fast: false matrix: include: - - PLATFORM: "neon-captest-pgvector" + - PLATFORM: "neonvm-captest-pgvector" - PLATFORM: "azure-captest-pgvector" - + env: TEST_PG_BENCH_DURATIONS_MATRIX: "15m" TEST_PG_BENCH_SCALES_MATRIX: "1" @@ -487,7 +483,7 @@ jobs: id: set-up-connstr run: | case "${PLATFORM}" in - neon-captest-pgvector) + neonvm-captest-pgvector) CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }} ;; azure-captest-pgvector) @@ -586,7 +582,7 @@ jobs: id: set-up-connstr run: | case "${PLATFORM}" in - neon-captest-reuse) + neonvm-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }} ;; rds-aurora) @@ -596,7 +592,7 @@ jobs: CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CLICKBENCH_10M_CONNSTR }} ;; *) - echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'" + echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'" exit 1 ;; esac @@ -673,7 +669,7 @@ jobs: - name: Get Connstring Secret Name run: | case "${PLATFORM}" in - neon-captest-reuse) + neonvm-captest-reuse) ENV_PLATFORM=CAPTEST_TPCH ;; rds-aurora) @@ -683,7 +679,7 @@ jobs: ENV_PLATFORM=RDS_AURORA_TPCH ;; *) - echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'" + echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'" exit 1 ;; esac @@ -760,7 +756,7 @@ jobs: id: set-up-connstr run: | case "${PLATFORM}" in - neon-captest-reuse) + neonvm-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }} ;; rds-aurora) @@ -770,7 +766,7 @@ jobs: CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_POSTGRES_CONNSTR }} ;; *) - echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'" + echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'" exit 1 ;; esac diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index a69686bf2a..76fc58151a 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -72,6 +72,12 @@ jobs: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + - uses: docker/login-action@v3 + with: + registry: cache.neon.build + username: ${{ secrets.NEON_CI_DOCKERCACHE_USERNAME }} + password: ${{ secrets.NEON_CI_DOCKERCACHE_PASSWORD }} + - uses: docker/build-push-action@v6 with: context: . @@ -79,8 +85,8 @@ jobs: push: true pull: true file: Dockerfile.build-tools - cache-from: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/build-tools:cache-{0},mode=max', matrix.arch) || '' }} + cache-from: type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0},mode=max', matrix.arch) || '' }} tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }} - name: Remove custom docker config directory diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index d4af174fc5..c7ae2aedd4 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -203,7 +203,8 @@ jobs: fail-fast: false matrix: arch: [ x64 ] - build-type: [ debug, release ] + # Do not build or run tests in debug for release branches + build-type: ${{ fromJson((startsWith(github.ref_name, 'release' && github.event_name == 'push')) && '["release"]' || '["debug", "release"]') }} include: - build-type: release arch: arm64 @@ -213,6 +214,8 @@ jobs: build-tools-image: ${{ needs.build-build-tools-image.outputs.image }} build-tag: ${{ needs.tag.outputs.build-tag }} build-type: ${{ matrix.build-type }} + # Run tests on all Postgres versions in release builds and only on the latest version in debug builds + pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16"]' || '["v16"]' }} secrets: inherit # Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking @@ -286,9 +289,6 @@ jobs: PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}" PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring - PAGESERVER_GET_VECTORED_IMPL: vectored - PAGESERVER_GET_IMPL: vectored - PAGESERVER_VALIDATE_VEC_GET: false # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones @@ -309,7 +309,7 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} create-test-report: - needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image ] + needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image, benchmarks ] if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }} outputs: report-url: ${{ steps.create-allure-report.outputs.report-url }} @@ -499,6 +499,12 @@ jobs: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + - uses: docker/login-action@v3 + with: + registry: cache.neon.build + username: ${{ secrets.NEON_CI_DOCKERCACHE_USERNAME }} + password: ${{ secrets.NEON_CI_DOCKERCACHE_PASSWORD }} + - uses: docker/build-push-action@v6 with: context: . @@ -510,9 +516,8 @@ jobs: push: true pull: true file: Dockerfile - cache-from: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }} - # 23.07.2024 temporarily disable cache saving in the registry as it is very slow - # cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon:cache-{0},mode=max', matrix.arch) || '' }} + cache-from: type=registry,ref=cache.neon.build/neon:cache-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon:cache-{0},mode=max', matrix.arch) || '' }} tags: | neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} @@ -591,6 +596,12 @@ jobs: username: ${{ secrets.AWS_ACCESS_KEY_DEV }} password: ${{ secrets.AWS_SECRET_KEY_DEV }} + - uses: docker/login-action@v3 + with: + registry: cache.neon.build + username: ${{ secrets.NEON_CI_DOCKERCACHE_USERNAME }} + password: ${{ secrets.NEON_CI_DOCKERCACHE_PASSWORD }} + - name: Build compute-node image uses: docker/build-push-action@v6 with: @@ -604,9 +615,8 @@ jobs: push: true pull: true file: Dockerfile.compute-node - cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }} - # 23.07.2024 temporarily disable cache saving in the registry as it is very slow - # cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/compute-node-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }} + cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }} tags: | neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} @@ -625,9 +635,8 @@ jobs: pull: true file: Dockerfile.compute-node target: neon-pg-ext-test - cache-from: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }} - # 23.07.2024 temporarily disable cache saving in the registry as it is very slow - # cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }} + cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }} tags: | neondatabase/neon-test-extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }} @@ -827,6 +836,9 @@ jobs: rm -rf .docker-custom promote-images: + permissions: + contents: read # This is required for actions/checkout + id-token: write # This is required for Azure Login to work. needs: [ check-permissions, tag, test-images, vm-compute-node-image ] runs-on: ubuntu-22.04 @@ -853,6 +865,28 @@ jobs: neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} done + - name: Azure login + if: github.ref_name == 'main' + uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1 + with: + client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }} + tenant-id: ${{ secrets.AZURE_TENANT_ID }} + subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }} + + - name: Login to ACR + if: github.ref_name == 'main' + run: | + az acr login --name=neoneastus2 + + - name: Copy docker images to ACR-dev + if: github.ref_name == 'main' + run: | + for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do + docker buildx imagetools create \ + -t neoneastus2.azurecr.io/neondatabase/${image}:${{ needs.tag.outputs.build-tag }} \ + neondatabase/${image}:${{ needs.tag.outputs.build-tag }} + done + - name: Add latest tag to images if: github.ref_name == 'main' run: | diff --git a/.github/workflows/pg-clients.yml b/.github/workflows/pg-clients.yml index e21e45c929..23a2e3876c 100644 --- a/.github/workflows/pg-clients.yml +++ b/.github/workflows/pg-clients.yml @@ -13,6 +13,7 @@ on: paths: - '.github/workflows/pg-clients.yml' - 'test_runner/pg_clients/**' + - 'test_runner/logical_repl/**' - 'poetry.lock' workflow_dispatch: @@ -49,6 +50,101 @@ jobs: image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }} secrets: inherit + test-logical-replication: + needs: [ build-build-tools-image ] + runs-on: ubuntu-22.04 + + container: + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init --user root + services: + clickhouse: + image: clickhouse/clickhouse-server:24.6.3.64 + ports: + - 9000:9000 + - 8123:8123 + zookeeper: + image: quay.io/debezium/zookeeper:2.7 + ports: + - 2181:2181 + kafka: + image: quay.io/debezium/kafka:2.7 + env: + ZOOKEEPER_CONNECT: "zookeeper:2181" + KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092 + KAFKA_BROKER_ID: 1 + KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 + KAFKA_JMX_PORT: 9991 + ports: + - 9092:9092 + debezium: + image: quay.io/debezium/connect:2.7 + env: + BOOTSTRAP_SERVERS: kafka:9092 + GROUP_ID: 1 + CONFIG_STORAGE_TOPIC: debezium-config + OFFSET_STORAGE_TOPIC: debezium-offset + STATUS_STORAGE_TOPIC: debezium-status + DEBEZIUM_CONFIG_CONNECTOR_CLASS: io.debezium.connector.postgresql.PostgresConnector + ports: + - 8083:8083 + steps: + - uses: actions/checkout@v4 + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + + - name: Create Neon Project + id: create-neon-project + uses: ./.github/actions/neon-project-create + with: + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + postgres_version: ${{ env.DEFAULT_PG_VERSION }} + + - name: Run tests + uses: ./.github/actions/run-python-test-set + with: + build_type: remote + test_selection: logical_repl + run_in_parallel: false + extra_params: -m remote_cluster + pg_version: ${{ env.DEFAULT_PG_VERSION }} + env: + BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} + + - name: Delete Neon Project + if: always() + uses: ./.github/actions/neon-project-delete + with: + project_id: ${{ steps.create-neon-project.outputs.project_id }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Create Allure report + if: ${{ !cancelled() }} + id: create-allure-report + uses: ./.github/actions/allure-report-generate + with: + store-test-results-into-db: true + env: + REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} + + - name: Post to a Slack channel + if: github.event.schedule && failure() + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream + slack-message: | + Testing the logical replication: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|${{ job.status }}> (<${{ steps.create-allure-report.outputs.report-url }}|test report>) + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + test-postgres-client-libs: needs: [ build-build-tools-image ] runs-on: ubuntu-22.04 diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index 024594532f..cf10910b0b 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -66,8 +66,22 @@ jobs: username: ${{ secrets.AWS_ACCESS_KEY_DEV }} password: ${{ secrets.AWS_SECRET_KEY_DEV }} - - name: Tag build-tools with `${{ env.TO_TAG }}` in ECR + - name: Azure login + if: steps.check-manifests.outputs.skip == 'false' + uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1 + with: + client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }} + tenant-id: ${{ secrets.AZURE_TENANT_ID }} + subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }} + + - name: Login to ACR + if: steps.check-manifests.outputs.skip == 'false' + run: | + az acr login --name=neoneastus2 + + - name: Tag build-tools with `${{ env.TO_TAG }}` in ECR and ACR if: steps.check-manifests.outputs.skip == 'false' run: | docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \ + -t neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG} \ neondatabase/build-tools:${FROM_TAG} diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index 77928a343e..6fbe785c56 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -13,8 +13,6 @@ defaults: env: # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} jobs: cancel-previous-e2e-tests: @@ -64,19 +62,35 @@ jobs: needs: [ tag ] runs-on: ubuntu-22.04 env: + EVENT_ACTION: ${{ github.event.action }} + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} TAG: ${{ needs.tag.outputs.build-tag }} steps: - - name: check if ecr image are present - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + - name: Wait for `promote-images` job to finish + # It's important to have a timeout here, the script in the step can run infinitely + timeout-minutes: 60 run: | - for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do - OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text) - if [ "$OUTPUT" == "" ]; then - echo "$REPO with image tag $TAG not found" >> $GITHUB_OUTPUT - exit 1 - fi + if [ "${GITHUB_EVENT_NAME}" != "pull_request" ] || [ "${EVENT_ACTION}" != "ready_for_review" ]; then + exit 0 + fi + + # For PRs we use the run id as the tag + BUILD_AND_TEST_RUN_ID=${TAG} + while true; do + conclusion=$(gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '.jobs[] | select(.name == "promote-images") | .conclusion') + case "$conclusion" in + success) + break + ;; + failure | cancelled | skipped) + echo "The 'promote-images' job didn't succeed: '${conclusion}'. Exiting..." + exit 1 + ;; + *) + echo "The 'promote-images' hasn't succeed yet. Waiting..." + sleep 60 + ;; + esac done - name: Set e2e-platforms diff --git a/CODEOWNERS b/CODEOWNERS index af2fa6088e..606dbb4e22 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,13 +1,13 @@ /compute_tools/ @neondatabase/control-plane @neondatabase/compute /storage_controller @neondatabase/storage /libs/pageserver_api/ @neondatabase/storage -/libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers +/libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage /libs/remote_storage/ @neondatabase/storage -/libs/safekeeper_api/ @neondatabase/safekeepers +/libs/safekeeper_api/ @neondatabase/storage /libs/vm_monitor/ @neondatabase/autoscaling /pageserver/ @neondatabase/storage /pgxn/ @neondatabase/compute -/pgxn/neon/ @neondatabase/compute @neondatabase/safekeepers +/pgxn/neon/ @neondatabase/compute @neondatabase/storage /proxy/ @neondatabase/proxy -/safekeeper/ @neondatabase/safekeepers +/safekeeper/ @neondatabase/storage /vendor/ @neondatabase/compute diff --git a/Cargo.lock b/Cargo.lock index 04adccba7f..031fae0f37 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1418,7 +1418,7 @@ dependencies = [ "clap", "criterion-plot", "is-terminal", - "itertools", + "itertools 0.10.5", "num-traits", "once_cell", "oorandom", @@ -1439,7 +1439,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" dependencies = [ "cast", - "itertools", + "itertools 0.10.5", ] [[package]] @@ -1672,6 +1672,7 @@ checksum = "62d6dcd069e7b5fe49a302411f759d4cf1cf2c27fe798ef46fb8baefc053dd2b" dependencies = [ "bitflags 2.4.1", "byteorder", + "chrono", "diesel_derives", "itoa", "pq-sys", @@ -2133,6 +2134,12 @@ dependencies = [ "slab", ] +[[package]] +name = "gen_ops" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "304de19db7028420975a296ab0fcbbc8e69438c4ed254a1e41e2a7f37d5f0e0a" + [[package]] name = "generic-array" version = "0.14.7" @@ -2709,17 +2716,6 @@ version = "3.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" -[[package]] -name = "io-lifetimes" -version = "1.0.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2" -dependencies = [ - "hermit-abi", - "libc", - "windows-sys 0.48.0", -] - [[package]] name = "io-uring" version = "0.6.2" @@ -2738,14 +2734,13 @@ checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3" [[package]] name = "is-terminal" -version = "0.4.7" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f" +checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b" dependencies = [ "hermit-abi", - "io-lifetimes", - "rustix 0.37.25", - "windows-sys 0.48.0", + "libc", + "windows-sys 0.52.0", ] [[package]] @@ -2757,6 +2752,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.6" @@ -2871,18 +2875,6 @@ version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" -[[package]] -name = "linux-raw-sys" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" - -[[package]] -name = "linux-raw-sys" -version = "0.3.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" - [[package]] name = "linux-raw-sys" version = "0.4.13" @@ -3000,7 +2992,7 @@ checksum = "7c4b80445aeb08e832d87bf1830049a924cdc1d6b7ef40b6b9b365bff17bf8ec" dependencies = [ "libc", "measured", - "procfs 0.16.0", + "procfs", ] [[package]] @@ -3045,7 +3037,7 @@ dependencies = [ "measured", "measured-process", "once_cell", - "procfs 0.14.2", + "procfs", "prometheus", "rand 0.8.5", "rand_distr", @@ -3574,7 +3566,7 @@ dependencies = [ "humantime", "humantime-serde", "hyper 0.14.26", - "itertools", + "itertools 0.10.5", "leaky-bucket", "md5", "metrics", @@ -3592,8 +3584,9 @@ dependencies = [ "postgres_connection", "postgres_ffi", "pq_proto", - "procfs 0.14.2", + "procfs", "rand 0.8.5", + "range-set-blaze", "regex", "remote_storage", "reqwest 0.12.4", @@ -3644,7 +3637,7 @@ dependencies = [ "hex", "humantime", "humantime-serde", - "itertools", + "itertools 0.10.5", "postgres_ffi", "rand 0.8.5", "serde", @@ -3702,7 +3695,7 @@ dependencies = [ "hex-literal", "humantime", "humantime-serde", - "itertools", + "itertools 0.10.5", "metrics", "once_cell", "pageserver_api", @@ -4034,7 +4027,7 @@ name = "postgres_connection" version = "0.1.0" dependencies = [ "anyhow", - "itertools", + "itertools 0.10.5", "once_cell", "postgres", "tokio-postgres", @@ -4092,7 +4085,7 @@ version = "0.1.0" dependencies = [ "byteorder", "bytes", - "itertools", + "itertools 0.10.5", "pin-project-lite", "postgres-protocol", "rand 0.8.5", @@ -4138,21 +4131,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "procfs" -version = "0.14.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1de8dacb0873f77e6aefc6d71e044761fcc68060290f5b1089fcdf84626bb69" -dependencies = [ - "bitflags 1.3.2", - "byteorder", - "chrono", - "flate2", - "hex", - "lazy_static", - "rustix 0.36.16", -] - [[package]] name = "procfs" version = "0.16.0" @@ -4160,10 +4138,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4" dependencies = [ "bitflags 2.4.1", + "chrono", + "flate2", "hex", "lazy_static", "procfs-core", - "rustix 0.38.28", + "rustix", ] [[package]] @@ -4173,14 +4153,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29" dependencies = [ "bitflags 2.4.1", + "chrono", "hex", ] [[package]] name = "prometheus" -version = "0.13.3" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "449811d15fbdf5ceb5c1144416066429cf82316e2ec8ce0c1f6f8a02e7bbcf8c" +checksum = "3d33c28a30771f7f96db69893f78b857f7450d7e0237e9c8fc6427a81bae7ed1" dependencies = [ "cfg-if", "fnv", @@ -4188,7 +4169,7 @@ dependencies = [ "libc", "memchr", "parking_lot 0.12.1", - "procfs 0.14.2", + "procfs", "thiserror", ] @@ -4210,7 +4191,7 @@ checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270" dependencies = [ "bytes", "heck 0.4.1", - "itertools", + "itertools 0.10.5", "lazy_static", "log", "multimap", @@ -4231,7 +4212,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5d2d8d10f3c6ded6da8b05b5fb3b8a5082514344d56c9f871412d29b4e075b4" dependencies = [ "anyhow", - "itertools", + "itertools 0.10.5", "proc-macro2", "quote", "syn 1.0.109", @@ -4288,7 +4269,7 @@ dependencies = [ "hyper-util", "indexmap 2.0.1", "ipnet", - "itertools", + "itertools 0.10.5", "lasso", "md5", "measured", @@ -4343,6 +4324,7 @@ dependencies = [ "tracing-opentelemetry", "tracing-subscriber", "tracing-utils", + "try-lock", "typed-json", "url", "urlencoding", @@ -4464,6 +4446,18 @@ dependencies = [ "rand_core 0.5.1", ] +[[package]] +name = "range-set-blaze" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8421b5d459262eabbe49048d362897ff3e3830b44eac6cfe341d6acb2f0f13d2" +dependencies = [ + "gen_ops", + "itertools 0.12.1", + "num-integer", + "num-traits", +] + [[package]] name = "rayon" version = "1.7.0" @@ -4632,7 +4626,7 @@ dependencies = [ "humantime", "humantime-serde", "hyper 0.14.26", - "itertools", + "itertools 0.10.5", "metrics", "once_cell", "pin-project-lite", @@ -4942,34 +4936,6 @@ dependencies = [ "nom", ] -[[package]] -name = "rustix" -version = "0.36.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6da3636faa25820d8648e0e31c5d519bbb01f72fdf57131f0f5f7da5fed36eab" -dependencies = [ - "bitflags 1.3.2", - "errno", - "io-lifetimes", - "libc", - "linux-raw-sys 0.1.4", - "windows-sys 0.45.0", -] - -[[package]] -name = "rustix" -version = "0.37.25" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4eb579851244c2c03e7c24f501c3432bed80b8f720af1d6e5b0e0f01555a035" -dependencies = [ - "bitflags 1.3.2", - "errno", - "io-lifetimes", - "libc", - "linux-raw-sys 0.3.8", - "windows-sys 0.48.0", -] - [[package]] name = "rustix" version = "0.38.28" @@ -5718,6 +5684,7 @@ dependencies = [ "aws-config", "bytes", "camino", + "chrono", "clap", "control_plane", "diesel", @@ -5728,7 +5695,7 @@ dependencies = [ "hex", "humantime", "hyper 0.14.26", - "itertools", + "itertools 0.10.5", "lasso", "measured", "metrics", @@ -5737,6 +5704,7 @@ dependencies = [ "pageserver_client", "postgres_connection", "r2d2", + "rand 0.8.5", "reqwest 0.12.4", "routerify", "scopeguard", @@ -5792,9 +5760,10 @@ dependencies = [ "either", "futures", "futures-util", + "git-version", "hex", "humantime", - "itertools", + "itertools 0.10.5", "once_cell", "pageserver", "pageserver_api", @@ -5971,15 +5940,15 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.5.0" +version = "3.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9fbec84f381d5795b08656e4912bec604d162bff9291d6189a78f4c8ab87998" +checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa" dependencies = [ "cfg-if", - "fastrand 1.9.0", - "redox_syscall 0.3.5", - "rustix 0.37.25", - "windows-sys 0.45.0", + "fastrand 2.0.0", + "redox_syscall 0.4.1", + "rustix", + "windows-sys 0.52.0", ] [[package]] @@ -6595,9 +6564,9 @@ dependencies = [ [[package]] name = "try-lock" -version = "0.2.4" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" [[package]] name = "tungstenite" @@ -7176,15 +7145,6 @@ dependencies = [ "windows_x86_64_msvc 0.42.2", ] -[[package]] -name = "windows-sys" -version = "0.45.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" -dependencies = [ - "windows-targets 0.42.2", -] - [[package]] name = "windows-sys" version = "0.48.0" @@ -7203,21 +7163,6 @@ dependencies = [ "windows-targets 0.52.4", ] -[[package]] -name = "windows-targets" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" -dependencies = [ - "windows_aarch64_gnullvm 0.42.2", - "windows_aarch64_msvc 0.42.2", - "windows_i686_gnu 0.42.2", - "windows_i686_msvc 0.42.2", - "windows_x86_64_gnu 0.42.2", - "windows_x86_64_gnullvm 0.42.2", - "windows_x86_64_msvc 0.42.2", -] - [[package]] name = "windows-targets" version = "0.48.0" @@ -7447,7 +7392,7 @@ dependencies = [ "hmac", "hyper 0.14.26", "indexmap 1.9.3", - "itertools", + "itertools 0.10.5", "libc", "log", "memchr", diff --git a/Cargo.toml b/Cargo.toml index 7749378114..963841e340 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -126,7 +126,7 @@ parquet = { version = "51.0.0", default-features = false, features = ["zstd"] } parquet_derive = "51.0.0" pbkdf2 = { version = "0.12.1", features = ["simple", "std"] } pin-project-lite = "0.2" -procfs = "0.14" +procfs = "0.16" prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency prost = "0.11" rand = "0.8" @@ -184,6 +184,7 @@ tracing = "0.1" tracing-error = "0.2.0" tracing-opentelemetry = "0.21.0" tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } +try-lock = "0.2.5" twox-hash = { version = "1.6.3", default-features = false } typed-json = "0.1" url = "2.2" diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index 4826b7914e..dfaab1cb2e 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -192,7 +192,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.79.0 +ENV RUSTC_VERSION=1.80.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 48a52bfc6d..054d44e0ec 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -657,7 +657,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux chmod +x rustup-init && \ ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ rm rustup-init && \ - cargo install --locked --version 0.10.2 cargo-pgrx && \ + cargo install --locked --version 0.11.3 cargo-pgrx && \ /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' USER root @@ -672,10 +672,15 @@ USER root FROM rust-extensions-build AS pg-jsonschema-pg-build ARG PG_VERSION -RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.2.0.tar.gz -O pg_jsonschema.tar.gz && \ - echo "9118fc508a6e231e7a39acaa6f066fcd79af17a5db757b47d2eefbe14f7794f0 pg_jsonschema.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar.gz -O pg_jsonschema.tar.gz && \ + echo "61df3db1ed83cf24f6aa39c826f8818bfa4f0bd33b587fd6b2b1747985642297 pg_jsonschema.tar.gz" | sha256sum --check && \ mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \ - sed -i 's/pgrx = "0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + # see commit 252b3685a27a0f4c31a0f91e983c6314838e89e8 + # `unsafe-postgres` feature allows to build pgx extensions + # against postgres forks that decided to change their ABI name (like us). + # With that we can build extensions without forking them and using stock + # pgx. As this feature is new few manual version bumps were required. + sed -i 's/pgrx = "0.11.3"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control @@ -689,10 +694,10 @@ RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.2.0.tar. FROM rust-extensions-build AS pg-graphql-pg-build ARG PG_VERSION -RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.4.0.tar.gz -O pg_graphql.tar.gz && \ - echo "bd8dc7230282b3efa9ae5baf053a54151ed0e66881c7c53750e2d0c765776edc pg_graphql.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.7.tar.gz -O pg_graphql.tar.gz && \ + echo "2b3e567a5b31019cb97ae0e33263c1bcc28580be5a444ac4c8ece5c4be2aea41 pg_graphql.tar.gz" | sha256sum --check && \ mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \ - sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ # it's needed to enable extension because it uses untrusted C language sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \ @@ -712,6 +717,9 @@ ARG PG_VERSION RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \ echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \ mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \ + # TODO update pgrx version in the pg_tiktoken repo and remove this line + sed -i 's/pgrx = { version = "=0.10.2",/pgrx = { version = "0.11.3",/g' Cargo.toml && \ + sed -i 's/pgrx-tests = "=0.10.2"/pgrx-tests = "0.11.3"/g' Cargo.toml && \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control @@ -725,14 +733,10 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6 FROM rust-extensions-build AS pg-pgx-ulid-build ARG PG_VERSION -RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz -O pgx_ulid.tar.gz && \ - echo "ee5db82945d2d9f2d15597a80cf32de9dca67b897f605beb830561705f12683c pgx_ulid.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \ + echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \ mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \ - echo "******************* Apply a patch for Postgres 16 support; delete in the next release ******************" && \ - wget https://github.com/pksunkara/pgx_ulid/commit/f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \ - patch -p1 < f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \ - echo "********************************************************************************************************" && \ - sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "=0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control @@ -929,7 +933,8 @@ COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src #COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src -#COPY --from=rum-pg-build /rum.tar.gz /ext-src +COPY --from=rum-pg-build /rum.tar.gz /ext-src +COPY patches/rum.patch /ext-src #COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src COPY --from=prefix-pg-build /prefix.tar.gz /ext-src @@ -941,7 +946,7 @@ COPY patches/pg_hintplan.patch /ext-src COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src COPY patches/pg_cron.patch /ext-src #COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src -COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src +#COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src @@ -956,6 +961,7 @@ RUN cd /ext-src/ && for f in *.tar.gz; \ rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \ || exit 1; rm -f $f; done RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch +RUN cd /ext-src/rum-src && patch -p1 <../rum.patch # cmake is required for the h3 test RUN apt-get update && apt-get install -y cmake RUN patch -p1 < /ext-src/pg_hintplan.patch diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 8ceb8f2ad2..8af0ed43ce 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -4,6 +4,11 @@ version = "0.1.0" edition.workspace = true license.workspace = true +[features] +default = [] +# Enables test specific features. +testing = [] + [dependencies] anyhow.workspace = true async-compression.workspace = true diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 91855d954d..5bd6897fe3 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -400,7 +400,15 @@ impl ComputeNode { pub fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> { let mut retry_period_ms = 500.0; let mut attempts = 0; - let max_attempts = 10; + const DEFAULT_ATTEMPTS: u16 = 10; + #[cfg(feature = "testing")] + let max_attempts = if let Ok(v) = env::var("NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES") { + u16::from_str(&v).unwrap() + } else { + DEFAULT_ATTEMPTS + }; + #[cfg(not(feature = "testing"))] + let max_attempts = DEFAULT_ATTEMPTS; loop { let result = self.try_get_basebackup(compute_state, lsn); match result { diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index a272c306e7..bf8a27e550 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -289,7 +289,7 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command { fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command { for (var, val) in std::env::vars() { - if var.starts_with("NEON_PAGESERVER_") { + if var.starts_with("NEON_") { cmd = cmd.env(var, val); } } diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 4bf1b29785..51e9a51a57 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -21,7 +21,9 @@ use pageserver_api::config::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT, }; -use pageserver_api::controller_api::{PlacementPolicy, TenantCreateRequest}; +use pageserver_api::controller_api::{ + NodeAvailabilityWrapper, PlacementPolicy, TenantCreateRequest, +}; use pageserver_api::models::{ShardParameters, TimelineCreateRequest, TimelineInfo}; use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId}; use postgres_backend::AuthType; @@ -1250,9 +1252,70 @@ async fn handle_start_all( exit(1); } } + + neon_start_status_check(env, retry_timeout).await?; + Ok(()) } +async fn neon_start_status_check( + env: &local_env::LocalEnv, + retry_timeout: &Duration, +) -> anyhow::Result<()> { + const RETRY_INTERVAL: Duration = Duration::from_millis(100); + const NOTICE_AFTER_RETRIES: Duration = Duration::from_secs(5); + + if env.control_plane_api.is_none() { + return Ok(()); + } + + let storcon = StorageController::from_env(env); + + let retries = retry_timeout.as_millis() / RETRY_INTERVAL.as_millis(); + let notice_after_retries = retry_timeout.as_millis() / NOTICE_AFTER_RETRIES.as_millis(); + + println!("\nRunning neon status check"); + + for retry in 0..retries { + if retry == notice_after_retries { + println!("\nNeon status check has not passed yet, continuing to wait") + } + + let mut passed = true; + let mut nodes = storcon.node_list().await?; + let mut pageservers = env.pageservers.clone(); + + if nodes.len() != pageservers.len() { + continue; + } + + nodes.sort_by_key(|ps| ps.id); + pageservers.sort_by_key(|ps| ps.id); + + for (idx, pageserver) in pageservers.iter().enumerate() { + let node = &nodes[idx]; + if node.id != pageserver.id { + passed = false; + break; + } + + if !matches!(node.availability, NodeAvailabilityWrapper::Active) { + passed = false; + break; + } + } + + if passed { + println!("\nNeon started and passed status check"); + return Ok(()); + } + + tokio::time::sleep(RETRY_INTERVAL).await; + } + + anyhow::bail!("\nNeon passed status check") +} + async fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { let immediate = sub_match.get_one::("stop-mode").map(|s| s.as_str()) == Some("immediate"); diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 3ac3ce21df..505d157efd 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -151,7 +151,10 @@ pub struct NeonBroker { pub struct NeonStorageControllerConf { /// Heartbeat timeout before marking a node offline #[serde(with = "humantime_serde")] - pub max_unavailable: Duration, + pub max_offline: Duration, + + #[serde(with = "humantime_serde")] + pub max_warming_up: Duration, /// Threshold for auto-splitting a tenant into shards pub split_threshold: Option, @@ -159,14 +162,16 @@ pub struct NeonStorageControllerConf { impl NeonStorageControllerConf { // Use a shorter pageserver unavailability interval than the default to speed up tests. - const DEFAULT_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = - std::time::Duration::from_secs(10); + const DEFAULT_MAX_OFFLINE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10); + + const DEFAULT_MAX_WARMING_UP_INTERVAL: std::time::Duration = std::time::Duration::from_secs(30); } impl Default for NeonStorageControllerConf { fn default() -> Self { Self { - max_unavailable: Self::DEFAULT_MAX_UNAVAILABLE_INTERVAL, + max_offline: Self::DEFAULT_MAX_OFFLINE_INTERVAL, + max_warming_up: Self::DEFAULT_MAX_WARMING_UP_INTERVAL, split_threshold: None, } } @@ -509,7 +514,6 @@ impl LocalEnv { #[derive(serde::Serialize, serde::Deserialize)] // (allow unknown fields, unlike PageServerConf) struct PageserverConfigTomlSubset { - id: NodeId, listen_pg_addr: String, listen_http_addr: String, pg_auth_type: AuthType, @@ -521,18 +525,30 @@ impl LocalEnv { .with_context(|| format!("read {:?}", config_toml_path))?, ) .context("parse pageserver.toml")?; + let identity_toml_path = dentry.path().join("identity.toml"); + #[derive(serde::Serialize, serde::Deserialize)] + struct IdentityTomlSubset { + id: NodeId, + } + let identity_toml: IdentityTomlSubset = toml_edit::de::from_str( + &std::fs::read_to_string(&identity_toml_path) + .with_context(|| format!("read {:?}", identity_toml_path))?, + ) + .context("parse identity.toml")?; let PageserverConfigTomlSubset { - id: config_toml_id, listen_pg_addr, listen_http_addr, pg_auth_type, http_auth_type, } = config_toml; + let IdentityTomlSubset { + id: identity_toml_id, + } = identity_toml; let conf = PageServerConf { id: { anyhow::ensure!( - config_toml_id == id, - "id mismatch: config_toml.id={config_toml_id} id={id}", + identity_toml_id == id, + "id mismatch: identity.toml:id={identity_toml_id} pageserver_(.*) id={id}", ); id }, diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index ba4f98d945..399b1c2653 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -127,10 +127,13 @@ impl PageServerNode { } // Apply the user-provided overrides - overrides.push( - toml_edit::ser::to_string_pretty(&conf) - .expect("we deserialized this from toml earlier"), - ); + overrides.push({ + let mut doc = + toml_edit::ser::to_document(&conf).expect("we deserialized this from toml earlier"); + // `id` is written out to `identity.toml` instead of `pageserver.toml` + doc.remove("id").expect("it's part of the struct"); + doc.to_string() + }); // Turn `overrides` into a toml document. // TODO: above code is legacy code, it should be refactored to use toml_edit directly. diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index d7aedd711a..e054e9ee57 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -5,8 +5,9 @@ use crate::{ use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::{ controller_api::{ - NodeConfigureRequest, NodeRegisterRequest, TenantCreateRequest, TenantCreateResponse, - TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse, + NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest, + TenantCreateResponse, TenantLocateResponse, TenantShardMigrateRequest, + TenantShardMigrateResponse, }, models::{ TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo, @@ -353,8 +354,10 @@ impl StorageController { "--dev", "--database-url", &database_url, - "--max-unavailable-interval", - &humantime::Duration::from(self.config.max_unavailable).to_string(), + "--max-offline-interval", + &humantime::Duration::from(self.config.max_offline).to_string(), + "--max-warming-up-interval", + &humantime::Duration::from(self.config.max_warming_up).to_string(), ] .into_iter() .map(|s| s.to_string()) @@ -625,6 +628,15 @@ impl StorageController { .await } + pub async fn node_list(&self) -> anyhow::Result> { + self.dispatch::<(), Vec>( + Method::GET, + "control/v1/node".to_string(), + None, + ) + .await + } + #[instrument(skip(self))] pub async fn ready(&self) -> anyhow::Result<()> { self.dispatch::<(), ()>(Method::GET, "ready".to_string(), None) diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index a00591afd0..10805a9952 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -78,7 +78,7 @@ for pg_version in 14 15 16; do docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/ rm -rf $TMPDIR # We are running tests now - if docker exec -e SKIP=rum-src,timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \ + if docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \ $TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt then cleanup diff --git a/docker-compose/run-tests.sh b/docker-compose/run-tests.sh index c05fc159aa..58b2581197 100644 --- a/docker-compose/run-tests.sh +++ b/docker-compose/run-tests.sh @@ -1,15 +1,15 @@ #!/bin/bash set -x -cd /ext-src +cd /ext-src || exit 2 FAILED= -LIST=$((echo ${SKIP} | sed 's/,/\n/g'; ls -d *-src) | sort | uniq -u) +LIST=$( (echo "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u) for d in ${LIST} do - [ -d ${d} ] || continue + [ -d "${d}" ] || continue psql -c "select 1" >/dev/null || break - make -C ${d} installcheck || FAILED="${d} ${FAILED}" + USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}" done [ -z "${FAILED}" ] && exit 0 -echo ${FAILED} +echo "${FAILED}" exit 1 \ No newline at end of file diff --git a/docs/synthetic-size.md b/docs/synthetic-size.md index 3acb4e18cb..b6b90d90c2 100644 --- a/docs/synthetic-size.md +++ b/docs/synthetic-size.md @@ -21,9 +21,9 @@ implementation where we keep more data than we would need to, do not change the synthetic size or incur any costs to the user. The synthetic size is calculated for the whole project. It is not -straightforward to attribute size to individual branches. See "What is -the size of an individual branch?" for discussion on those -difficulties. +straightforward to attribute size to individual branches. See [What is +the size of an individual branch?](#what-is-the-size-of-an-individual-branch) +for a discussion of those difficulties. The synthetic size is designed to: @@ -40,8 +40,9 @@ The synthetic size is designed to: - logical size is the size of a branch *at a given point in time*. It's the total size of all tables in all databases, as you see with "\l+" in psql for example, plus the Postgres SLRUs and some - small amount of metadata. NOTE that currently, Neon does not include - the SLRUs and metadata in the logical size. See comment to `get_current_logical_size_non_incremental()`. + small amount of metadata. Note that currently, Neon does not include + the SLRUs and metadata in the logical size. Refer to the comment in + [`get_current_logical_size_non_incremental()`](/pageserver/src/pgdatadir_mapping.rs#L813-L814). - a "point in time" is defined as an LSN value. You can convert a timestamp to an LSN, but the storage internally works with LSNs. diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index d0e1eb6b28..a5b452da83 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -1,4 +1,6 @@ +use std::collections::HashSet; use std::str::FromStr; +use std::time::{Duration, Instant}; /// Request/response types for the storage controller /// API (`/control/v1` prefix). Implemented by the server @@ -150,11 +152,16 @@ impl UtilizationScore { } } -#[derive(Serialize, Deserialize, Clone, Copy, Debug)] +#[derive(Serialize, Clone, Copy, Debug)] #[serde(into = "NodeAvailabilityWrapper")] pub enum NodeAvailability { // Normal, happy state Active(UtilizationScore), + // Node is warming up, but we expect it to become available soon. Covers + // the time span between the re-attach response being composed on the storage controller + // and the first successful heartbeat after the processing of the re-attach response + // finishes on the pageserver. + WarmingUp(Instant), // Offline: Tenants shouldn't try to attach here, but they may assume that their // secondary locations on this node still exist. Newly added nodes are in this // state until we successfully contact them. @@ -164,7 +171,10 @@ pub enum NodeAvailability { impl PartialEq for NodeAvailability { fn eq(&self, other: &Self) -> bool { use NodeAvailability::*; - matches!((self, other), (Active(_), Active(_)) | (Offline, Offline)) + matches!( + (self, other), + (Active(_), Active(_)) | (Offline, Offline) | (WarmingUp(_), WarmingUp(_)) + ) } } @@ -176,6 +186,7 @@ impl Eq for NodeAvailability {} #[derive(Serialize, Deserialize, Clone, Copy, Debug)] pub enum NodeAvailabilityWrapper { Active, + WarmingUp, Offline, } @@ -185,6 +196,7 @@ impl From for NodeAvailability { // Assume the worst utilisation score to begin with. It will later be updated by // the heartbeats. NodeAvailabilityWrapper::Active => NodeAvailability::Active(UtilizationScore::worst()), + NodeAvailabilityWrapper::WarmingUp => NodeAvailability::WarmingUp(Instant::now()), NodeAvailabilityWrapper::Offline => NodeAvailability::Offline, } } @@ -194,6 +206,7 @@ impl From for NodeAvailabilityWrapper { fn from(val: NodeAvailability) -> Self { match val { NodeAvailability::Active(_) => NodeAvailabilityWrapper::Active, + NodeAvailability::WarmingUp(_) => NodeAvailabilityWrapper::WarmingUp, NodeAvailability::Offline => NodeAvailabilityWrapper::Offline, } } @@ -282,6 +295,42 @@ pub enum PlacementPolicy { #[derive(Serialize, Deserialize, Debug)] pub struct TenantShardMigrateResponse {} +/// Metadata health record posted from scrubber. +#[derive(Serialize, Deserialize, Debug)] +pub struct MetadataHealthRecord { + pub tenant_shard_id: TenantShardId, + pub healthy: bool, + pub last_scrubbed_at: chrono::DateTime, +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct MetadataHealthUpdateRequest { + pub healthy_tenant_shards: HashSet, + pub unhealthy_tenant_shards: HashSet, +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct MetadataHealthUpdateResponse {} + +#[derive(Serialize, Deserialize, Debug)] + +pub struct MetadataHealthListUnhealthyResponse { + pub unhealthy_tenant_shards: Vec, +} + +#[derive(Serialize, Deserialize, Debug)] + +pub struct MetadataHealthListOutdatedRequest { + #[serde(with = "humantime_serde")] + pub not_scrubbed_for: Duration, +} + +#[derive(Serialize, Deserialize, Debug)] + +pub struct MetadataHealthListOutdatedResponse { + pub health_records: Vec, +} + #[cfg(test)] mod test { use super::*; diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 0acd83753e..3af3f74e9c 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -107,7 +107,10 @@ impl Key { /// As long as Neon does not support tablespace (because of lack of access to local file system), /// we can assume that only some predefined namespace OIDs are used which can fit in u16 pub fn to_i128(&self) -> i128 { - assert!(self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222); + assert!( + self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222, + "invalid key: {self}", + ); (((self.field1 & 0x7F) as i128) << 120) | (((self.field2 & 0xFFFF) as i128) << 104) | ((self.field3 as i128) << 72) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 591c45d908..ab4adfbebe 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -637,6 +637,13 @@ pub struct TenantInfo { pub current_physical_size: Option, // physical size is only included in `tenant_status` endpoint pub attachment_status: TenantAttachmentStatus, pub generation: u32, + + /// Opaque explanation if gc is being blocked. + /// + /// Only looked up for the individual tenant detail, not the listing. This is purely for + /// debugging, not included in openapi. + #[serde(skip_serializing_if = "Option::is_none")] + pub gc_blocking: Option, } #[derive(Serialize, Deserialize, Clone)] @@ -940,6 +947,8 @@ pub struct TopTenantShardsResponse { } pub mod virtual_file { + use std::path::PathBuf; + #[derive( Copy, Clone, @@ -958,6 +967,53 @@ pub mod virtual_file { #[cfg(target_os = "linux")] TokioEpollUring, } + + /// Direct IO modes for a pageserver. + #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)] + #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] + pub enum DirectIoMode { + /// Direct IO disabled (uses usual buffered IO). + #[default] + Disabled, + /// Direct IO disabled (performs checks and perf simulations). + Evaluate { + /// Alignment check level + alignment_check: DirectIoAlignmentCheckLevel, + /// Latency padded for performance simulation. + latency_padding: DirectIoLatencyPadding, + }, + /// Direct IO enabled. + Enabled { + /// Actions to perform on alignment error. + on_alignment_error: DirectIoOnAlignmentErrorAction, + }, + } + + #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)] + #[serde(rename_all = "kebab-case")] + pub enum DirectIoAlignmentCheckLevel { + #[default] + Error, + Log, + None, + } + + #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)] + #[serde(rename_all = "kebab-case")] + pub enum DirectIoOnAlignmentErrorAction { + Error, + #[default] + FallbackToBuffered, + } + + #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)] + #[serde(tag = "type", rename_all = "kebab-case")] + pub enum DirectIoLatencyPadding { + /// Pad virtual file operations with IO to a fake file. + FakeFileRW { path: PathBuf }, + #[default] + None, + } } // Wrapped in libpq CopyData @@ -1427,6 +1483,7 @@ mod tests { current_physical_size: Some(42), attachment_status: TenantAttachmentStatus::Attached, generation: 1, + gc_blocking: None, }; let expected_active = json!({ "id": original_active.id.to_string(), @@ -1449,6 +1506,7 @@ mod tests { current_physical_size: Some(42), attachment_status: TenantAttachmentStatus::Attached, generation: 1, + gc_blocking: None, }; let expected_broken = json!({ "id": original_broken.id.to_string(), diff --git a/libs/pageserver_api/src/models/detach_ancestor.rs b/libs/pageserver_api/src/models/detach_ancestor.rs index ae5a21bab9..ad74d343ae 100644 --- a/libs/pageserver_api/src/models/detach_ancestor.rs +++ b/libs/pageserver_api/src/models/detach_ancestor.rs @@ -1,6 +1,8 @@ +use std::collections::HashSet; + use utils::id::TimelineId; #[derive(Debug, Default, PartialEq, serde::Serialize, serde::Deserialize)] pub struct AncestorDetached { - pub reparented_timelines: Vec, + pub reparented_timelines: HashSet, } diff --git a/libs/postgres_ffi/src/controlfile_utils.rs b/libs/postgres_ffi/src/controlfile_utils.rs index 0918d15001..eaa9450294 100644 --- a/libs/postgres_ffi/src/controlfile_utils.rs +++ b/libs/postgres_ffi/src/controlfile_utils.rs @@ -29,7 +29,7 @@ use anyhow::{bail, Result}; use bytes::{Bytes, BytesMut}; /// Equivalent to sizeof(ControlFileData) in C -const SIZEOF_CONTROLDATA: usize = std::mem::size_of::(); +const SIZEOF_CONTROLDATA: usize = size_of::(); impl ControlFileData { /// Compute the offset of the `crc` field within the `ControlFileData` struct. diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index 54b032d138..6ce855c78e 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -31,7 +31,7 @@ pub const SMGR_TRUNCATE_FSM: u32 = 0x0004; // // Assumes 8 byte alignment -const SIZEOF_PAGE_HEADER_DATA: usize = std::mem::size_of::(); +const SIZEOF_PAGE_HEADER_DATA: usize = size_of::(); pub const MAXALIGN_SIZE_OF_PAGE_HEADER_DATA: usize = (SIZEOF_PAGE_HEADER_DATA + 7) & !7; // @@ -191,7 +191,7 @@ pub const XLR_RMGR_INFO_MASK: u8 = 0xF0; pub const XLOG_TBLSPC_CREATE: u8 = 0x00; pub const XLOG_TBLSPC_DROP: u8 = 0x10; -pub const SIZEOF_XLOGRECORD: u32 = std::mem::size_of::() as u32; +pub const SIZEOF_XLOGRECORD: u32 = size_of::() as u32; // // from xlogrecord.h diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index d25b23663b..9fe7e8198b 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -42,9 +42,9 @@ pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001; pub const XLP_REM_LEN_OFFS: usize = 2 + 2 + 4 + 8; pub const XLOG_RECORD_CRC_OFFS: usize = 4 + 4 + 8 + 1 + 1 + 2; -pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = std::mem::size_of::(); -pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = std::mem::size_of::(); -pub const XLOG_SIZE_OF_XLOG_RECORD: usize = std::mem::size_of::(); +pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = size_of::(); +pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = size_of::(); +pub const XLOG_SIZE_OF_XLOG_RECORD: usize = size_of::(); #[allow(clippy::identity_op)] pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2; @@ -311,7 +311,7 @@ impl XLogLongPageHeaderData { } } -pub const SIZEOF_CHECKPOINT: usize = std::mem::size_of::(); +pub const SIZEOF_CHECKPOINT: usize = size_of::(); impl CheckPoint { pub fn encode(&self) -> Result { diff --git a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs index 750affc94e..79d45de67a 100644 --- a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs +++ b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs @@ -178,7 +178,7 @@ pub fn test_find_end_of_wal_last_crossing_segment() { /// currently 1024. #[test] pub fn test_update_next_xid() { - let checkpoint_buf = [0u8; std::mem::size_of::()]; + let checkpoint_buf = [0u8; size_of::()]; let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap(); checkpoint.nextXid = FullTransactionId { value: 10 }; @@ -204,7 +204,7 @@ pub fn test_update_next_xid() { #[test] pub fn test_update_next_multixid() { - let checkpoint_buf = [0u8; std::mem::size_of::()]; + let checkpoint_buf = [0u8; size_of::()]; let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap(); // simple case diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index acd95a5255..3c77d5a227 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -33,6 +33,7 @@ use tracing::debug; use utils::backoff; use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind}; +use crate::ListingObject; use crate::{ config::AzureConfig, error::Cancelled, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel, @@ -352,7 +353,12 @@ impl RemoteStorage for AzureBlobStorage { let blob_iter = entry .blobs .blobs() - .map(|k| self.name_to_relative_path(&k.name)); + .map(|k| ListingObject{ + key: self.name_to_relative_path(&k.name), + last_modified: k.properties.last_modified.into(), + size: k.properties.content_length, + } + ); for key in blob_iter { res.keys.push(key); diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 0fed86f4b8..2c9e298f79 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -144,15 +144,23 @@ impl RemotePath { /// /// The WithDelimiter mode will populate `prefixes` and `keys` in the result. The /// NoDelimiter mode will only populate `keys`. +#[derive(Copy, Clone)] pub enum ListingMode { WithDelimiter, NoDelimiter, } +#[derive(PartialEq, Eq, Debug)] +pub struct ListingObject { + pub key: RemotePath, + pub last_modified: SystemTime, + pub size: u64, +} + #[derive(Default)] pub struct Listing { pub prefixes: Vec, - pub keys: Vec, + pub keys: Vec, } /// Storage (potentially remote) API to manage its state. @@ -188,7 +196,7 @@ pub trait RemoteStorage: Send + Sync + 'static { mode: ListingMode, max_keys: Option, cancel: &CancellationToken, - ) -> impl Stream>; + ) -> impl Stream> + Send; async fn list( &self, @@ -201,7 +209,7 @@ pub trait RemoteStorage: Send + Sync + 'static { let mut combined = stream.next().await.expect("At least one item required")?; while let Some(list) = stream.next().await { let list = list?; - combined.keys.extend_from_slice(&list.keys); + combined.keys.extend(list.keys.into_iter()); combined.prefixes.extend_from_slice(&list.prefixes); } Ok(combined) @@ -345,10 +353,10 @@ impl GenericRemoteStorage> { mode: ListingMode, max_keys: Option, cancel: &'a CancellationToken, - ) -> impl Stream> + 'a { + ) -> impl Stream> + 'a + Send { match self { Self::LocalFs(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)) - as Pin>>>, + as Pin> + Send>>, Self::AwsS3(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)), Self::AzureBlob(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)), Self::Unreliable(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)), diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index a4857b0bba..99b4aa4061 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -23,8 +23,8 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken}; use utils::crashsafe::path_with_suffix_extension; use crate::{ - Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel, - REMOTE_STORAGE_PREFIX_SEPARATOR, + Download, DownloadError, Listing, ListingMode, ListingObject, RemotePath, TimeTravelError, + TimeoutOrCancel, REMOTE_STORAGE_PREFIX_SEPARATOR, }; use super::{RemoteStorage, StorageMetadata}; @@ -357,19 +357,29 @@ impl RemoteStorage for LocalFs { .list_recursive(prefix) .await .map_err(DownloadError::Other)?; - let keys = keys + let objects = keys .into_iter() - .filter(|k| { + .filter_map(|k| { let path = k.with_base(&self.storage_root); - !path.is_dir() + if path.is_dir() { + None + } else { + Some(ListingObject { + key: k.clone(), + // LocalFs is just for testing, so just specify a dummy time + last_modified: SystemTime::now(), + size: 0, + }) + } }) .collect(); if let ListingMode::NoDelimiter = mode { - result.keys = keys; + result.keys = objects; } else { let mut prefixes = HashSet::new(); - for key in keys { + for object in objects { + let key = object.key; // If the part after the prefix includes a "/", take only the first part and put it in `prefixes`. let relative_key = if let Some(prefix) = prefix { let mut prefix = prefix.clone(); @@ -398,9 +408,12 @@ impl RemoteStorage for LocalFs { .to_owned(); prefixes.insert(first_part); } else { - result - .keys - .push(RemotePath::from_string(&relative_key).unwrap()); + result.keys.push(ListingObject { + key: RemotePath::from_string(&relative_key).unwrap(), + // LocalFs is just for testing + last_modified: SystemTime::now(), + size: 0, + }); } } result.prefixes = prefixes @@ -950,7 +963,11 @@ mod fs_tests { .await?; assert!(listing.prefixes.is_empty()); assert_eq!( - listing.keys.into_iter().collect::>(), + listing + .keys + .into_iter() + .map(|o| o.key) + .collect::>(), HashSet::from([uncle.clone(), child.clone(), child_sibling.clone()]) ); @@ -975,7 +992,7 @@ mod fs_tests { ) .await?; assert_eq!( - listing.keys, + listing.keys.into_iter().map(|o| o.key).collect::>(), [RemotePath::from_string("uncle").unwrap()].to_vec() ); assert_eq!( @@ -992,7 +1009,7 @@ mod fs_tests { &cancel, ) .await?; - assert_eq!(listing.keys, [].to_vec()); + assert_eq!(listing.keys, vec![]); assert_eq!( listing.prefixes, [RemotePath::from_string("grandparent").unwrap()].to_vec() @@ -1007,7 +1024,7 @@ mod fs_tests { &cancel, ) .await?; - assert_eq!(listing.keys, [].to_vec()); + assert_eq!(listing.keys, vec![]); assert_eq!( listing.prefixes, [RemotePath::from_string("grandparent").unwrap()].to_vec() @@ -1040,7 +1057,7 @@ mod fs_tests { &cancel, ) .await?; - assert_eq!(listing.keys, [].to_vec()); + assert_eq!(listing.keys, vec![]); let mut found_prefixes = listing.prefixes.clone(); found_prefixes.sort(); diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 90ed48e06c..1f25da813d 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -44,8 +44,9 @@ use crate::{ error::Cancelled, metrics::{start_counting_cancelled_wait, start_measuring_requests}, support::PermitCarrying, - ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, - TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR, + ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, ListingObject, RemotePath, + RemoteStorage, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, + REMOTE_STORAGE_PREFIX_SEPARATOR, }; use crate::metrics::AttemptOutcome; @@ -548,9 +549,29 @@ impl RemoteStorage for S3Bucket { let mut result = Listing::default(); for object in keys { - let object_path = object.key().expect("response does not contain a key"); - let remote_path = self.s3_object_to_relative_path(object_path); - result.keys.push(remote_path); + let key = object.key().expect("response does not contain a key"); + let key = self.s3_object_to_relative_path(key); + + let last_modified = match object.last_modified.map(SystemTime::try_from) { + Some(Ok(t)) => t, + Some(Err(_)) => { + tracing::warn!("Remote storage last_modified {:?} for {} is out of bounds", + object.last_modified, key + ); + SystemTime::now() + }, + None => { + SystemTime::now() + } + }; + + let size = object.size.unwrap_or(0) as u64; + + result.keys.push(ListingObject{ + key, + last_modified, + size, + }); if let Some(mut mk) = max_keys { assert!(mk > 0); mk -= 1; diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index 67e5be2955..13f873dcdb 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -114,7 +114,7 @@ impl RemoteStorage for UnreliableWrapper { mode: ListingMode, max_keys: Option, cancel: &CancellationToken, - ) -> impl Stream> { + ) -> impl Stream> + Send { async_stream::stream! { self.attempt(RemoteOp::ListPrefixes(prefix.cloned())) .map_err(DownloadError::Other)?; diff --git a/libs/remote_storage/tests/common/tests.rs b/libs/remote_storage/tests/common/tests.rs index 38c316397a..86c55872c1 100644 --- a/libs/remote_storage/tests/common/tests.rs +++ b/libs/remote_storage/tests/common/tests.rs @@ -156,6 +156,7 @@ async fn list_no_delimiter_works( .context("client list root files failure")? .keys .into_iter() + .map(|o| o.key) .collect::>(); assert_eq!( root_files, @@ -182,6 +183,7 @@ async fn list_no_delimiter_works( .context("client list nested files failure")? .keys .into_iter() + .map(|o| o.key) .collect::>(); let trim_remote_blobs: HashSet<_> = ctx .remote_blobs diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index 342bc6da0b..b893beeebd 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -81,6 +81,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: .context("list root files failure")? .keys .into_iter() + .map(|o| o.key) .collect::>(), ) } diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index a1170a460d..7b735875b7 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -18,20 +18,20 @@ const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA; #[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq)] #[serde(rename_all = "lowercase")] pub enum Scope { - // Provides access to all data for a specific tenant (specified in `struct Claims` below) + /// Provides access to all data for a specific tenant (specified in `struct Claims` below) // TODO: join these two? Tenant, - // Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs. - // Should only be used e.g. for status check/tenant creation/list. + /// Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs. + /// Should only be used e.g. for status check/tenant creation/list. PageServerApi, - // Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs. - // Should only be used e.g. for status check. - // Currently also used for connection from any pageserver to any safekeeper. + /// Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs. + /// Should only be used e.g. for status check. + /// Currently also used for connection from any pageserver to any safekeeper. SafekeeperData, - // The scope used by pageservers in upcalls to storage controller and cloud control plane + /// The scope used by pageservers in upcalls to storage controller and cloud control plane #[serde(rename = "generations_api")] GenerationsApi, - // Allows access to control plane managment API and some storage controller endpoints. + /// Allows access to control plane managment API and some storage controller endpoints. Admin, /// Allows access to storage controller APIs used by the scrubber, to interrogate the state diff --git a/libs/utils/src/sync/gate.rs b/libs/utils/src/sync/gate.rs index 156b99a010..16ec563fa7 100644 --- a/libs/utils/src/sync/gate.rs +++ b/libs/utils/src/sync/gate.rs @@ -78,8 +78,9 @@ impl Drop for GateGuard { } } -#[derive(Debug)] +#[derive(Debug, thiserror::Error)] pub enum GateError { + #[error("gate is closed")] GateClosed, } diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 0d9343d643..0e748ee3db 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -49,6 +49,7 @@ postgres_backend.workspace = true postgres-protocol.workspace = true postgres-types.workspace = true rand.workspace = true +range-set-blaze = { version = "0.1.16", features = ["alloc"] } regex.workspace = true scopeguard.workspace = true serde.workspace = true @@ -107,3 +108,7 @@ harness = false [[bench]] name = "bench_walredo" harness = false + +[[bench]] +name = "bench_ingest" +harness = false diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs new file mode 100644 index 0000000000..9bab02e46c --- /dev/null +++ b/pageserver/benches/bench_ingest.rs @@ -0,0 +1,239 @@ +use std::{env, num::NonZeroUsize}; + +use bytes::Bytes; +use camino::Utf8PathBuf; +use criterion::{criterion_group, criterion_main, Criterion}; +use pageserver::{ + config::PageServerConf, + context::{DownloadBehavior, RequestContext}, + l0_flush::{L0FlushConfig, L0FlushGlobalState}, + page_cache, + repository::Value, + task_mgr::TaskKind, + tenant::storage_layer::InMemoryLayer, + virtual_file, +}; +use pageserver_api::{key::Key, shard::TenantShardId}; +use utils::{ + bin_ser::BeSer, + id::{TenantId, TimelineId}, +}; + +// A very cheap hash for generating non-sequential keys. +fn murmurhash32(mut h: u32) -> u32 { + h ^= h >> 16; + h = h.wrapping_mul(0x85ebca6b); + h ^= h >> 13; + h = h.wrapping_mul(0xc2b2ae35); + h ^= h >> 16; + h +} + +enum KeyLayout { + /// Sequential unique keys + Sequential, + /// Random unique keys + Random, + /// Random keys, but only use the bits from the mask of them + RandomReuse(u32), +} + +enum WriteDelta { + Yes, + No, +} + +async fn ingest( + conf: &'static PageServerConf, + put_size: usize, + put_count: usize, + key_layout: KeyLayout, + write_delta: WriteDelta, +) -> anyhow::Result<()> { + let mut lsn = utils::lsn::Lsn(1000); + let mut key = Key::from_i128(0x0); + + let timeline_id = TimelineId::generate(); + let tenant_id = TenantId::generate(); + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + + tokio::fs::create_dir_all(conf.timeline_path(&tenant_shard_id, &timeline_id)).await?; + + let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); + + let gate = utils::sync::gate::Gate::default(); + let entered = gate.enter().unwrap(); + + let layer = + InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?; + + let data = Value::Image(Bytes::from(vec![0u8; put_size])).ser()?; + let ctx = RequestContext::new( + pageserver::task_mgr::TaskKind::WalReceiverConnectionHandler, + pageserver::context::DownloadBehavior::Download, + ); + + for i in 0..put_count { + lsn += put_size as u64; + + // Generate lots of keys within a single relation, which simulates the typical bulk ingest case: people + // usually care the most about write performance when they're blasting a huge batch of data into a huge table. + match key_layout { + KeyLayout::Sequential => { + // Use sequential order to illustrate the experience a user is likely to have + // when ingesting bulk data. + key.field6 = i as u32; + } + KeyLayout::Random => { + // Use random-order keys to avoid giving a false advantage to data structures that are + // faster when inserting on the end. + key.field6 = murmurhash32(i as u32); + } + KeyLayout::RandomReuse(mask) => { + // Use low bits only, to limit cardinality + key.field6 = murmurhash32(i as u32) & mask; + } + } + + layer.put_value(key, lsn, &data, &ctx).await?; + } + layer.freeze(lsn + 1).await; + + if matches!(write_delta, WriteDelta::Yes) { + let l0_flush_state = L0FlushGlobalState::new(L0FlushConfig::Direct { + max_concurrency: NonZeroUsize::new(1).unwrap(), + }); + let (_desc, path) = layer + .write_to_disk(&ctx, None, l0_flush_state.inner()) + .await? + .unwrap(); + tokio::fs::remove_file(path).await?; + } + + Ok(()) +} + +/// Wrapper to instantiate a tokio runtime +fn ingest_main( + conf: &'static PageServerConf, + put_size: usize, + put_count: usize, + key_layout: KeyLayout, + write_delta: WriteDelta, +) { + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + + runtime.block_on(async move { + let r = ingest(conf, put_size, put_count, key_layout, write_delta).await; + if let Err(e) = r { + panic!("{e:?}"); + } + }); +} + +/// Declare a series of benchmarks for the Pageserver's ingest write path. +/// +/// This benchmark does not include WAL decode: it starts at InMemoryLayer::put_value, and ends either +/// at freezing the ephemeral layer, or writing the ephemeral layer out to an L0 (depending on whether WriteDelta is set). +/// +/// Genuine disk I/O is used, so expect results to differ depending on storage. However, when running on +/// a fast disk, CPU is the bottleneck at time of writing. +fn criterion_benchmark(c: &mut Criterion) { + let temp_dir_parent: Utf8PathBuf = env::current_dir().unwrap().try_into().unwrap(); + let temp_dir = camino_tempfile::tempdir_in(temp_dir_parent).unwrap(); + eprintln!("Data directory: {}", temp_dir.path()); + + let conf: &'static PageServerConf = Box::leak(Box::new( + pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()), + )); + virtual_file::init(16384, virtual_file::io_engine_for_bench()); + page_cache::init(conf.page_cache_size); + + { + let mut group = c.benchmark_group("ingest-small-values"); + let put_size = 100usize; + let put_count = 128 * 1024 * 1024 / put_size; + group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64)); + group.sample_size(10); + group.bench_function("ingest 128MB/100b seq", |b| { + b.iter(|| { + ingest_main( + conf, + put_size, + put_count, + KeyLayout::Sequential, + WriteDelta::Yes, + ) + }) + }); + group.bench_function("ingest 128MB/100b rand", |b| { + b.iter(|| { + ingest_main( + conf, + put_size, + put_count, + KeyLayout::Random, + WriteDelta::Yes, + ) + }) + }); + group.bench_function("ingest 128MB/100b rand-1024keys", |b| { + b.iter(|| { + ingest_main( + conf, + put_size, + put_count, + KeyLayout::RandomReuse(0x3ff), + WriteDelta::Yes, + ) + }) + }); + group.bench_function("ingest 128MB/100b seq, no delta", |b| { + b.iter(|| { + ingest_main( + conf, + put_size, + put_count, + KeyLayout::Sequential, + WriteDelta::No, + ) + }) + }); + } + + { + let mut group = c.benchmark_group("ingest-big-values"); + let put_size = 8192usize; + let put_count = 128 * 1024 * 1024 / put_size; + group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64)); + group.sample_size(10); + group.bench_function("ingest 128MB/8k seq", |b| { + b.iter(|| { + ingest_main( + conf, + put_size, + put_count, + KeyLayout::Sequential, + WriteDelta::Yes, + ) + }) + }); + group.bench_function("ingest 128MB/8k seq, no delta", |b| { + b.iter(|| { + ingest_main( + conf, + put_size, + put_count, + KeyLayout::Sequential, + WriteDelta::No, + ) + }) + }); + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 1d02aa7709..1353e79f7c 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -1,3 +1,4 @@ +use criterion::measurement::WallTime; use pageserver::keyspace::{KeyPartitioning, KeySpace}; use pageserver::repository::Key; use pageserver::tenant::layer_map::LayerMap; @@ -15,7 +16,11 @@ use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; -use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use criterion::{black_box, criterion_group, criterion_main, BenchmarkGroup, Criterion}; + +fn fixture_path(relative: &str) -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative) +} fn build_layer_map(filename_dump: PathBuf) -> LayerMap { let mut layer_map = LayerMap::default(); @@ -109,7 +114,7 @@ fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning // between each test run. fn bench_from_captest_env(c: &mut Criterion) { // TODO consider compressing this file - let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt")); + let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt")); let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map); // Test with uniform query pattern @@ -139,7 +144,7 @@ fn bench_from_captest_env(c: &mut Criterion) { fn bench_from_real_project(c: &mut Criterion) { // Init layer map let now = Instant::now(); - let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt")); + let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt")); println!("Finished layer map init in {:?}", now.elapsed()); // Choose uniformly distributed queries @@ -242,7 +247,72 @@ fn bench_sequential(c: &mut Criterion) { group.finish(); } +fn bench_visibility_with_map( + group: &mut BenchmarkGroup, + layer_map: LayerMap, + read_points: Vec, + bench_name: &str, +) { + group.bench_function(bench_name, |b| { + b.iter(|| black_box(layer_map.get_visibility(read_points.clone()))); + }); +} + +// Benchmark using synthetic data. Arrange image layers on stacked diagonal lines. +fn bench_visibility(c: &mut Criterion) { + let mut group = c.benchmark_group("visibility"); + { + // Init layer map. Create 100_000 layers arranged in 1000 diagonal lines. + let now = Instant::now(); + let mut layer_map = LayerMap::default(); + let mut updates = layer_map.batch_update(); + for i in 0..100_000 { + let i32 = (i as u32) % 100; + let zero = Key::from_hex("000000000000000000000000000000000000").unwrap(); + let layer = PersistentLayerDesc::new_img( + TenantShardId::unsharded(TenantId::generate()), + TimelineId::generate(), + zero.add(10 * i32)..zero.add(10 * i32 + 1), + Lsn(i), + 0, + ); + updates.insert_historic(layer); + } + updates.flush(); + println!("Finished layer map init in {:?}", now.elapsed()); + + let mut read_points = Vec::new(); + for i in (0..100_000).step_by(1000) { + read_points.push(Lsn(i)); + } + + bench_visibility_with_map(&mut group, layer_map, read_points, "sequential"); + } + + { + let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt")); + let read_points = vec![Lsn(0x1C760FA190)]; + bench_visibility_with_map(&mut group, layer_map, read_points, "real_map"); + + let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt")); + let read_points = vec![ + Lsn(0x1C760FA190), + Lsn(0x000000931BEAD539), + Lsn(0x000000931BF63011), + Lsn(0x000000931B33AE68), + Lsn(0x00000038E67ABFA0), + Lsn(0x000000931B33AE68), + Lsn(0x000000914E3F38F0), + Lsn(0x000000931B33AE68), + ]; + bench_visibility_with_map(&mut group, layer_map, read_points, "real_map_many_branches"); + } + + group.finish(); +} + criterion_group!(group_1, bench_from_captest_env); criterion_group!(group_2, bench_from_real_project); criterion_group!(group_3, bench_sequential); -criterion_main!(group_1, group_2, group_3); +criterion_group!(group_4, bench_visibility); +criterion_main!(group_1, group_2, group_3, group_4); diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 7a96c86ded..932918410c 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -17,11 +17,9 @@ use pageserver::config::PageserverIdentity; use pageserver::control_plane_client::ControlPlaneClient; use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; -use pageserver::task_mgr::WALRECEIVER_RUNTIME; +use pageserver::task_mgr::{COMPUTE_REQUEST_RUNTIME, WALRECEIVER_RUNTIME}; use pageserver::tenant::{secondary, TenantSharedResources}; -use pageserver::{ - CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, LibpqEndpointListener, -}; +use pageserver::{CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener}; use remote_storage::GenericRemoteStorage; use tokio::signal::unix::SignalKind; use tokio::time::Instant; @@ -31,11 +29,9 @@ use tracing::*; use metrics::set_build_info_metric; use pageserver::{ config::PageServerConf, - context::{DownloadBehavior, RequestContext}, deletion_queue::DeletionQueue, http, page_cache, page_service, task_mgr, - task_mgr::TaskKind, - task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME}, + task_mgr::{BACKGROUND_RUNTIME, MGMT_REQUEST_RUNTIME}, tenant::mgr, virtual_file, }; @@ -127,8 +123,10 @@ fn main() -> anyhow::Result<()> { // after setting up logging, log the effective IO engine choice and read path implementations info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); + info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings"); info!(?conf.get_impl, "starting with get page implementation"); info!(?conf.get_vectored_impl, "starting with vectored get page implementation"); + info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access"); let tenants_path = conf.tenants_path(); if !tenants_path.exists() { @@ -593,30 +591,13 @@ fn start_pageserver( // Spawn a task to listen for libpq connections. It will spawn further tasks // for each connection. We created the listener earlier already. - let libpq_listener = { - let cancel = CancellationToken::new(); - let libpq_ctx = RequestContext::todo_child( - TaskKind::LibpqEndpointListener, - // listener task shouldn't need to download anything. (We will - // create a separate sub-contexts for each connection, with their - // own download behavior. This context is used only to listen and - // accept connections.) - DownloadBehavior::Error, - ); - - let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( - "libpq listener", - page_service::libpq_listener_main( - tenant_manager.clone(), - pg_auth, - pageserver_listener, - conf.pg_auth_type, - libpq_ctx, - cancel.clone(), - ), - )); - LibpqEndpointListener(CancellableTask { task, cancel }) - }; + let page_service = page_service::spawn(conf, tenant_manager.clone(), pg_auth, { + let _entered = COMPUTE_REQUEST_RUNTIME.enter(); // TcpListener::from_std requires it + pageserver_listener + .set_nonblocking(true) + .context("set listener to nonblocking")?; + tokio::net::TcpListener::from_std(pageserver_listener).context("create tokio listener")? + }); let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard()); @@ -644,7 +625,7 @@ fn start_pageserver( shutdown_pageserver.take(); pageserver::shutdown_pageserver( http_endpoint_listener, - libpq_listener, + page_service, consumption_metrics_tasks, disk_usage_eviction_task, &tenant_manager, diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 20e78b1d85..f4c367bd4d 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -29,6 +29,7 @@ use utils::{ logging::LogFormat, }; +use crate::tenant::timeline::compaction::CompactL0Phase1ValueAccess; use crate::tenant::vectored_blob_io::MaxVectoredReadBytes; use crate::tenant::{config::TenantConfOpt, timeline::GetImpl}; use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; @@ -52,7 +53,7 @@ pub mod defaults { use pageserver_api::models::ImageCompressionAlgorithm; pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT; - pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s"; + pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s"; pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; pub const DEFAULT_SUPERUSER: &str = "cloud_admin"; @@ -83,16 +84,16 @@ pub mod defaults { #[cfg(not(target_os = "linux"))] pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs"; - pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential"; + pub const DEFAULT_GET_VECTORED_IMPL: &str = "vectored"; - pub const DEFAULT_GET_IMPL: &str = "legacy"; + pub const DEFAULT_GET_IMPL: &str = "vectored"; pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm = ImageCompressionAlgorithm::Disabled; - pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true; + pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false; pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; @@ -295,6 +296,13 @@ pub struct PageServerConf { pub ephemeral_bytes_per_memory_kb: usize, pub l0_flush: L0FlushConfig, + + /// This flag is temporary and will be removed after gradual rollout. + /// See . + pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess, + + /// Direct IO settings + pub virtual_file_direct_io: virtual_file::DirectIoMode, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -356,8 +364,6 @@ struct PageServerConfigBuilder { auth_validation_public_key_path: BuilderValue>, remote_storage_config: BuilderValue>, - id: BuilderValue, - broker_endpoint: BuilderValue, broker_keepalive_interval: BuilderValue, @@ -403,14 +409,15 @@ struct PageServerConfigBuilder { ephemeral_bytes_per_memory_kb: BuilderValue, l0_flush: BuilderValue, + + compact_level0_phase1_value_access: BuilderValue, + + virtual_file_direct_io: BuilderValue, } impl PageServerConfigBuilder { - fn new(node_id: NodeId) -> Self { - let mut this = Self::default(); - this.id(node_id); - - this + fn new() -> Self { + Self::default() } #[inline(always)] @@ -438,7 +445,6 @@ impl PageServerConfigBuilder { pg_auth_type: Set(AuthType::Trust), auth_validation_public_key_path: Set(None), remote_storage_config: Set(None), - id: NotSet, broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT .parse() .expect("failed to parse default broker endpoint")), @@ -496,6 +502,8 @@ impl PageServerConfigBuilder { validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET), ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), l0_flush: Set(L0FlushConfig::default()), + compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()), + virtual_file_direct_io: Set(virtual_file::DirectIoMode::default()), } } } @@ -568,10 +576,6 @@ impl PageServerConfigBuilder { self.broker_keepalive_interval = BuilderValue::Set(broker_keepalive_interval) } - pub fn id(&mut self, node_id: NodeId) { - self.id = BuilderValue::Set(node_id) - } - pub fn log_format(&mut self, log_format: LogFormat) { self.log_format = BuilderValue::Set(log_format) } @@ -683,7 +687,15 @@ impl PageServerConfigBuilder { self.l0_flush = BuilderValue::Set(value); } - pub fn build(self) -> anyhow::Result { + pub fn compact_level0_phase1_value_access(&mut self, value: CompactL0Phase1ValueAccess) { + self.compact_level0_phase1_value_access = BuilderValue::Set(value); + } + + pub fn virtual_file_direct_io(&mut self, value: virtual_file::DirectIoMode) { + self.virtual_file_direct_io = BuilderValue::Set(value); + } + + pub fn build(self, id: NodeId) -> anyhow::Result { let default = Self::default_values(); macro_rules! conf { @@ -716,7 +728,6 @@ impl PageServerConfigBuilder { pg_auth_type, auth_validation_public_key_path, remote_storage_config, - id, broker_endpoint, broker_keepalive_interval, log_format, @@ -741,9 +752,12 @@ impl PageServerConfigBuilder { image_compression, ephemeral_bytes_per_memory_kb, l0_flush, + compact_level0_phase1_value_access, + virtual_file_direct_io, } CUSTOM LOGIC { + id: id, // TenantConf is handled separately default_tenant_conf: TenantConf::default(), concurrent_tenant_warmup: ConfigurableSemaphore::new({ @@ -893,7 +907,7 @@ impl PageServerConf { toml: &Document, workdir: &Utf8Path, ) -> anyhow::Result { - let mut builder = PageServerConfigBuilder::new(node_id); + let mut builder = PageServerConfigBuilder::new(); builder.workdir(workdir.to_owned()); let mut t_conf = TenantConfOpt::default(); @@ -924,8 +938,6 @@ impl PageServerConf { "tenant_config" => { t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?; } - "id" => {}, // Ignoring `id` field in pageserver.toml - using identity.toml as the source of truth - // Logging is not set up yet, so we can't do it. "broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?), "broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?), "log_format" => builder.log_format( @@ -1014,11 +1026,17 @@ impl PageServerConf { "l0_flush" => { builder.l0_flush(utils::toml_edit_ext::deserialize_item(item).context("l0_flush")?) } + "compact_level0_phase1_value_access" => { + builder.compact_level0_phase1_value_access(utils::toml_edit_ext::deserialize_item(item).context("compact_level0_phase1_value_access")?) + } + "virtual_file_direct_io" => { + builder.virtual_file_direct_io(utils::toml_edit_ext::deserialize_item(item).context("virtual_file_direct_io")?) + } _ => bail!("unrecognized pageserver option '{key}'"), } } - let mut conf = builder.build().context("invalid config")?; + let mut conf = builder.build(node_id).context("invalid config")?; if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT { let auth_validation_public_key_path = conf @@ -1098,6 +1116,8 @@ impl PageServerConf { validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, l0_flush: L0FlushConfig::default(), + compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), + virtual_file_direct_io: virtual_file::DirectIoMode::default(), } } } @@ -1255,7 +1275,6 @@ max_file_descriptors = 333 # initial superuser role name to use when creating a new tenant initial_superuser_name = 'zzzz' -id = 10 metric_collection_interval = '222 s' metric_collection_endpoint = 'http://localhost:80/metrics' @@ -1272,9 +1291,8 @@ background_task_maximum_delay = '334 s' let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; let broker_endpoint = storage_broker::DEFAULT_ENDPOINT; // we have to create dummy values to overcome the validation errors - let config_string = format!( - "pg_distrib_dir='{pg_distrib_dir}'\nid=10\nbroker_endpoint = '{broker_endpoint}'", - ); + let config_string = + format!("pg_distrib_dir='{pg_distrib_dir}'\nbroker_endpoint = '{broker_endpoint}'",); let toml = config_string.parse()?; let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) @@ -1341,6 +1359,8 @@ background_task_maximum_delay = '334 s' image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, l0_flush: L0FlushConfig::default(), + compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), + virtual_file_direct_io: virtual_file::DirectIoMode::default(), }, "Correct defaults should be used when no config values are provided" ); @@ -1415,6 +1435,8 @@ background_task_maximum_delay = '334 s' image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, l0_flush: L0FlushConfig::default(), + compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), + virtual_file_direct_io: virtual_file::DirectIoMode::default(), }, "Should be able to parse all basic config values correctly" ); @@ -1579,7 +1601,6 @@ broker_endpoint = '{broker_endpoint}' r#"pg_distrib_dir = "{pg_distrib_dir}" metric_collection_endpoint = "http://sample.url" metric_collection_interval = "10min" -id = 222 [disk_usage_based_eviction] max_usage_pct = 80 @@ -1649,7 +1670,6 @@ threshold = "20m" r#"pg_distrib_dir = "{pg_distrib_dir}" metric_collection_endpoint = "http://sample.url" metric_collection_interval = "10min" -id = 222 [tenant_config] evictions_low_residence_duration_metric_threshold = "20m" diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index 26e7cc7ef8..b5d9267d79 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -171,14 +171,14 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { register, }; - fail::fail_point!("control-plane-client-re-attach"); - let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?; tracing::info!( "Received re-attach response with {} tenants", response.tenants.len() ); + failpoint_support::sleep_millis_async!("control-plane-client-re-attach"); + Ok(response .tenants .into_iter() diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 59e646d0ca..42086dc2e6 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -308,6 +308,45 @@ paths: application/json: schema: type: string + + /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/block_gc: + parameters: + - name: tenant_shard_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + format: hex + post: + description: Persistently add a gc blocking at the tenant level because of this timeline + responses: + "200": + description: OK + + /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/unblock_gc: + parameters: + - name: tenant_shard_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + format: hex + post: + description: Persistently remove a tenant level gc blocking for this timeline + responses: + "200": + description: OK + /v1/tenant/{tenant_shard_id}/location_config: parameters: - name: tenant_shard_id @@ -893,7 +932,7 @@ components: description: Whether to poll remote storage for layers to download. If false, secondary locations don't download anything. ArchivalConfigRequest: type: object - required + required: - state properties: state: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index d63c240365..a983d8c4c2 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -296,6 +296,11 @@ impl From for ApiError { GetActiveTenantError::WaitForActiveTimeout { .. } => { ApiError::ResourceUnavailable(format!("{}", e).into()) } + GetActiveTenantError::SwitchedTenant => { + // in our HTTP handlers, this error doesn't happen + // TODO: separate error types + ApiError::ResourceUnavailable("switched tenant".into()) + } } } } @@ -930,6 +935,7 @@ async fn tenant_list_handler( generation: (*gen) .into() .expect("Tenants are always attached with a generation"), + gc_blocking: None, }) .collect::>(); @@ -981,6 +987,7 @@ async fn tenant_status( .generation() .into() .expect("Tenants are always attached with a generation"), + gc_blocking: tenant.gc_block.summary().map(|x| format!("{x:?}")), }, walredo: tenant.wal_redo_manager_status(), timelines: tenant.list_timeline_ids(), @@ -1155,7 +1162,10 @@ async fn layer_map_info_handler( let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; - let layer_map_info = timeline.layer_map_info(reset).await; + let layer_map_info = timeline + .layer_map_info(reset) + .await + .map_err(|_shutdown| ApiError::ShuttingDown)?; json_response(StatusCode::OK, layer_map_info) } @@ -1221,6 +1231,72 @@ async fn evict_timeline_layer_handler( } } +async fn timeline_gc_blocking_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + block_or_unblock_gc(request, true).await +} + +async fn timeline_gc_unblocking_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + block_or_unblock_gc(request, false).await +} + +/// Adding a block is `POST ../block_gc`, removing a block is `POST ../unblock_gc`. +/// +/// Both are technically unsafe because they might fire off index uploads, thus they are POST. +async fn block_or_unblock_gc( + request: Request, + block: bool, +) -> Result, ApiError> { + use crate::tenant::{ + remote_timeline_client::WaitCompletionError, upload_queue::NotInitialized, + }; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let state = get_state(&request); + + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + + let timeline = tenant.get_timeline(timeline_id, true)?; + + let fut = async { + if block { + timeline.block_gc(&tenant).await.map(|_| ()) + } else { + timeline.unblock_gc(&tenant).await + } + }; + + let span = tracing::info_span!( + "block_or_unblock_gc", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug(), + timeline_id = %timeline_id, + block = block, + ); + + let res = fut.instrument(span).await; + + res.map_err(|e| { + if e.is::() || e.is::() { + ApiError::ShuttingDown + } else { + ApiError::InternalServerError(e) + } + })?; + + json_response(StatusCode::OK, ()) +} + /// Get tenant_size SVG graph along with the JSON data. fn synthetic_size_html_response( inputs: ModelInputs, @@ -1650,7 +1726,9 @@ async fn timeline_compact_handler( .await .map_err(|e| ApiError::InternalServerError(e.into()))?; if wait_until_uploaded { - timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?; + timeline.remote_client.wait_completion().await + // XXX map to correct ApiError for the cases where it's due to shutdown + .context("wait completion").map_err(ApiError::InternalServerError)?; } json_response(StatusCode::OK, ()) } @@ -1709,7 +1787,9 @@ async fn timeline_checkpoint_handler( } if wait_until_uploaded { - timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?; + timeline.remote_client.wait_completion().await + // XXX map to correct ApiError for the cases where it's due to shutdown + .context("wait completion").map_err(ApiError::InternalServerError)?; } json_response(StatusCode::OK, ()) @@ -2125,14 +2205,24 @@ async fn secondary_download_handler( let timeout = wait.unwrap_or(Duration::MAX); - let status = match tokio::time::timeout( + let result = tokio::time::timeout( timeout, state.secondary_controller.download_tenant(tenant_shard_id), ) - .await - { - // Download job ran to completion. - Ok(Ok(())) => StatusCode::OK, + .await; + + let progress = secondary_tenant.progress.lock().unwrap().clone(); + + let status = match result { + Ok(Ok(())) => { + if progress.layers_downloaded >= progress.layers_total { + // Download job ran to completion + StatusCode::OK + } else { + // Download dropped out without errors because it ran out of time budget + StatusCode::ACCEPTED + } + } // Edge case: downloads aren't usually fallible: things like a missing heatmap are considered // okay. We could get an error here in the unlikely edge case that the tenant // was detached between our check above and executing the download job. @@ -2142,8 +2232,6 @@ async fn secondary_download_handler( Err(_) => StatusCode::ACCEPTED, }; - let progress = secondary_tenant.progress.lock().unwrap().clone(); - json_response(status, progress) } @@ -2887,6 +2975,14 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name", |r| api_handler(r, evict_timeline_layer_handler), ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/block_gc", + |r| api_handler(r, timeline_gc_blocking_handler), + ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/unblock_gc", + |r| api_handler(r, timeline_gc_unblocking_handler), + ) .post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| { api_handler(r, secondary_upload_handler) }) diff --git a/pageserver/src/l0_flush.rs b/pageserver/src/l0_flush.rs index 7fe8fedc63..10187f2ba3 100644 --- a/pageserver/src/l0_flush.rs +++ b/pageserver/src/l0_flush.rs @@ -2,19 +2,29 @@ use std::{num::NonZeroUsize, sync::Arc}; use crate::tenant::ephemeral_file; -#[derive(Default, Debug, PartialEq, Eq, Clone, serde::Deserialize)] +#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)] #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] pub enum L0FlushConfig { - #[default] PageCached, #[serde(rename_all = "snake_case")] - Direct { max_concurrency: NonZeroUsize }, + Direct { + max_concurrency: NonZeroUsize, + }, +} + +impl Default for L0FlushConfig { + fn default() -> Self { + Self::Direct { + // TODO: using num_cpus results in different peak memory usage on different instance types. + max_concurrency: NonZeroUsize::new(usize::max(1, num_cpus::get())).unwrap(), + } + } } #[derive(Clone)] pub struct L0FlushGlobalState(Arc); -pub(crate) enum Inner { +pub enum Inner { PageCached, Direct { semaphore: tokio::sync::Semaphore }, } @@ -30,7 +40,7 @@ impl L0FlushGlobalState { } } - pub(crate) fn inner(&self) -> &Arc { + pub fn inner(&self) -> &Arc { &self.0 } } diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index d944019641..5aee13cfc6 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -12,6 +12,8 @@ pub mod disk_usage_eviction_task; pub mod http; pub mod import_datadir; pub mod l0_flush; + +use futures::{stream::FuturesUnordered, StreamExt}; pub use pageserver_api::keyspace; use tokio_util::sync::CancellationToken; pub mod aux_file; @@ -30,14 +32,13 @@ pub mod walingest; pub mod walrecord; pub mod walredo; -use crate::task_mgr::TaskKind; use camino::Utf8Path; use deletion_queue::DeletionQueue; use tenant::{ mgr::{BackgroundPurges, TenantManager}, secondary, }; -use tracing::info; +use tracing::{info, info_span}; /// Current storage format version /// @@ -63,7 +64,6 @@ pub struct CancellableTask { pub cancel: CancellationToken, } pub struct HttpEndpointListener(pub CancellableTask); -pub struct LibpqEndpointListener(pub CancellableTask); pub struct ConsumptionMetricsTasks(pub CancellableTask); pub struct DiskUsageEvictionTask(pub CancellableTask); impl CancellableTask { @@ -77,7 +77,7 @@ impl CancellableTask { #[allow(clippy::too_many_arguments)] pub async fn shutdown_pageserver( http_listener: HttpEndpointListener, - libpq_listener: LibpqEndpointListener, + page_service: page_service::Listener, consumption_metrics_worker: ConsumptionMetricsTasks, disk_usage_eviction_task: Option, tenant_manager: &TenantManager, @@ -87,10 +87,83 @@ pub async fn shutdown_pageserver( exit_code: i32, ) { use std::time::Duration; + + // If the orderly shutdown below takes too long, we still want to make + // sure that all walredo processes are killed and wait()ed on by us, not systemd. + // + // (Leftover walredo processes are the hypothesized trigger for the systemd freezes + // that we keep seeing in prod => https://github.com/neondatabase/cloud/issues/11387. + // + // We use a thread instead of a tokio task because the background runtime is likely busy + // with the final flushing / uploads. This activity here has priority, and due to lack + // of scheduling priority feature sin the tokio scheduler, using a separate thread is + // an effective priority booster. + let walredo_extraordinary_shutdown_thread_span = { + let span = info_span!(parent: None, "walredo_extraordinary_shutdown_thread"); + span.follows_from(tracing::Span::current()); + span + }; + let walredo_extraordinary_shutdown_thread_cancel = CancellationToken::new(); + let walredo_extraordinary_shutdown_thread = std::thread::spawn({ + let walredo_extraordinary_shutdown_thread_cancel = + walredo_extraordinary_shutdown_thread_cancel.clone(); + move || { + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + let _entered = rt.enter(); + let _entered = walredo_extraordinary_shutdown_thread_span.enter(); + if let Ok(()) = rt.block_on(tokio::time::timeout( + Duration::from_secs(8), + walredo_extraordinary_shutdown_thread_cancel.cancelled(), + )) { + info!("cancellation requested"); + return; + } + let managers = tenant::WALREDO_MANAGERS + .lock() + .unwrap() + // prevents new walredo managers from being inserted + .take() + .expect("only we take()"); + // Use FuturesUnordered to get in queue early for each manager's + // heavier_once_cell semaphore wait list. + // Also, for idle tenants that for some reason haven't + // shut down yet, it's quite likely that we're not going + // to get Poll::Pending once. + let mut futs: FuturesUnordered<_> = managers + .into_iter() + .filter_map(|(_, mgr)| mgr.upgrade()) + .map(|mgr| async move { tokio::task::unconstrained(mgr.shutdown()).await }) + .collect(); + info!(count=%futs.len(), "built FuturesUnordered"); + let mut last_log_at = std::time::Instant::now(); + #[derive(Debug, Default)] + struct Results { + initiated: u64, + already: u64, + } + let mut results = Results::default(); + while let Some(we_initiated) = rt.block_on(futs.next()) { + if we_initiated { + results.initiated += 1; + } else { + results.already += 1; + } + if last_log_at.elapsed() > Duration::from_millis(100) { + info!(remaining=%futs.len(), ?results, "progress"); + last_log_at = std::time::Instant::now(); + } + } + info!(?results, "done"); + } + }); + // Shut down the libpq endpoint task. This prevents new connections from // being accepted. - timed( - libpq_listener.0.shutdown(), + let remaining_connections = timed( + page_service.stop_accepting(), "shutdown LibpqEndpointListener", Duration::from_secs(1), ) @@ -108,7 +181,7 @@ pub async fn shutdown_pageserver( // Shut down any page service tasks: any in-progress work for particular timelines or tenants // should already have been canclled via mgr::shutdown_all_tenants timed( - task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None), + remaining_connections.shutdown(), "shutdown PageRequestHandlers", Duration::from_secs(1), ) @@ -162,6 +235,12 @@ pub async fn shutdown_pageserver( Duration::from_secs(1), ) .await; + + info!("cancel & join walredo_extraordinary_shutdown_thread"); + walredo_extraordinary_shutdown_thread_cancel.cancel(); + walredo_extraordinary_shutdown_thread.join().unwrap(); + info!("walredo_extraordinary_shutdown_thread done"); + info!("Shut down successfully completed"); std::process::exit(exit_code); } diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index c03567f6ef..cd2cd43f27 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -525,6 +525,15 @@ static RESIDENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static VISIBLE_PHYSICAL_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_visible_physical_size", + "The size of the layer files present in the pageserver's filesystem.", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy = Lazy::new(|| { register_uint_gauge!( "pageserver_resident_physical_size_global", @@ -613,7 +622,23 @@ pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy = Lazy::new(|| { pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_compression_image_in_bytes_total", - "Size of uncompressed data written into image layers" + "Size of data written into image layers before compression" + ) + .expect("failed to define a metric") +}); + +pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_compression_image_in_bytes_considered", + "Size of potentially compressible data written into image layers before compression" + ) + .expect("failed to define a metric") +}); + +pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_compression_image_in_bytes_chosen", + "Size of data whose compressed form was written into image layers" ) .expect("failed to define a metric") }); @@ -2188,6 +2213,7 @@ pub(crate) struct TimelineMetrics { pub(crate) layer_count_delta: UIntGauge, pub standby_horizon_gauge: IntGauge, pub resident_physical_size_gauge: UIntGauge, + pub visible_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size pub current_logical_size_gauge: UIntGauge, pub aux_file_size_gauge: IntGauge, @@ -2310,6 +2336,9 @@ impl TimelineMetrics { let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + let visible_physical_size_gauge = VISIBLE_PHYSICAL_SIZE + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); // TODO: we shouldn't expose this metric let current_logical_size_gauge = CURRENT_LOGICAL_SIZE .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) @@ -2364,6 +2393,7 @@ impl TimelineMetrics { layer_count_delta, standby_horizon_gauge, resident_physical_size_gauge, + visible_physical_size_gauge, current_logical_size_gauge, aux_file_size_gauge, directory_entries_count_gauge, @@ -2415,6 +2445,7 @@ impl TimelineMetrics { RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get()); let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); } + let _ = VISIBLE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) { let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]); @@ -3104,6 +3135,8 @@ pub fn preinitialize_metrics() { &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_SUCCESSES, &REMOTE_ONDEMAND_DOWNLOADED_LAYERS, &REMOTE_ONDEMAND_DOWNLOADED_BYTES, + &CIRCUIT_BREAKERS_BROKEN, + &CIRCUIT_BREAKERS_UNBROKEN, ] .into_iter() .for_each(|c| { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 6353f713e0..81294291a9 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -4,9 +4,8 @@ use anyhow::Context; use async_compression::tokio::write::GzipEncoder; use bytes::Buf; -use futures::stream::FuturesUnordered; -use futures::StreamExt; -use pageserver_api::key::Key; +use futures::FutureExt; +use once_cell::sync::OnceCell; use pageserver_api::models::TenantState; use pageserver_api::models::{ PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, @@ -15,28 +14,23 @@ use pageserver_api::models::{ PagestreamGetSlruSegmentRequest, PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest, PagestreamNblocksResponse, PagestreamProtocolVersion, }; -use pageserver_api::shard::ShardIndex; -use pageserver_api::shard::ShardNumber; use pageserver_api::shard::TenantShardId; use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError}; use pq_proto::framed::ConnectionError; use pq_proto::FeStartupPacket; use pq_proto::{BeMessage, FeMessage, RowDescriptor}; use std::borrow::Cow; -use std::collections::HashMap; use std::io; -use std::net::TcpListener; use std::str; use std::str::FromStr; use std::sync::Arc; -use std::time::Duration; -use std::time::Instant; use std::time::SystemTime; +use std::time::{Duration, Instant}; use tokio::io::AsyncWriteExt; use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::sync::gate::GateGuard; use utils::{ auth::{Claims, Scope, SwappableJwtAuth}, id::{TenantId, TimelineId}, @@ -47,91 +41,150 @@ use utils::{ use crate::auth::check_permission; use crate::basebackup; use crate::basebackup::BasebackupError; +use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics; use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS}; use crate::pgdatadir_mapping::Version; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; -use crate::task_mgr; use crate::task_mgr::TaskKind; -use crate::tenant::mgr::GetActiveTenantError; -use crate::tenant::mgr::GetTenantError; -use crate::tenant::mgr::ShardResolveResult; +use crate::task_mgr::{self, COMPUTE_REQUEST_RUNTIME}; use crate::tenant::mgr::ShardSelector; use crate::tenant::mgr::TenantManager; -use crate::tenant::timeline::WaitLsnError; +use crate::tenant::mgr::{GetActiveTenantError, GetTenantError, ShardResolveResult}; +use crate::tenant::timeline::{self, WaitLsnError}; use crate::tenant::GetTimelineError; use crate::tenant::PageReconstructError; -use crate::tenant::Tenant; use crate::tenant::Timeline; use pageserver_api::key::rel_block_to_key; use pageserver_api::reltag::SlruKind; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use postgres_ffi::BLCKSZ; -// How long we may wait for a [`TenantSlot::InProgress`]` and/or a [`Tenant`] which -// is not yet in state [`TenantState::Active`]. +/// How long we may wait for a [`crate::tenant::mgr::TenantSlot::InProgress`]` and/or a [`crate::tenant::Tenant`] which +/// is not yet in state [`TenantState::Active`]. +/// +/// NB: this is a different value than [`crate::http::routes::ACTIVE_TENANT_TIMEOUT`]. const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000); /////////////////////////////////////////////////////////////////////////////// +pub struct Listener { + cancel: CancellationToken, + /// Cancel the listener task through `listen_cancel` to shut down the listener + /// and get a handle on the existing connections. + task: JoinHandle, +} + +pub struct Connections { + cancel: CancellationToken, + tasks: tokio::task::JoinSet, +} + +pub fn spawn( + conf: &'static PageServerConf, + tenant_manager: Arc, + pg_auth: Option>, + tcp_listener: tokio::net::TcpListener, +) -> Listener { + let cancel = CancellationToken::new(); + let libpq_ctx = RequestContext::todo_child( + TaskKind::LibpqEndpointListener, + // listener task shouldn't need to download anything. (We will + // create a separate sub-contexts for each connection, with their + // own download behavior. This context is used only to listen and + // accept connections.) + DownloadBehavior::Error, + ); + let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( + "libpq listener", + libpq_listener_main( + tenant_manager, + pg_auth, + tcp_listener, + conf.pg_auth_type, + libpq_ctx, + cancel.clone(), + ) + .map(anyhow::Ok), + )); + + Listener { cancel, task } +} + +impl Listener { + pub async fn stop_accepting(self) -> Connections { + self.cancel.cancel(); + self.task + .await + .expect("unreachable: we wrap the listener task in task_mgr::exit_on_panic_or_error") + } +} +impl Connections { + pub(crate) async fn shutdown(self) { + let Self { cancel, mut tasks } = self; + cancel.cancel(); + while let Some(res) = tasks.join_next().await { + Self::handle_connection_completion(res); + } + } + + fn handle_connection_completion(res: Result, tokio::task::JoinError>) { + match res { + Ok(Ok(())) => {} + Ok(Err(e)) => error!("error in page_service connection task: {:?}", e), + Err(e) => error!("page_service connection task panicked: {:?}", e), + } + } +} + /// /// Main loop of the page service. /// /// Listens for connections, and launches a new handler task for each. /// +/// Returns Ok(()) upon cancellation via `cancel`, returning the set of +/// open connections. +/// pub async fn libpq_listener_main( tenant_manager: Arc, auth: Option>, - listener: TcpListener, + listener: tokio::net::TcpListener, auth_type: AuthType, listener_ctx: RequestContext, - cancel: CancellationToken, -) -> anyhow::Result<()> { - listener.set_nonblocking(true)?; - let tokio_listener = tokio::net::TcpListener::from_std(listener)?; + listener_cancel: CancellationToken, +) -> Connections { + let connections_cancel = CancellationToken::new(); + let mut connection_handler_tasks = tokio::task::JoinSet::default(); - // Wait for a new connection to arrive, or for server shutdown. - while let Some(res) = tokio::select! { - biased; + loop { + let accepted = tokio::select! { + biased; + _ = listener_cancel.cancelled() => break, + next = connection_handler_tasks.join_next(), if !connection_handler_tasks.is_empty() => { + let res = next.expect("we dont poll while empty"); + Connections::handle_connection_completion(res); + continue; + } + accepted = listener.accept() => accepted, + }; - _ = cancel.cancelled() => { - // We were requested to shut down. - None - } - - res = tokio_listener.accept() => { - Some(res) - } - } { - match res { + match accepted { Ok((socket, peer_addr)) => { // Connection established. Spawn a new task to handle it. debug!("accepted connection from {}", peer_addr); let local_auth = auth.clone(); - let connection_ctx = listener_ctx .detached_child(TaskKind::PageRequestHandler, DownloadBehavior::Download); - - // PageRequestHandler tasks are not associated with any particular - // timeline in the task manager. In practice most connections will - // only deal with a particular timeline, but we don't know which one - // yet. - task_mgr::spawn( - &tokio::runtime::Handle::current(), - TaskKind::PageRequestHandler, - None, - None, - "serving compute connection task", - page_service_conn_main( - tenant_manager.clone(), - local_auth, - socket, - auth_type, - connection_ctx, - ), - ); + connection_handler_tasks.spawn(page_service_conn_main( + tenant_manager.clone(), + local_auth, + socket, + auth_type, + connection_ctx, + connections_cancel.child_token(), + )); } Err(err) => { // accept() failed. Log the error, and loop back to retry on next connection. @@ -140,11 +193,16 @@ pub async fn libpq_listener_main( } } - debug!("page_service loop terminated"); + debug!("page_service listener loop terminated"); - Ok(()) + Connections { + cancel: connections_cancel, + tasks: connection_handler_tasks, + } } +type ConnectionHandlerResult = anyhow::Result<()>; + #[instrument(skip_all, fields(peer_addr))] async fn page_service_conn_main( tenant_manager: Arc, @@ -152,7 +210,8 @@ async fn page_service_conn_main( socket: tokio::net::TcpStream, auth_type: AuthType, connection_ctx: RequestContext, -) -> anyhow::Result<()> { + cancel: CancellationToken, +) -> ConnectionHandlerResult { let _guard = LIVE_CONNECTIONS .with_label_values(&["page_service"]) .guard(); @@ -200,13 +259,11 @@ async fn page_service_conn_main( // and create a child per-query context when it invokes process_query. // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler // and create the per-query context in process_query ourselves. - let mut conn_handler = PageServerHandler::new(tenant_manager, auth, connection_ctx); + let mut conn_handler = + PageServerHandler::new(tenant_manager, auth, connection_ctx, cancel.clone()); let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; - match pgbackend - .run(&mut conn_handler, &task_mgr::shutdown_token()) - .await - { + match pgbackend.run(&mut conn_handler, &cancel).await { Ok(()) => { // we've been requested to shut down Ok(()) @@ -223,32 +280,154 @@ async fn page_service_conn_main( } } -/// While a handler holds a reference to a Timeline, it also holds a the -/// timeline's Gate open. -struct HandlerTimeline { - timeline: Arc, - _guard: GateGuard, -} - struct PageServerHandler { auth: Option>, claims: Option, - tenant_manager: Arc, - /// The context created for the lifetime of the connection /// services by this PageServerHandler. /// For each query received over the connection, /// `process_query` creates a child context from this one. connection_ctx: RequestContext, - /// See [`Self::cache_timeline`] for usage. - /// + cancel: CancellationToken, + + timeline_handles: TimelineHandles, +} + +struct TimelineHandles { + wrapper: TenantManagerWrapper, /// Note on size: the typical size of this map is 1. The largest size we expect /// to see is the number of shards divided by the number of pageservers (typically < 2), /// or the ratio used when splitting shards (i.e. how many children created from one) /// parent shard, where a "large" number might be ~8. - shard_timelines: HashMap, + handles: timeline::handle::Cache, +} + +impl TimelineHandles { + fn new(tenant_manager: Arc) -> Self { + Self { + wrapper: TenantManagerWrapper { + tenant_manager, + tenant_id: OnceCell::new(), + }, + handles: Default::default(), + } + } + async fn get( + &mut self, + tenant_id: TenantId, + timeline_id: TimelineId, + shard_selector: ShardSelector, + ) -> Result, GetActiveTimelineError> { + if *self.wrapper.tenant_id.get_or_init(|| tenant_id) != tenant_id { + return Err(GetActiveTimelineError::Tenant( + GetActiveTenantError::SwitchedTenant, + )); + } + self.handles + .get(timeline_id, shard_selector, &self.wrapper) + .await + .map_err(|e| match e { + timeline::handle::GetError::TenantManager(e) => e, + timeline::handle::GetError::TimelineGateClosed => { + trace!("timeline gate closed"); + GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown) + } + timeline::handle::GetError::PerTimelineStateShutDown => { + trace!("per-timeline state shut down"); + GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown) + } + }) + } +} + +pub(crate) struct TenantManagerWrapper { + tenant_manager: Arc, + // We do not support switching tenant_id on a connection at this point. + // We can can add support for this later if needed without changing + // the protocol. + tenant_id: once_cell::sync::OnceCell, +} + +#[derive(Debug)] +pub(crate) struct TenantManagerTypes; + +impl timeline::handle::Types for TenantManagerTypes { + type TenantManagerError = GetActiveTimelineError; + type TenantManager = TenantManagerWrapper; + type Timeline = Arc; +} + +impl timeline::handle::ArcTimeline for Arc { + fn gate(&self) -> &utils::sync::gate::Gate { + &self.gate + } + + fn shard_timeline_id(&self) -> timeline::handle::ShardTimelineId { + Timeline::shard_timeline_id(self) + } + + fn per_timeline_state(&self) -> &timeline::handle::PerTimelineState { + &self.handles + } + + fn get_shard_identity(&self) -> &pageserver_api::shard::ShardIdentity { + Timeline::get_shard_identity(self) + } +} + +impl timeline::handle::TenantManager for TenantManagerWrapper { + async fn resolve( + &self, + timeline_id: TimelineId, + shard_selector: ShardSelector, + ) -> Result, GetActiveTimelineError> { + let tenant_id = self.tenant_id.get().expect("we set this in get()"); + let timeout = ACTIVE_TENANT_TIMEOUT; + let wait_start = Instant::now(); + let deadline = wait_start + timeout; + let tenant_shard = loop { + let resolved = self + .tenant_manager + .resolve_attached_shard(tenant_id, shard_selector); + match resolved { + ShardResolveResult::Found(tenant_shard) => break tenant_shard, + ShardResolveResult::NotFound => { + return Err(GetActiveTimelineError::Tenant( + GetActiveTenantError::NotFound(GetTenantError::NotFound(*tenant_id)), + )); + } + ShardResolveResult::InProgress(barrier) => { + // We can't authoritatively answer right now: wait for InProgress state + // to end, then try again + tokio::select! { + _ = barrier.wait() => { + // The barrier completed: proceed around the loop to try looking up again + }, + _ = tokio::time::sleep(deadline.duration_since(Instant::now())) => { + return Err(GetActiveTimelineError::Tenant(GetActiveTenantError::WaitForActiveTimeout { + latest_state: None, + wait_time: timeout, + })); + } + } + } + }; + }; + + tracing::debug!("Waiting for tenant to enter active state..."); + tenant_shard + .wait_to_become_active(deadline.duration_since(Instant::now())) + .await + .map_err(GetActiveTimelineError::Tenant)?; + + let timeline = tenant_shard + .get_timeline(timeline_id, true) + .map_err(GetActiveTimelineError::Timeline)?; + set_tracing_field_shard_id(&timeline); + Ok(timeline) + } } #[derive(thiserror::Error, Debug)] @@ -292,7 +471,11 @@ impl From for PageStreamError { impl From for PageStreamError { fn from(value: GetActiveTimelineError) -> Self { match value { - GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) => Self::Shutdown, + GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) + | GetActiveTimelineError::Tenant(GetActiveTenantError::WillNotBecomeActive( + TenantState::Stopping { .. }, + )) + | GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown) => Self::Shutdown, GetActiveTimelineError::Tenant(e) => Self::NotFound(format!("{e}").into()), GetActiveTimelineError::Timeline(e) => Self::NotFound(format!("{e}").into()), } @@ -324,64 +507,17 @@ impl PageServerHandler { tenant_manager: Arc, auth: Option>, connection_ctx: RequestContext, + cancel: CancellationToken, ) -> Self { PageServerHandler { - tenant_manager, auth, claims: None, connection_ctx, - shard_timelines: HashMap::new(), + timeline_handles: TimelineHandles::new(tenant_manager), + cancel, } } - /// Future that completes when we need to shut down the connection. - /// - /// We currently need to shut down when any of the following happens: - /// 1. any of the timelines we hold GateGuards for in `shard_timelines` is cancelled - /// 2. task_mgr requests shutdown of the connection - /// - /// NB on (1): the connection's lifecycle is not actually tied to any of the - /// `shard_timelines`s' lifecycles. But it's _necessary_ in the current - /// implementation to be responsive to timeline cancellation because - /// the connection holds their `GateGuards` open (sored in `shard_timelines`). - /// We currently do the easy thing and terminate the connection if any of the - /// shard_timelines gets cancelled. But really, we cuold spend more effort - /// and simply remove the cancelled timeline from the `shard_timelines`, thereby - /// dropping the guard. - /// - /// NB: keep in sync with [`Self::is_connection_cancelled`] - async fn await_connection_cancelled(&self) { - // A short wait before we expend the cycles to walk our timeline map. This avoids incurring - // that cost every time we check for cancellation. - tokio::time::sleep(Duration::from_millis(10)).await; - - // This function is never called concurrently with code that adds timelines to shard_timelines, - // which is enforced by the borrow checker (the future returned by this function carries the - // immutable &self). So it's fine to evaluate shard_timelines after the sleep, we don't risk - // missing any inserts to the map. - - let mut cancellation_sources = Vec::with_capacity(1 + self.shard_timelines.len()); - use futures::future::Either; - cancellation_sources.push(Either::Left(task_mgr::shutdown_watcher())); - cancellation_sources.extend( - self.shard_timelines - .values() - .map(|ht| Either::Right(ht.timeline.cancel.cancelled())), - ); - FuturesUnordered::from_iter(cancellation_sources) - .next() - .await; - } - - /// Checking variant of [`Self::await_connection_cancelled`]. - fn is_connection_cancelled(&self) -> bool { - task_mgr::is_shutdown_requested() - || self - .shard_timelines - .values() - .any(|ht| ht.timeline.cancel.is_cancelled() || ht.timeline.is_stopping()) - } - /// This function always respects cancellation of any timeline in `[Self::shard_timelines]`. Pass in /// a cancellation token at the next scope up (such as a tenant cancellation token) to ensure we respect /// cancellation if there aren't any timelines in the cache. @@ -400,15 +536,21 @@ impl PageServerHandler { flush_r = pgb.flush() => { Ok(flush_r?) }, - _ = self.await_connection_cancelled() => { - Err(QueryError::Shutdown) - } _ = cancel.cancelled() => { Err(QueryError::Shutdown) } ) } + /// Pagestream sub-protocol handler. + /// + /// It is a simple request-response protocol inside a COPYBOTH session. + /// + /// # Coding Discipline + /// + /// Coding discipline within this function: all interaction with the `pgb` connection + /// needs to be sensitive to connection shutdown, currently signalled via [`Self::cancel`]. + /// This is so that we can shutdown page_service quickly. #[instrument(skip_all)] async fn handle_pagerequests( &mut self, @@ -423,27 +565,27 @@ impl PageServerHandler { { debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); - let tenant = self - .get_active_tenant_with_timeout(tenant_id, ShardSelector::First, ACTIVE_TENANT_TIMEOUT) - .await?; - // switch client to COPYBOTH pgb.write_message_noflush(&BeMessage::CopyBothResponse)?; - self.flush_cancellable(pgb, &tenant.cancel).await?; + tokio::select! { + biased; + _ = self.cancel.cancelled() => { + return Err(QueryError::Shutdown) + } + res = pgb.flush() => { + res?; + } + } loop { + // read request bytes (it's exactly 1 PagestreamFeMessage per CopyData) let msg = tokio::select! { biased; - - _ = self.await_connection_cancelled() => { - // We were requested to shut down. - info!("shutdown request received in page handler"); + _ = self.cancel.cancelled() => { return Err(QueryError::Shutdown) } - msg = pgb.read_message() => { msg } }; - let copy_data_bytes = match msg? { Some(FeMessage::CopyData(bytes)) => bytes, Some(FeMessage::Terminate) => break, @@ -458,13 +600,12 @@ impl PageServerHandler { trace!("query: {copy_data_bytes:?}"); fail::fail_point!("ps::handle-pagerequest-message"); + // parse request let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?; - // TODO: We could create a new per-request context here, with unique ID. - // Currently we use the same per-timeline context for all requests - - let (response, span) = match neon_fe_msg { + // invoke handler function + let (handler_result, span) = match neon_fe_msg { PagestreamFeMessage::Exists(req) => { fail::fail_point!("ps::handle-pagerequest-message::exists"); let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn); @@ -518,31 +659,26 @@ impl PageServerHandler { } }; - match response { - Err(PageStreamError::Shutdown) => { - // If we fail to fulfil a request during shutdown, which may be _because_ of - // shutdown, then do not send the error to the client. Instead just drop the - // connection. - span.in_scope(|| info!("dropping connection due to shutdown")); - return Err(QueryError::Shutdown); - } - Err(PageStreamError::Reconnect(reason)) => { - span.in_scope(|| info!("handler requested reconnect: {reason}")); - return Err(QueryError::Reconnect); - } - Err(e) if self.is_connection_cancelled() => { - // This branch accomodates code within request handlers that returns an anyhow::Error instead of a clean - // shutdown error, this may be buried inside a PageReconstructError::Other for example. - // - // Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet, - // because wait_lsn etc will drop out - // is_stopping(): [`Timeline::flush_and_shutdown`] has entered - // is_canceled(): [`Timeline::shutdown`]` has entered - span.in_scope(|| info!("dropped error response during shutdown: {e:#}")); - return Err(QueryError::Shutdown); - } - r => { - let response_msg = r.unwrap_or_else(|e| { + // Map handler result to protocol behavior. + // Some handler errors cause exit from pagestream protocol. + // Other handler errors are sent back as an error message and we stay in pagestream protocol. + let response_msg = match handler_result { + Err(e) => match &e { + PageStreamError::Shutdown => { + // If we fail to fulfil a request during shutdown, which may be _because_ of + // shutdown, then do not send the error to the client. Instead just drop the + // connection. + span.in_scope(|| info!("dropping connection due to shutdown")); + return Err(QueryError::Shutdown); + } + PageStreamError::Reconnect(reason) => { + span.in_scope(|| info!("handler requested reconnect: {reason}")); + return Err(QueryError::Reconnect); + } + PageStreamError::Read(_) + | PageStreamError::LsnTimeout(_) + | PageStreamError::NotFound(_) + | PageStreamError::BadRequest(_) => { // print the all details to the log with {:#}, but for the client the // error message is enough. Do not log if shutting down, as the anyhow::Error // here includes cancellation which is not an error. @@ -553,10 +689,22 @@ impl PageServerHandler { PagestreamBeMessage::Error(PagestreamErrorResponse { message: e.to_string(), }) - }); + } + }, + Ok(response_msg) => response_msg, + }; - pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?; - self.flush_cancellable(pgb, &tenant.cancel).await?; + // marshal & transmit response message + pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?; + tokio::select! { + biased; + _ = self.cancel.cancelled() => { + // We were requested to shut down. + info!("shutdown request received in page handler"); + return Err(QueryError::Shutdown) + } + res = pgb.flush() => { + res?; } } } @@ -644,7 +792,7 @@ impl PageServerHandler { #[instrument(skip_all, fields(shard_id, %lsn))] async fn handle_make_lsn_lease( - &self, + &mut self, pgb: &mut PostgresBackend, tenant_shard_id: TenantShardId, timeline_id: TimelineId, @@ -654,10 +802,16 @@ impl PageServerHandler { where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { - let shard_selector = ShardSelector::Known(tenant_shard_id.to_index()); let timeline = self - .get_active_tenant_timeline(tenant_shard_id.tenant_id, timeline_id, shard_selector) + .timeline_handles + .get( + tenant_shard_id.tenant_id, + timeline_id, + ShardSelector::Known(tenant_shard_id.to_index()), + ) .await?; + set_tracing_field_shard_id(&timeline); + let lease = timeline.make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)?; let valid_until = lease .valid_until @@ -683,14 +837,17 @@ impl PageServerHandler { req: &PagestreamExistsRequest, ctx: &RequestContext, ) -> Result { - let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?; + let timeline = self + .timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .await?; let _timer = timeline .query_metrics .start_timer(metrics::SmgrQueryType::GetRelExists, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( - timeline, + &timeline, req.request_lsn, req.not_modified_since, &latest_gc_cutoff_lsn, @@ -715,7 +872,10 @@ impl PageServerHandler { req: &PagestreamNblocksRequest, ctx: &RequestContext, ) -> Result { - let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?; + let timeline = self + .timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .await?; let _timer = timeline .query_metrics @@ -723,7 +883,7 @@ impl PageServerHandler { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( - timeline, + &timeline, req.request_lsn, req.not_modified_since, &latest_gc_cutoff_lsn, @@ -748,7 +908,10 @@ impl PageServerHandler { req: &PagestreamDbSizeRequest, ctx: &RequestContext, ) -> Result { - let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?; + let timeline = self + .timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .await?; let _timer = timeline .query_metrics @@ -756,7 +919,7 @@ impl PageServerHandler { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( - timeline, + &timeline, req.request_lsn, req.not_modified_since, &latest_gc_cutoff_lsn, @@ -774,122 +937,6 @@ impl PageServerHandler { })) } - /// For most getpage requests, we will already have a Timeline to serve the request: this function - /// looks up such a Timeline synchronously and without touching any global state. - fn get_cached_timeline_for_page( - &mut self, - req: &PagestreamGetPageRequest, - ) -> Result<&Arc, Key> { - let key = if let Some((first_idx, first_timeline)) = self.shard_timelines.iter().next() { - // Fastest path: single sharded case - if first_idx.shard_count.count() == 1 { - return Ok(&first_timeline.timeline); - } - - let key = rel_block_to_key(req.rel, req.blkno); - let shard_num = first_timeline - .timeline - .get_shard_identity() - .get_shard_number(&key); - - // Fast path: matched the first timeline in our local handler map. This case is common if - // only one shard per tenant is attached to this pageserver. - if first_timeline.timeline.get_shard_identity().number == shard_num { - return Ok(&first_timeline.timeline); - } - - let shard_index = ShardIndex { - shard_number: shard_num, - shard_count: first_timeline.timeline.get_shard_identity().count, - }; - - // Fast-ish path: timeline is in the connection handler's local cache - if let Some(found) = self.shard_timelines.get(&shard_index) { - return Ok(&found.timeline); - } - - key - } else { - rel_block_to_key(req.rel, req.blkno) - }; - - Err(key) - } - - /// Having looked up the [`Timeline`] instance for a particular shard, cache it to enable - /// use in future requests without having to traverse [`crate::tenant::mgr::TenantManager`] - /// again. - /// - /// Note that all the Timelines in this cache are for the same timeline_id: they're differ - /// in which shard they belong to. When we serve a getpage@lsn request, we choose a shard - /// based on key. - /// - /// The typical size of this cache is 1, as we generally create shards to distribute work - /// across pageservers, so don't tend to have multiple shards for the same tenant on the - /// same pageserver. - fn cache_timeline( - &mut self, - timeline: Arc, - ) -> Result<&Arc, GetActiveTimelineError> { - let gate_guard = timeline - .gate - .enter() - .map_err(|_| GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled))?; - - let shard_index = timeline.tenant_shard_id.to_index(); - let entry = self - .shard_timelines - .entry(shard_index) - .or_insert(HandlerTimeline { - timeline, - _guard: gate_guard, - }); - - Ok(&entry.timeline) - } - - /// If [`Self::get_cached_timeline_for_page`] missed, then this function is used to populate the cache with - /// a Timeline to serve requests for this key, if such a Timeline is present on this pageserver. If no such - /// Timeline is found, then we will return an error (this indicates that the client is talking to the wrong node). - async fn load_timeline_for_page( - &mut self, - tenant_id: TenantId, - timeline_id: TimelineId, - key: Key, - ) -> anyhow::Result<&Arc, GetActiveTimelineError> { - // Slow path: we must call out to the TenantManager to find the timeline for this Key - let timeline = self - .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Page(key)) - .await?; - - self.cache_timeline(timeline) - } - - async fn get_timeline_shard_zero( - &mut self, - tenant_id: TenantId, - timeline_id: TimelineId, - ) -> anyhow::Result<&Arc, GetActiveTimelineError> { - // This is a borrow-checker workaround: we can't return from inside of the `if let Some` because - // that would be an immutable-borrow-self return, whereas later in the function we will use a mutable - // ref to salf. So instead, we first build a bool, and then return while not borrowing self. - let have_cached = if let Some((idx, _tl)) = self.shard_timelines.iter().next() { - idx.shard_number == ShardNumber(0) - } else { - false - }; - - if have_cached { - let entry = self.shard_timelines.iter().next().unwrap(); - Ok(&entry.1.timeline) - } else { - let timeline = self - .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) - .await?; - Ok(self.cache_timeline(timeline)?) - } - } - #[instrument(skip_all, fields(shard_id))] async fn handle_get_page_at_lsn_request( &mut self, @@ -898,33 +945,30 @@ impl PageServerHandler { req: &PagestreamGetPageRequest, ctx: &RequestContext, ) -> Result { - let timeline = match self.get_cached_timeline_for_page(req) { - Ok(tl) => { - set_tracing_field_shard_id(tl); - tl - } - Err(key) => { - match self - .load_timeline_for_page(tenant_id, timeline_id, key) - .await - { - Ok(t) => t, - Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => { - // We already know this tenant exists in general, because we resolved it at - // start of connection. Getting a NotFound here indicates that the shard containing - // the requested page is not present on this node: the client's knowledge of shard->pageserver - // mapping is out of date. - // - // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via - // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration - // and talk to a different pageserver. - return Err(PageStreamError::Reconnect( - "getpage@lsn request routed to wrong shard".into(), - )); - } - Err(e) => return Err(e.into()), - } + let timeline = match self + .timeline_handles + .get( + tenant_id, + timeline_id, + ShardSelector::Page(rel_block_to_key(req.rel, req.blkno)), + ) + .await + { + Ok(tl) => tl, + Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => { + // We already know this tenant exists in general, because we resolved it at + // start of connection. Getting a NotFound here indicates that the shard containing + // the requested page is not present on this node: the client's knowledge of shard->pageserver + // mapping is out of date. + // + // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via + // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration + // and talk to a different pageserver. + return Err(PageStreamError::Reconnect( + "getpage@lsn request routed to wrong shard".into(), + )); } + Err(e) => return Err(e.into()), }; let _timer = timeline @@ -933,7 +977,7 @@ impl PageServerHandler { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( - timeline, + &timeline, req.request_lsn, req.not_modified_since, &latest_gc_cutoff_lsn, @@ -958,7 +1002,10 @@ impl PageServerHandler { req: &PagestreamGetSlruSegmentRequest, ctx: &RequestContext, ) -> Result { - let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?; + let timeline = self + .timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .await?; let _timer = timeline .query_metrics @@ -966,7 +1013,7 @@ impl PageServerHandler { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( - timeline, + &timeline, req.request_lsn, req.not_modified_since, &latest_gc_cutoff_lsn, @@ -987,6 +1034,15 @@ impl PageServerHandler { /// Full basebackups should only be used for debugging purposes. /// Originally, it was introduced to enable breaking storage format changes, /// but that is not applicable anymore. + /// + /// # Coding Discipline + /// + /// Coding discipline within this function: all interaction with the `pgb` connection + /// needs to be sensitive to connection shutdown, currently signalled via [`Self::cancel`]. + /// This is so that we can shutdown page_service quickly. + /// + /// TODO: wrap the pgb that we pass to the basebackup handler so that it's sensitive + /// to connection cancellation. #[allow(clippy::too_many_arguments)] #[instrument(skip_all, fields(shard_id, ?lsn, ?prev_lsn, %full_backup))] async fn handle_basebackup_request( @@ -1012,10 +1068,11 @@ impl PageServerHandler { let started = std::time::Instant::now(); - // check that the timeline exists let timeline = self - .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) + .timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) .await?; + let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); if let Some(lsn) = lsn { // Backup was requested at a particular LSN. Wait for it to arrive. @@ -1037,7 +1094,7 @@ impl PageServerHandler { // switch client to COPYOUT pgb.write_message_noflush(&BeMessage::CopyOutResponse) .map_err(QueryError::Disconnected)?; - self.flush_cancellable(pgb, &timeline.cancel).await?; + self.flush_cancellable(pgb, &self.cancel).await?; // Send a tarball of the latest layer on the timeline. Compress if not // fullbackup. TODO Compress in that case too (tests need to be updated) @@ -1128,77 +1185,6 @@ impl PageServerHandler { .expect("claims presence already checked"); check_permission(claims, tenant_id).map_err(|e| QueryError::Unauthorized(e.0)) } - - /// Shorthand for getting a reference to a Timeline of an Active tenant. - async fn get_active_tenant_timeline( - &self, - tenant_id: TenantId, - timeline_id: TimelineId, - selector: ShardSelector, - ) -> Result, GetActiveTimelineError> { - let tenant = self - .get_active_tenant_with_timeout(tenant_id, selector, ACTIVE_TENANT_TIMEOUT) - .await - .map_err(GetActiveTimelineError::Tenant)?; - let timeline = tenant.get_timeline(timeline_id, true)?; - set_tracing_field_shard_id(&timeline); - Ok(timeline) - } - - /// Get a shard's [`Tenant`] in its active state, if present. If we don't find the shard and some - /// slots for this tenant are `InProgress` then we will wait. - /// If we find the [`Tenant`] and it's not yet in state [`TenantState::Active`], we will wait. - /// - /// `timeout` is used as a total timeout for the whole wait operation. - async fn get_active_tenant_with_timeout( - &self, - tenant_id: TenantId, - shard_selector: ShardSelector, - timeout: Duration, - ) -> Result, GetActiveTenantError> { - let wait_start = Instant::now(); - let deadline = wait_start + timeout; - - // Resolve TenantId to TenantShardId. This is usually a quick one-shot thing, the loop is - // for handling the rare case that the slot we're accessing is InProgress. - let tenant_shard = loop { - let resolved = self - .tenant_manager - .resolve_attached_shard(&tenant_id, shard_selector); - match resolved { - ShardResolveResult::Found(tenant_shard) => break tenant_shard, - ShardResolveResult::NotFound => { - return Err(GetActiveTenantError::NotFound(GetTenantError::NotFound( - tenant_id, - ))); - } - ShardResolveResult::InProgress(barrier) => { - // We can't authoritatively answer right now: wait for InProgress state - // to end, then try again - tokio::select! { - _ = self.await_connection_cancelled() => { - return Err(GetActiveTenantError::Cancelled) - }, - _ = barrier.wait() => { - // The barrier completed: proceed around the loop to try looking up again - }, - _ = tokio::time::sleep(deadline.duration_since(Instant::now())) => { - return Err(GetActiveTenantError::WaitForActiveTimeout { - latest_state: None, - wait_time: timeout, - }); - } - } - } - }; - }; - - tracing::debug!("Waiting for tenant to enter active state..."); - tenant_shard - .wait_to_become_active(deadline.duration_since(Instant::now())) - .await?; - Ok(tenant_shard) - } } #[async_trait::async_trait] @@ -1505,7 +1491,7 @@ impl From for QueryError { } #[derive(Debug, thiserror::Error)] -enum GetActiveTimelineError { +pub(crate) enum GetActiveTimelineError { #[error(transparent)] Tenant(GetActiveTenantError), #[error(transparent)] diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 5a334d0290..e4ebafd927 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -8,8 +8,7 @@ use std::time::Duration; pub use pageserver_api::key::{Key, KEY_SIZE}; /// A 'value' stored for a one Key. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(test, derive(PartialEq))] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub enum Value { /// An Image value contains a full copy of the value Image(Bytes), diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index f83c7021e3..2422ab4cf2 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -33,6 +33,7 @@ use remote_storage::GenericRemoteStorage; use remote_storage::TimeoutOrCancel; use std::collections::BTreeMap; use std::fmt; +use std::sync::Weak; use std::time::SystemTime; use storage_broker::BrokerClientChannel; use tokio::io::BufReader; @@ -102,8 +103,7 @@ use std::fmt::Debug; use std::fmt::Display; use std::fs; use std::fs::File; -use std::sync::atomic::AtomicU64; -use std::sync::atomic::Ordering; +use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::Arc; use std::sync::Mutex; use std::time::{Duration, Instant}; @@ -148,6 +148,7 @@ pub(crate) mod timeline; pub mod size; +mod gc_block; pub(crate) mod throttle; pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; @@ -303,6 +304,12 @@ pub struct Tenant { /// An ongoing timeline detach must be checked during attempts to GC or compact a timeline. ongoing_timeline_detach: std::sync::Mutex>, + /// `index_part.json` based gc blocking reason tracking. + /// + /// New gc iterations must start a new iteration by acquiring `GcBlock::start` before + /// proceeding. + pub(crate) gc_block: gc_block::GcBlock, + l0_flush_global_state: L0FlushGlobalState, } @@ -313,14 +320,66 @@ impl std::fmt::Debug for Tenant { } pub(crate) enum WalRedoManager { - Prod(PostgresRedoManager), + Prod(WalredoManagerId, PostgresRedoManager), #[cfg(test)] Test(harness::TestRedoManager), } -impl From for WalRedoManager { - fn from(mgr: PostgresRedoManager) -> Self { - Self::Prod(mgr) +#[derive(thiserror::Error, Debug)] +#[error("pageserver is shutting down")] +pub(crate) struct GlobalShutDown; + +impl WalRedoManager { + pub(crate) fn new(mgr: PostgresRedoManager) -> Result, GlobalShutDown> { + let id = WalredoManagerId::next(); + let arc = Arc::new(Self::Prod(id, mgr)); + let mut guard = WALREDO_MANAGERS.lock().unwrap(); + match &mut *guard { + Some(map) => { + map.insert(id, Arc::downgrade(&arc)); + Ok(arc) + } + None => Err(GlobalShutDown), + } + } +} + +impl Drop for WalRedoManager { + fn drop(&mut self) { + match self { + Self::Prod(id, _) => { + let mut guard = WALREDO_MANAGERS.lock().unwrap(); + if let Some(map) = &mut *guard { + map.remove(id).expect("new() registers, drop() unregisters"); + } + } + #[cfg(test)] + Self::Test(_) => { + // Not applicable to test redo manager + } + } + } +} + +/// Global registry of all walredo managers so that [`crate::shutdown_pageserver`] can shut down +/// the walredo processes outside of the regular order. +/// +/// This is necessary to work around a systemd bug where it freezes if there are +/// walredo processes left => +#[allow(clippy::type_complexity)] +pub(crate) static WALREDO_MANAGERS: once_cell::sync::Lazy< + Mutex>>>, +> = once_cell::sync::Lazy::new(|| Mutex::new(Some(HashMap::new()))); +#[derive(PartialEq, Eq, Hash, Clone, Copy, Debug)] +pub(crate) struct WalredoManagerId(u64); +impl WalredoManagerId { + pub fn next() -> Self { + static NEXT: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1); + let id = NEXT.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + if id == 0 { + panic!("WalredoManagerId::new() returned 0, indicating wraparound, risking it's no longer unique"); + } + Self(id) } } @@ -332,19 +391,20 @@ impl From for WalRedoManager { } impl WalRedoManager { - pub(crate) async fn shutdown(&self) { + pub(crate) async fn shutdown(&self) -> bool { match self { - Self::Prod(mgr) => mgr.shutdown().await, + Self::Prod(_, mgr) => mgr.shutdown().await, #[cfg(test)] Self::Test(_) => { // Not applicable to test redo manager + true } } } pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) { match self { - Self::Prod(mgr) => mgr.maybe_quiesce(idle_timeout), + Self::Prod(_, mgr) => mgr.maybe_quiesce(idle_timeout), #[cfg(test)] Self::Test(_) => { // Not applicable to test redo manager @@ -364,7 +424,7 @@ impl WalRedoManager { pg_version: u32, ) -> Result { match self { - Self::Prod(mgr) => { + Self::Prod(_, mgr) => { mgr.request_redo(key, lsn, base_img, records, pg_version) .await } @@ -378,7 +438,7 @@ impl WalRedoManager { pub(crate) fn status(&self) -> Option { match self { - WalRedoManager::Prod(m) => Some(m.status()), + WalRedoManager::Prod(_, m) => Some(m.status()), #[cfg(test)] WalRedoManager::Test(_) => None, } @@ -387,6 +447,8 @@ impl WalRedoManager { #[derive(Debug, thiserror::Error, PartialEq, Eq)] pub enum GetTimelineError { + #[error("Timeline is shutting down")] + ShuttingDown, #[error("Timeline {tenant_id}/{timeline_id} is not active, state: {state:?}")] NotActive { tenant_id: TenantShardId, @@ -539,6 +601,12 @@ impl From for GcError { } } +impl From for GcError { + fn from(_: timeline::layer_manager::Shutdown) -> Self { + GcError::TimelineCancelled + } +} + #[derive(thiserror::Error, Debug)] pub(crate) enum LoadConfigError { #[error("TOML deserialization error: '{0}'")] @@ -648,6 +716,7 @@ impl Tenant { .read() .await .layer_map() + .expect("currently loading, layer manager cannot be shutdown already") .iter_historic_layers() .next() .is_some(), @@ -676,11 +745,9 @@ impl Tenant { init_order: Option, mode: SpawnMode, ctx: &RequestContext, - ) -> Arc { - let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new( - conf, - tenant_shard_id, - ))); + ) -> Result, GlobalShutDown> { + let wal_redo_manager = + WalRedoManager::new(PostgresRedoManager::new(conf, tenant_shard_id))?; let TenantSharedResources { broker_client, @@ -879,7 +946,7 @@ impl Tenant { } .instrument(tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation)), ); - tenant + Ok(tenant) } #[instrument(skip_all)] @@ -983,6 +1050,8 @@ impl Tenant { } } + let mut gc_blocks = HashMap::new(); + // For every timeline, download the metadata file, scan the local directory, // and build a layer map that contains an entry for each remote and local // layer file. @@ -992,6 +1061,16 @@ impl Tenant { .remove(&timeline_id) .expect("just put it in above"); + if let Some(blocking) = index_part.gc_blocking.as_ref() { + // could just filter these away, but it helps while testing + anyhow::ensure!( + !blocking.reasons.is_empty(), + "index_part for {timeline_id} is malformed: it should not have gc blocking with zero reasons" + ); + let prev = gc_blocks.insert(timeline_id, blocking.reasons); + assert!(prev.is_none()); + } + // TODO again handle early failure self.load_remote_timeline( timeline_id, @@ -1036,6 +1115,8 @@ impl Tenant { // IndexPart is the source of truth. self.clean_up_timelines(&existent_timelines)?; + self.gc_block.set_scanned(gc_blocks); + fail::fail_point!("attach-before-activate", |_| { anyhow::bail!("attach-before-activate"); }); @@ -1227,11 +1308,29 @@ impl Tenant { Ok(timeline_preloads) } - pub async fn apply_timeline_archival_config( + pub(crate) async fn apply_timeline_archival_config( &self, - _timeline_id: TimelineId, - _config: TimelineArchivalState, + timeline_id: TimelineId, + state: TimelineArchivalState, ) -> anyhow::Result<()> { + let timeline = self + .get_timeline(timeline_id, false) + .context("Cannot apply timeline archival config to inexistent timeline")?; + + let upload_needed = timeline + .remote_client + .schedule_index_upload_for_timeline_archival_state(state)?; + + if upload_needed { + const MAX_WAIT: Duration = Duration::from_secs(10); + let Ok(v) = + tokio::time::timeout(MAX_WAIT, timeline.remote_client.wait_completion()).await + else { + tracing::warn!("reached timeout for waiting on upload queue"); + bail!("reached timeout for upload queue flush"); + }; + v?; + } Ok(()) } @@ -1563,7 +1662,7 @@ impl Tenant { self: Arc, timeline_id: TimelineId, ) -> Result<(), DeleteTimelineError> { - DeleteTimelineFlow::run(&self, timeline_id, false).await?; + DeleteTimelineFlow::run(&self, timeline_id).await?; Ok(()) } @@ -1608,6 +1707,14 @@ impl Tenant { } } + let _guard = match self.gc_block.start().await { + Ok(guard) => guard, + Err(reasons) => { + info!("Skipping GC: {reasons}"); + return Ok(GcResult::default()); + } + }; + self.gc_iteration_internal(target_timeline_id, horizon, pitr, cancel, ctx) .await } @@ -1616,21 +1723,23 @@ impl Tenant { /// This function is periodically called by compactor task. /// Also it can be explicitly requested per timeline through page server /// api's 'compact' command. + /// + /// Returns whether we have pending compaction task. async fn compaction_iteration( &self, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result<(), timeline::CompactionError> { + ) -> Result { // Don't start doing work during shutdown, or when broken, we do not need those in the logs if !self.is_active() { - return Ok(()); + return Ok(false); } { let conf = self.tenant_conf.load(); if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() { info!("Skipping compaction in location state {:?}", conf.location); - return Ok(()); + return Ok(false); } } @@ -1657,20 +1766,24 @@ impl Tenant { // Before doing any I/O work, check our circuit breaker if self.compaction_circuit_breaker.lock().unwrap().is_broken() { info!("Skipping compaction due to previous failures"); - return Ok(()); + return Ok(false); } + let mut has_pending_task = false; + for (timeline_id, timeline) in &timelines_to_compact { - timeline + has_pending_task |= timeline .compact(cancel, EnumSet::empty(), ctx) .instrument(info_span!("compact_timeline", %timeline_id)) .await - .map_err(|e| { - self.compaction_circuit_breaker - .lock() - .unwrap() - .fail(&CIRCUIT_BREAKERS_BROKEN, &e); - e + .inspect_err(|e| match e { + timeline::CompactionError::ShuttingDown => (), + timeline::CompactionError::Other(e) => { + self.compaction_circuit_breaker + .lock() + .unwrap() + .fail(&CIRCUIT_BREAKERS_BROKEN, e); + } })?; } @@ -1679,7 +1792,7 @@ impl Tenant { .unwrap() .success(&CIRCUIT_BREAKERS_UNBROKEN); - Ok(()) + Ok(has_pending_task) } // Call through to all timelines to freeze ephemeral layers if needed. Usually @@ -2614,6 +2727,7 @@ impl Tenant { )), tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)), ongoing_timeline_detach: std::sync::Mutex::default(), + gc_block: Default::default(), l0_flush_global_state, } } @@ -4015,7 +4129,7 @@ pub(crate) mod harness { #[cfg(test)] mod tests { - use std::collections::BTreeMap; + use std::collections::{BTreeMap, BTreeSet}; use super::*; use crate::keyspace::KeySpaceAccum; @@ -4567,10 +4681,10 @@ mod tests { let layer_map = tline.layers.read().await; let level0_deltas = layer_map - .layer_map() - .get_level0_deltas()? - .into_iter() - .map(|desc| layer_map.get_from_desc(&desc)) + .layer_map()? + .level0_deltas() + .iter() + .map(|desc| layer_map.get_from_desc(desc)) .collect::>(); assert!(!level0_deltas.is_empty()); @@ -4690,7 +4804,7 @@ mod tests { lsn: Lsn, repeat: usize, key_count: usize, - ) -> anyhow::Result<()> { + ) -> anyhow::Result>> { let compact = true; bulk_insert_maybe_compact_gc(tenant, timeline, ctx, lsn, repeat, key_count, compact).await } @@ -4703,7 +4817,9 @@ mod tests { repeat: usize, key_count: usize, compact: bool, - ) -> anyhow::Result<()> { + ) -> anyhow::Result>> { + let mut inserted: HashMap> = Default::default(); + let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let mut blknum = 0; @@ -4724,6 +4840,7 @@ mod tests { ctx, ) .await?; + inserted.entry(test_key).or_default().insert(lsn); writer.finish_write(lsn); drop(writer); @@ -4748,7 +4865,7 @@ mod tests { assert_eq!(res.layers_removed, 0, "this never removes anything"); } - Ok(()) + Ok(inserted) } // @@ -4795,14 +4912,16 @@ mod tests { .await?; let lsn = Lsn(0x10); - bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?; + let inserted = bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?; let guard = tline.layers.read().await; - guard.layer_map().dump(true, &ctx).await?; + let lm = guard.layer_map()?; + + lm.dump(true, &ctx).await?; let mut reads = Vec::new(); let mut prev = None; - guard.layer_map().iter_historic_layers().for_each(|desc| { + lm.iter_historic_layers().for_each(|desc| { if !desc.is_delta() { prev = Some(desc.clone()); return; @@ -4856,9 +4975,39 @@ mod tests { &ctx, ) .await; - tline - .validate_get_vectored_impl(&vectored_res, read, reads_lsn, &ctx) - .await; + + let mut expected_lsns: HashMap = Default::default(); + let mut expect_missing = false; + let mut key = read.start().unwrap(); + while key != read.end().unwrap() { + if let Some(lsns) = inserted.get(&key) { + let expected_lsn = lsns.iter().rfind(|lsn| **lsn <= reads_lsn); + match expected_lsn { + Some(lsn) => { + expected_lsns.insert(key, *lsn); + } + None => { + expect_missing = true; + break; + } + } + } else { + expect_missing = true; + break; + } + + key = key.next(); + } + + if expect_missing { + assert!(matches!(vectored_res, Err(GetVectoredError::MissingKey(_)))); + } else { + for (key, image) in vectored_res? { + let expected_lsn = expected_lsns.get(&key).expect("determined above"); + let expected_image = test_img(&format!("{} at {}", key.field6, expected_lsn)); + assert_eq!(image?, expected_image); + } + } } Ok(()) @@ -4908,10 +5057,6 @@ mod tests { ) .await; - child_timeline - .validate_get_vectored_impl(&vectored_res, aux_keyspace, read_lsn, &ctx) - .await; - let images = vectored_res?; assert!(images.is_empty()); Ok(()) @@ -5782,23 +5927,12 @@ mod tests { tline.freeze_and_flush().await?; // force create a delta layer } - let before_num_l0_delta_files = tline - .layers - .read() - .await - .layer_map() - .get_level0_deltas()? - .len(); + let before_num_l0_delta_files = + tline.layers.read().await.layer_map()?.level0_deltas().len(); tline.compact(&cancel, EnumSet::empty(), &ctx).await?; - let after_num_l0_delta_files = tline - .layers - .read() - .await - .layer_map() - .get_level0_deltas()? - .len(); + let after_num_l0_delta_files = tline.layers.read().await.layer_map()?.level0_deltas().len(); assert!(after_num_l0_delta_files < before_num_l0_delta_files, "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}"); @@ -6822,7 +6956,10 @@ mod tests { } let cancel = CancellationToken::new(); - tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); for (idx, expected) in expected_result.iter().enumerate() { assert_eq!( @@ -6886,7 +7023,11 @@ mod tests { vec![ // Image layer at GC horizon PersistentLayerKey { - key_range: Key::MIN..Key::MAX, + key_range: { + let mut key = Key::MAX; + key.field6 -= 1; + Key::MIN..key + }, lsn_range: Lsn(0x30)..Lsn(0x31), is_delta: false }, @@ -6905,6 +7046,18 @@ mod tests { ] ); + // increase GC horizon and compact again + { + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + guard.cutoffs.time = Lsn(0x40); + guard.cutoffs.space = Lsn(0x40); + } + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + Ok(()) } @@ -7237,7 +7390,10 @@ mod tests { } let cancel = CancellationToken::new(); - tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); for idx in 0..10 { assert_eq!( @@ -7256,6 +7412,18 @@ mod tests { ); } + // increase GC horizon and compact again + { + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + guard.cutoffs.time = Lsn(0x40); + guard.cutoffs.space = Lsn(0x40); + } + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + Ok(()) } @@ -7307,7 +7475,9 @@ mod tests { ( key, Lsn(0x80), - Value::WalRecord(NeonWalRecord::wal_append(";0x80")), + Value::Image(Bytes::copy_from_slice( + b"0x10;0x20;0x30;0x40;0x50;0x60;0x70;0x80", + )), ), ( key, @@ -7322,6 +7492,7 @@ mod tests { Lsn(0x60), &[Lsn(0x20), Lsn(0x40), Lsn(0x50)], 3, + None, ) .await .unwrap(); @@ -7369,7 +7540,9 @@ mod tests { ), ( Lsn(0x80), - Value::WalRecord(NeonWalRecord::wal_append(";0x80")), + Value::Image(Bytes::copy_from_slice( + b"0x10;0x20;0x30;0x40;0x50;0x60;0x70;0x80", + )), ), ( Lsn(0x90), @@ -7378,7 +7551,226 @@ mod tests { ]), }; assert_eq!(res, expected_res); - // TODO: more tests with mixed image + delta, adding with k-merge test cases; e2e compaction test + + // We expect GC-compaction to run with the original GC. This would create a situation that + // the original GC algorithm removes some delta layers b/c there are full image coverage, + // therefore causing some keys to have an incomplete history below the lowest retain LSN. + // For example, we have + // ```plain + // init delta @ 0x10, image @ 0x20, delta @ 0x30 (gc_horizon), image @ 0x40. + // ``` + // Now the GC horizon moves up, and we have + // ```plain + // init delta @ 0x10, image @ 0x20, delta @ 0x30, image @ 0x40 (gc_horizon) + // ``` + // The original GC algorithm kicks in, and removes delta @ 0x10, image @ 0x20. + // We will end up with + // ```plain + // delta @ 0x30, image @ 0x40 (gc_horizon) + // ``` + // Now we run the GC-compaction, and this key does not have a full history. + // We should be able to handle this partial history and drop everything before the + // gc_horizon image. + + let history = vec![ + ( + key, + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append(";0x20")), + ), + ( + key, + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append(";0x30")), + ), + ( + key, + Lsn(0x40), + Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40")), + ), + ( + key, + Lsn(0x50), + Value::WalRecord(NeonWalRecord::wal_append(";0x50")), + ), + ( + key, + Lsn(0x60), + Value::WalRecord(NeonWalRecord::wal_append(";0x60")), + ), + ( + key, + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + ), + ( + key, + Lsn(0x80), + Value::Image(Bytes::copy_from_slice( + b"0x10;0x20;0x30;0x40;0x50;0x60;0x70;0x80", + )), + ), + ( + key, + Lsn(0x90), + Value::WalRecord(NeonWalRecord::wal_append(";0x90")), + ), + ]; + let res = tline + .generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3, None) + .await + .unwrap(); + let expected_res = KeyHistoryRetention { + below_horizon: vec![ + ( + Lsn(0x40), + KeyLogAtLsn(vec![( + Lsn(0x40), + Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40")), + )]), + ), + ( + Lsn(0x50), + KeyLogAtLsn(vec![( + Lsn(0x50), + Value::WalRecord(NeonWalRecord::wal_append(";0x50")), + )]), + ), + ( + Lsn(0x60), + KeyLogAtLsn(vec![( + Lsn(0x60), + Value::WalRecord(NeonWalRecord::wal_append(";0x60")), + )]), + ), + ], + above_horizon: KeyLogAtLsn(vec![ + ( + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + ), + ( + Lsn(0x80), + Value::Image(Bytes::copy_from_slice( + b"0x10;0x20;0x30;0x40;0x50;0x60;0x70;0x80", + )), + ), + ( + Lsn(0x90), + Value::WalRecord(NeonWalRecord::wal_append(";0x90")), + ), + ]), + }; + assert_eq!(res, expected_res); + + // In case of branch compaction, the branch itself does not have the full history, and we need to provide + // the ancestor image in the test case. + + let history = vec![ + ( + key, + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append(";0x20")), + ), + ( + key, + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append(";0x30")), + ), + ( + key, + Lsn(0x40), + Value::WalRecord(NeonWalRecord::wal_append(";0x40")), + ), + ( + key, + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + ), + ]; + let res = tline + .generate_key_retention( + key, + &history, + Lsn(0x60), + &[], + 3, + Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))), + ) + .await + .unwrap(); + let expected_res = KeyHistoryRetention { + below_horizon: vec![( + Lsn(0x60), + KeyLogAtLsn(vec![( + Lsn(0x60), + Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40")), // use the ancestor image to reconstruct the page + )]), + )], + above_horizon: KeyLogAtLsn(vec![( + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + )]), + }; + assert_eq!(res, expected_res); + + let history = vec![ + ( + key, + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append(";0x20")), + ), + ( + key, + Lsn(0x40), + Value::WalRecord(NeonWalRecord::wal_append(";0x40")), + ), + ( + key, + Lsn(0x60), + Value::WalRecord(NeonWalRecord::wal_append(";0x60")), + ), + ( + key, + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + ), + ]; + let res = tline + .generate_key_retention( + key, + &history, + Lsn(0x60), + &[Lsn(0x30)], + 3, + Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))), + ) + .await + .unwrap(); + let expected_res = KeyHistoryRetention { + below_horizon: vec![ + ( + Lsn(0x30), + KeyLogAtLsn(vec![( + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append(";0x20")), + )]), + ), + ( + Lsn(0x60), + KeyLogAtLsn(vec![( + Lsn(0x60), + Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x40;0x60")), + )]), + ), + ], + above_horizon: KeyLogAtLsn(vec![( + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + )]), + }; + assert_eq!(res, expected_res); + Ok(()) } @@ -7536,6 +7928,10 @@ mod tests { ]; let verify_result = || async { + let gc_horizon = { + let gc_info = tline.gc_info.read().unwrap(); + gc_info.cutoffs.time + }; for idx in 0..10 { assert_eq!( tline @@ -7546,7 +7942,7 @@ mod tests { ); assert_eq!( tline - .get(get_key(idx as u32), Lsn(0x30), &ctx) + .get(get_key(idx as u32), gc_horizon, &ctx) .await .unwrap(), &expected_result_at_gc_horizon[idx] @@ -7571,7 +7967,232 @@ mod tests { verify_result().await; let cancel = CancellationToken::new(); - tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + let mut dryrun_flags = EnumSet::new(); + dryrun_flags.insert(CompactFlags::DryRun); + + tline + .compact_with_gc(&cancel, dryrun_flags, &ctx) + .await + .unwrap(); + // We expect layer map to be the same b/c the dry run flag, but we don't know whether there will be other background jobs + // cleaning things up, and therefore, we don't do sanity checks on the layer map during unit tests. + verify_result().await; + + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + verify_result().await; + + // compact again + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + verify_result().await; + + // increase GC horizon and compact again + { + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + guard.cutoffs.time = Lsn(0x38); + guard.cutoffs.space = Lsn(0x38); + } + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + verify_result().await; // no wals between 0x30 and 0x38, so we should obtain the same result + + // not increasing the GC horizon and compact again + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + verify_result().await; + + Ok(()) + } + + #[tokio::test] + async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?; + let (tenant, ctx) = harness.load().await; + + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + let img_layer = (0..10) + .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) + .collect_vec(); + + let delta1 = vec![ + ( + get_key(1), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ( + get_key(2), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append("@0x30")), + ), + ( + get_key(3), + Lsn(0x28), + Value::WalRecord(NeonWalRecord::wal_append("@0x28")), + ), + ( + get_key(3), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append("@0x30")), + ), + ( + get_key(3), + Lsn(0x40), + Value::WalRecord(NeonWalRecord::wal_append("@0x40")), + ), + ]; + let delta2 = vec![ + ( + get_key(5), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ( + get_key(6), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ]; + let delta3 = vec![ + ( + get_key(8), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ( + get_key(9), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ]; + + let parent_tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![], // delta layers + vec![(Lsn(0x18), img_layer)], // image layers + Lsn(0x18), + ) + .await?; + + parent_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10))); + + let branch_tline = tenant + .branch_timeline_test_with_layers( + &parent_tline, + NEW_TIMELINE_ID, + Some(Lsn(0x18)), + &ctx, + vec![ + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3), + ], // delta layers + vec![], // image layers + Lsn(0x50), + ) + .await?; + + branch_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10))); + + { + // Update GC info + let mut guard = parent_tline.gc_info.write().unwrap(); + *guard = GcInfo { + retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id)], + cutoffs: GcCutoffs { + time: Lsn(0x10), + space: Lsn(0x10), + }, + leases: Default::default(), + within_ancestor_pitr: false, + }; + } + + { + // Update GC info + let mut guard = branch_tline.gc_info.write().unwrap(); + *guard = GcInfo { + retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id)], + cutoffs: GcCutoffs { + time: Lsn(0x50), + space: Lsn(0x50), + }, + leases: Default::default(), + within_ancestor_pitr: false, + }; + } + + let expected_result_at_gc_horizon = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20"), + Bytes::from_static(b"value 2@0x10@0x30"), + Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10@0x20"), + Bytes::from_static(b"value 6@0x10@0x20"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10@0x48"), + Bytes::from_static(b"value 9@0x10@0x48"), + ]; + + let expected_result_at_lsn_40 = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20"), + Bytes::from_static(b"value 2@0x10@0x30"), + Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10@0x20"), + Bytes::from_static(b"value 6@0x10@0x20"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let verify_result = || async { + for idx in 0..10 { + assert_eq!( + branch_tline + .get(get_key(idx as u32), Lsn(0x50), &ctx) + .await + .unwrap(), + &expected_result_at_gc_horizon[idx] + ); + assert_eq!( + branch_tline + .get(get_key(idx as u32), Lsn(0x40), &ctx) + .await + .unwrap(), + &expected_result_at_lsn_40[idx] + ); + } + }; + + verify_result().await; + + let cancel = CancellationToken::new(); + branch_tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); verify_result().await; diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 791eefebe9..8e9d349ca8 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -28,6 +28,12 @@ use crate::virtual_file::VirtualFile; use std::cmp::min; use std::io::{Error, ErrorKind}; +#[derive(Copy, Clone, Debug)] +pub struct CompressionInfo { + pub written_compressed: bool, + pub compressed_size: Option, +} + impl<'a> BlockCursor<'a> { /// Read a blob into a new buffer. pub async fn read_blob( @@ -273,8 +279,10 @@ impl BlobWriter { srcbuf: B, ctx: &RequestContext, ) -> (B::Buf, Result) { - self.write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled) - .await + let (buf, res) = self + .write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled) + .await; + (buf, res.map(|(off, _compression_info)| off)) } /// Write a blob of data. Returns the offset that it was written to, @@ -284,8 +292,12 @@ impl BlobWriter { srcbuf: B, ctx: &RequestContext, algorithm: ImageCompressionAlgorithm, - ) -> (B::Buf, Result) { + ) -> (B::Buf, Result<(u64, CompressionInfo), Error>) { let offset = self.offset; + let mut compression_info = CompressionInfo { + written_compressed: false, + compressed_size: None, + }; let len = srcbuf.bytes_init(); @@ -328,7 +340,9 @@ impl BlobWriter { encoder.write_all(&slice[..]).await.unwrap(); encoder.shutdown().await.unwrap(); let compressed = encoder.into_inner(); + compression_info.compressed_size = Some(compressed.len()); if compressed.len() < len { + compression_info.written_compressed = true; let compressed_len = compressed.len(); compressed_buf = Some(compressed); (BYTE_ZSTD, compressed_len, slice.into_inner()) @@ -359,7 +373,7 @@ impl BlobWriter { } else { self.write_all(srcbuf, ctx).await }; - (srcbuf, res.map(|_| offset)) + (srcbuf, res.map(|_| (offset, compression_info))) } } @@ -416,12 +430,14 @@ pub(crate) mod tests { let mut wtr = BlobWriter::::new(file, 0); for blob in blobs.iter() { let (_, res) = if compression { - wtr.write_blob_maybe_compressed( - blob.clone(), - ctx, - ImageCompressionAlgorithm::Zstd { level: Some(1) }, - ) - .await + let res = wtr + .write_blob_maybe_compressed( + blob.clone(), + ctx, + ImageCompressionAlgorithm::Zstd { level: Some(1) }, + ) + .await; + (res.0, res.1.map(|(off, _)| off)) } else { wtr.write_blob(blob.clone(), ctx).await }; diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index 1583a3826a..0107b0ac7e 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -296,13 +296,19 @@ where let mut stack = Vec::new(); stack.push((self.root_blk, None)); let block_cursor = self.reader.block_cursor(); + let mut node_buf = [0_u8; PAGE_SZ]; while let Some((node_blknum, opt_iter)) = stack.pop() { - // Locate the node. - let node_buf = block_cursor + // Read the node, through the PS PageCache, into local variable `node_buf`. + // We could keep the page cache read guard alive, but, at the time of writing, + // we run quite small PS PageCache s => can't risk running out of + // PageCache space because this stream isn't consumed fast enough. + let page_read_guard = block_cursor .read_blk(self.start_blk + node_blknum, ctx) .await?; + node_buf.copy_from_slice(page_read_guard.as_ref()); + drop(page_read_guard); // drop page cache read guard early - let node = OnDiskNode::deparse(node_buf.as_ref())?; + let node = OnDiskNode::deparse(&node_buf)?; let prefix_len = node.prefix_len as usize; let suffix_len = node.suffix_len as usize; @@ -345,6 +351,7 @@ where Either::Left(idx..node.num_children.into()) }; + // idx points to the first match now. Keep going from there while let Some(idx) = iter.next() { let key_off = idx * suffix_len; diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index bb65ae24fc..770f3ca5f0 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -29,6 +29,7 @@ impl EphemeralFile { conf: &PageServerConf, tenant_shard_id: TenantShardId, timeline_id: TimelineId, + gate_guard: utils::sync::gate::GateGuard, ctx: &RequestContext, ) -> Result { static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1); @@ -51,10 +52,12 @@ impl EphemeralFile { ) .await?; + let prewarm = conf.l0_flush.prewarm_on_write(); + Ok(EphemeralFile { _tenant_shard_id: tenant_shard_id, _timeline_id: timeline_id, - rw: page_caching::RW::new(file, conf.l0_flush.prewarm_on_write()), + rw: page_caching::RW::new(file, prewarm, gate_guard), }) } @@ -161,7 +164,11 @@ mod tests { async fn test_ephemeral_blobs() -> Result<(), io::Error> { let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?; - let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &ctx).await?; + let gate = utils::sync::gate::Gate::default(); + + let entered = gate.enter().unwrap(); + + let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, entered, &ctx).await?; let pos_foo = file.write_blob(b"foo", &ctx).await?; assert_eq!( @@ -215,4 +222,38 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn ephemeral_file_holds_gate_open() { + const FOREVER: std::time::Duration = std::time::Duration::from_secs(5); + + let (conf, tenant_id, timeline_id, ctx) = + harness("ephemeral_file_holds_gate_open").unwrap(); + + let gate = utils::sync::gate::Gate::default(); + + let file = EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx) + .await + .unwrap(); + + let mut closing = tokio::task::spawn(async move { + gate.close().await; + }); + + // gate is entered until the ephemeral file is dropped + // do not start paused tokio-epoll-uring has a sleep loop + tokio::time::pause(); + tokio::time::timeout(FOREVER, &mut closing) + .await + .expect_err("closing cannot complete before dropping"); + + // this is a requirement of the reset_tenant functionality: we have to be able to restart a + // tenant fast, and for that, we need all tenant_dir operations be guarded by entering a gate + drop(file); + + tokio::time::timeout(FOREVER, &mut closing) + .await + .expect("closing completes right away") + .expect("closing does not panic"); + } } diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs index 43b9fff28d..0a12b64a7c 100644 --- a/pageserver/src/tenant/ephemeral_file/page_caching.rs +++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs @@ -18,6 +18,8 @@ use super::zero_padded_read_write; pub struct RW { page_cache_file_id: page_cache::FileId, rw: super::zero_padded_read_write::RW, + /// Gate guard is held on as long as we need to do operations in the path (delete on drop). + _gate_guard: utils::sync::gate::GateGuard, } /// When we flush a block to the underlying [`crate::virtual_file::VirtualFile`], @@ -29,7 +31,11 @@ pub enum PrewarmOnWrite { } impl RW { - pub fn new(file: VirtualFile, prewarm_on_write: PrewarmOnWrite) -> Self { + pub fn new( + file: VirtualFile, + prewarm_on_write: PrewarmOnWrite, + _gate_guard: utils::sync::gate::GateGuard, + ) -> Self { let page_cache_file_id = page_cache::next_file_id(); Self { page_cache_file_id, @@ -38,6 +44,7 @@ impl RW { file, prewarm_on_write, )), + _gate_guard, } } @@ -145,6 +152,7 @@ impl Drop for RW { // We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed. // unlink the file + // we are clear to do this, because we have entered a gate let res = std::fs::remove_file(&self.rw.as_writer().file.path); if let Err(e) = res { if e.kind() != std::io::ErrorKind::NotFound { diff --git a/pageserver/src/tenant/gc_block.rs b/pageserver/src/tenant/gc_block.rs new file mode 100644 index 0000000000..8b41ba1746 --- /dev/null +++ b/pageserver/src/tenant/gc_block.rs @@ -0,0 +1,213 @@ +use std::collections::HashMap; + +use utils::id::TimelineId; + +use super::remote_timeline_client::index::GcBlockingReason; + +type Storage = HashMap>; + +#[derive(Default)] +pub(crate) struct GcBlock { + /// The timelines which have current reasons to block gc. + /// + /// LOCK ORDER: this is held locked while scheduling the next index_part update. This is done + /// to keep the this field up to date with RemoteTimelineClient `upload_queue.dirty`. + reasons: std::sync::Mutex, + blocking: tokio::sync::Mutex<()>, +} + +impl GcBlock { + /// Start another gc iteration. + /// + /// Returns a guard to be held for the duration of gc iteration to allow synchronizing with + /// it's ending, or if not currently possible, a value describing the reasons why not. + /// + /// Cancellation safe. + pub(super) async fn start(&self) -> Result, BlockingReasons> { + let reasons = { + let g = self.reasons.lock().unwrap(); + + // TODO: the assumption is that this method gets called periodically. in prod, we use 1h, in + // tests, we use everything. we should warn if the gc has been consecutively blocked + // for more than 1h (within single tenant session?). + BlockingReasons::clean_and_summarize(g) + }; + + if let Some(reasons) = reasons { + Err(reasons) + } else { + Ok(Guard { + _inner: self.blocking.lock().await, + }) + } + } + + pub(crate) fn summary(&self) -> Option { + let g = self.reasons.lock().unwrap(); + + BlockingReasons::summarize(&g) + } + + /// Start blocking gc for this one timeline for the given reason. + /// + /// This is not a guard based API but instead it mimics set API. The returned future will not + /// resolve until an existing gc round has completed. + /// + /// Returns true if this block was new, false if gc was already blocked for this reason. + /// + /// Cancellation safe: cancelling after first poll will keep the reason to block gc, but will + /// keep the gc blocking reason. + pub(crate) async fn insert( + &self, + timeline: &super::Timeline, + reason: GcBlockingReason, + ) -> anyhow::Result { + let (added, uploaded) = { + let mut g = self.reasons.lock().unwrap(); + let set = g.entry(timeline.timeline_id).or_default(); + let added = set.insert(reason); + + // LOCK ORDER: intentionally hold the lock, see self.reasons. + let uploaded = timeline + .remote_client + .schedule_insert_gc_block_reason(reason)?; + + (added, uploaded) + }; + + uploaded.await?; + + // ensure that any ongoing gc iteration has completed + drop(self.blocking.lock().await); + + Ok(added) + } + + /// Remove blocking gc for this one timeline and the given reason. + pub(crate) async fn remove( + &self, + timeline: &super::Timeline, + reason: GcBlockingReason, + ) -> anyhow::Result<()> { + use std::collections::hash_map::Entry; + + super::span::debug_assert_current_span_has_tenant_and_timeline_id(); + + let (remaining_blocks, uploaded) = { + let mut g = self.reasons.lock().unwrap(); + match g.entry(timeline.timeline_id) { + Entry::Occupied(mut oe) => { + let set = oe.get_mut(); + set.remove(reason); + if set.is_empty() { + oe.remove(); + } + } + Entry::Vacant(_) => { + // we must still do the index_part.json update regardless, in case we had earlier + // been cancelled + } + } + + let remaining_blocks = g.len(); + + // LOCK ORDER: intentionally hold the lock while scheduling; see self.reasons + let uploaded = timeline + .remote_client + .schedule_remove_gc_block_reason(reason)?; + + (remaining_blocks, uploaded) + }; + uploaded.await?; + + // no need to synchronize with gc iteration again + + if remaining_blocks > 0 { + tracing::info!(remaining_blocks, removed=?reason, "gc blocking removed, but gc remains blocked"); + } else { + tracing::info!("gc is now unblocked for the tenant"); + } + + Ok(()) + } + + pub(crate) fn before_delete(&self, timeline: &super::Timeline) { + let unblocked = { + let mut g = self.reasons.lock().unwrap(); + if g.is_empty() { + return; + } + + g.remove(&timeline.timeline_id); + + BlockingReasons::clean_and_summarize(g).is_none() + }; + + if unblocked { + tracing::info!("gc is now unblocked following deletion"); + } + } + + /// Initialize with the non-deleted timelines of this tenant. + pub(crate) fn set_scanned(&self, scanned: Storage) { + let mut g = self.reasons.lock().unwrap(); + assert!(g.is_empty()); + g.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty())); + + if let Some(reasons) = BlockingReasons::clean_and_summarize(g) { + tracing::info!(summary=?reasons, "initialized with gc blocked"); + } + } +} + +pub(super) struct Guard<'a> { + _inner: tokio::sync::MutexGuard<'a, ()>, +} + +#[derive(Debug)] +pub(crate) struct BlockingReasons { + timelines: usize, + reasons: enumset::EnumSet, +} + +impl std::fmt::Display for BlockingReasons { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{} timelines block for {:?}", + self.timelines, self.reasons + ) + } +} + +impl BlockingReasons { + fn clean_and_summarize(mut g: std::sync::MutexGuard<'_, Storage>) -> Option { + let mut reasons = enumset::EnumSet::empty(); + g.retain(|_key, value| { + reasons = reasons.union(*value); + !value.is_empty() + }); + if !g.is_empty() { + Some(BlockingReasons { + timelines: g.len(), + reasons, + }) + } else { + None + } + } + + fn summarize(g: &std::sync::MutexGuard<'_, Storage>) -> Option { + if g.is_empty() { + None + } else { + let reasons = g + .values() + .fold(enumset::EnumSet::empty(), |acc, next| acc.union(*next)); + Some(BlockingReasons { + timelines: g.len(), + reasons, + }) + } + } +} diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 72167d02ab..844f117ea2 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -51,7 +51,8 @@ use crate::keyspace::KeyPartitioning; use crate::repository::Key; use crate::tenant::storage_layer::InMemoryLayer; use anyhow::Result; -use pageserver_api::keyspace::KeySpaceAccum; +use pageserver_api::keyspace::{KeySpace, KeySpaceAccum}; +use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze}; use std::collections::{HashMap, VecDeque}; use std::iter::Peekable; use std::ops::Range; @@ -61,7 +62,7 @@ use utils::lsn::Lsn; use historic_layer_coverage::BufferedHistoricLayerCoverage; pub use historic_layer_coverage::LayerKey; -use super::storage_layer::PersistentLayerDesc; +use super::storage_layer::{LayerVisibilityHint, PersistentLayerDesc}; /// /// LayerMap tracks what layers exist on a timeline. @@ -845,8 +846,8 @@ impl LayerMap { } /// Return all L0 delta layers - pub fn get_level0_deltas(&self) -> Result>> { - Ok(self.l0_delta_layers.to_vec()) + pub fn level0_deltas(&self) -> &Vec> { + &self.l0_delta_layers } /// debugging function to print out the contents of the layer map @@ -871,11 +872,183 @@ impl LayerMap { println!("End dump LayerMap"); Ok(()) } + + /// `read_points` represent the tip of a timeline and any branch points, i.e. the places + /// where we expect to serve reads. + /// + /// This function is O(N) and should be called infrequently. The caller is responsible for + /// looking up and updating the Layer objects for these layer descriptors. + pub fn get_visibility( + &self, + mut read_points: Vec, + ) -> ( + Vec<(Arc, LayerVisibilityHint)>, + KeySpace, + ) { + // This is like a KeySpace, but this type is intended for efficient unions with image layer ranges, whereas + // KeySpace is intended to be composed statically and iterated over. + struct KeyShadow { + // Map of range start to range end + inner: RangeSetBlaze, + } + + impl KeyShadow { + fn new() -> Self { + Self { + inner: Default::default(), + } + } + + fn contains(&self, range: Range) -> bool { + let range_incl = range.start.to_i128()..=range.end.to_i128() - 1; + self.inner.is_superset(&RangeSetBlaze::from_sorted_disjoint( + CheckSortedDisjoint::from([range_incl]), + )) + } + + /// Add the input range to the keys covered by self. + /// + /// Return true if inserting this range covered some keys that were previously not covered + fn cover(&mut self, insert: Range) -> bool { + let range_incl = insert.start.to_i128()..=insert.end.to_i128() - 1; + self.inner.ranges_insert(range_incl) + } + + fn reset(&mut self) { + self.inner = Default::default(); + } + + fn to_keyspace(&self) -> KeySpace { + let mut accum = KeySpaceAccum::new(); + for range_incl in self.inner.ranges() { + let range = Range { + start: Key::from_i128(*range_incl.start()), + end: Key::from_i128(range_incl.end() + 1), + }; + accum.add_range(range) + } + + accum.to_keyspace() + } + } + + // The 'shadow' will be updated as we sweep through the layers: an image layer subtracts from the shadow, + // and a ReadPoint + read_points.sort_by_key(|rp| rp.0); + let mut shadow = KeyShadow::new(); + + // We will interleave all our read points and layers into a sorted collection + enum Item { + ReadPoint { lsn: Lsn }, + Layer(Arc), + } + + let mut items = Vec::with_capacity(self.historic.len() + read_points.len()); + items.extend(self.iter_historic_layers().map(Item::Layer)); + items.extend( + read_points + .into_iter() + .map(|rp| Item::ReadPoint { lsn: rp }), + ); + + // Ordering: we want to iterate like this: + // 1. Highest LSNs first + // 2. Consider images before deltas if they end at the same LSNs (images cover deltas) + // 3. Consider ReadPoints before image layers if they're at the same LSN (readpoints make that image visible) + items.sort_by_key(|item| { + std::cmp::Reverse(match item { + Item::Layer(layer) => { + if layer.is_delta() { + (Lsn(layer.get_lsn_range().end.0 - 1), 0) + } else { + (layer.image_layer_lsn(), 1) + } + } + Item::ReadPoint { lsn } => (*lsn, 2), + }) + }); + + let mut results = Vec::with_capacity(self.historic.len()); + + let mut maybe_covered_deltas: Vec> = Vec::new(); + + for item in items { + let (reached_lsn, is_readpoint) = match &item { + Item::ReadPoint { lsn } => (lsn, true), + Item::Layer(layer) => (&layer.lsn_range.start, false), + }; + maybe_covered_deltas.retain(|d| { + if *reached_lsn >= d.lsn_range.start && is_readpoint { + // We encountered a readpoint within the delta layer: it is visible + + results.push((d.clone(), LayerVisibilityHint::Visible)); + false + } else if *reached_lsn < d.lsn_range.start { + // We passed the layer's range without encountering a read point: it is not visible + results.push((d.clone(), LayerVisibilityHint::Covered)); + false + } else { + // We're still in the delta layer: continue iterating + true + } + }); + + match item { + Item::ReadPoint { lsn: _lsn } => { + // TODO: propagate the child timeline's shadow from their own run of this function, so that we don't have + // to assume that the whole key range is visible at the branch point. + shadow.reset(); + } + Item::Layer(layer) => { + let visibility = if layer.is_delta() { + if shadow.contains(layer.get_key_range()) { + // If a layer isn't visible based on current state, we must defer deciding whether + // it is truly not visible until we have advanced past the delta's range: we might + // encounter another branch point within this delta layer's LSN range. + maybe_covered_deltas.push(layer); + continue; + } else { + LayerVisibilityHint::Visible + } + } else { + let modified = shadow.cover(layer.get_key_range()); + if modified { + // An image layer in a region which wasn't fully covered yet: this layer is visible, but layers below it will be covered + LayerVisibilityHint::Visible + } else { + // An image layer in a region that was already covered + LayerVisibilityHint::Covered + } + }; + + results.push((layer, visibility)); + } + } + } + + // Drain any remaining maybe_covered deltas + results.extend( + maybe_covered_deltas + .into_iter() + .map(|d| (d, LayerVisibilityHint::Covered)), + ); + + (results, shadow.to_keyspace()) + } } #[cfg(test)] mod tests { - use pageserver_api::keyspace::KeySpace; + use crate::tenant::{storage_layer::LayerName, IndexPart}; + use pageserver_api::{ + key::DBDIR_KEY, + keyspace::{KeySpace, KeySpaceRandomAccum}, + }; + use std::{collections::HashMap, path::PathBuf}; + use utils::{ + id::{TenantId, TimelineId}, + shard::TenantShardId, + }; use super::*; @@ -1002,4 +1175,299 @@ mod tests { } } } + + #[test] + fn layer_visibility_basic() { + // A simple synthetic input, as a smoke test. + let tenant_shard_id = TenantShardId::unsharded(TenantId::generate()); + let timeline_id = TimelineId::generate(); + let mut layer_map = LayerMap::default(); + let mut updates = layer_map.batch_update(); + + const FAKE_LAYER_SIZE: u64 = 1024; + + let inject_delta = |updates: &mut BatchedUpdates, + key_start: i128, + key_end: i128, + lsn_start: u64, + lsn_end: u64| { + let desc = PersistentLayerDesc::new_delta( + tenant_shard_id, + timeline_id, + Range { + start: Key::from_i128(key_start), + end: Key::from_i128(key_end), + }, + Range { + start: Lsn(lsn_start), + end: Lsn(lsn_end), + }, + 1024, + ); + updates.insert_historic(desc.clone()); + desc + }; + + let inject_image = + |updates: &mut BatchedUpdates, key_start: i128, key_end: i128, lsn: u64| { + let desc = PersistentLayerDesc::new_img( + tenant_shard_id, + timeline_id, + Range { + start: Key::from_i128(key_start), + end: Key::from_i128(key_end), + }, + Lsn(lsn), + FAKE_LAYER_SIZE, + ); + updates.insert_historic(desc.clone()); + desc + }; + + // + // Construct our scenario: the following lines go in backward-LSN order, constructing the various scenarios + // we expect to handle. You can follow these examples through in the same order as they would be processed + // by the function under test. + // + + let mut read_points = vec![Lsn(1000)]; + + // A delta ahead of any image layer + let ahead_layer = inject_delta(&mut updates, 10, 20, 101, 110); + + // An image layer is visible and covers some layers beneath itself + let visible_covering_img = inject_image(&mut updates, 5, 25, 99); + + // A delta layer covered by the image layer: should be covered + let covered_delta = inject_delta(&mut updates, 10, 20, 90, 100); + + // A delta layer partially covered by an image layer: should be visible + let partially_covered_delta = inject_delta(&mut updates, 1, 7, 90, 100); + + // A delta layer not covered by an image layer: should be visible + let not_covered_delta = inject_delta(&mut updates, 1, 4, 90, 100); + + // An image layer covered by the image layer above: should be covered + let covered_image = inject_image(&mut updates, 10, 20, 89); + + // An image layer partially covered by an image layer: should be visible + let partially_covered_image = inject_image(&mut updates, 1, 7, 89); + + // An image layer not covered by an image layer: should be visible + let not_covered_image = inject_image(&mut updates, 1, 4, 89); + + // A read point: this will make subsequent layers below here visible, even if there are + // more recent layers covering them. + read_points.push(Lsn(80)); + + // A delta layer covered by an earlier image layer, but visible to a readpoint below that covering layer + let covered_delta_below_read_point = inject_delta(&mut updates, 10, 20, 70, 79); + + // A delta layer whose end LSN is covered, but where a read point is present partway through its LSN range: + // the read point should make it visible, even though its end LSN is covered + let covering_img_between_read_points = inject_image(&mut updates, 10, 20, 69); + let covered_delta_between_read_points = inject_delta(&mut updates, 10, 15, 67, 69); + read_points.push(Lsn(65)); + let covered_delta_intersects_read_point = inject_delta(&mut updates, 15, 20, 60, 69); + + let visible_img_after_last_read_point = inject_image(&mut updates, 10, 20, 65); + + updates.flush(); + + let (layer_visibilities, shadow) = layer_map.get_visibility(read_points); + let layer_visibilities = layer_visibilities.into_iter().collect::>(); + + assert_eq!( + layer_visibilities.get(&ahead_layer), + Some(&LayerVisibilityHint::Visible) + ); + assert_eq!( + layer_visibilities.get(&visible_covering_img), + Some(&LayerVisibilityHint::Visible) + ); + assert_eq!( + layer_visibilities.get(&covered_delta), + Some(&LayerVisibilityHint::Covered) + ); + assert_eq!( + layer_visibilities.get(&partially_covered_delta), + Some(&LayerVisibilityHint::Visible) + ); + assert_eq!( + layer_visibilities.get(¬_covered_delta), + Some(&LayerVisibilityHint::Visible) + ); + assert_eq!( + layer_visibilities.get(&covered_image), + Some(&LayerVisibilityHint::Covered) + ); + assert_eq!( + layer_visibilities.get(&partially_covered_image), + Some(&LayerVisibilityHint::Visible) + ); + assert_eq!( + layer_visibilities.get(¬_covered_image), + Some(&LayerVisibilityHint::Visible) + ); + assert_eq!( + layer_visibilities.get(&covered_delta_below_read_point), + Some(&LayerVisibilityHint::Visible) + ); + assert_eq!( + layer_visibilities.get(&covering_img_between_read_points), + Some(&LayerVisibilityHint::Visible) + ); + assert_eq!( + layer_visibilities.get(&covered_delta_between_read_points), + Some(&LayerVisibilityHint::Covered) + ); + assert_eq!( + layer_visibilities.get(&covered_delta_intersects_read_point), + Some(&LayerVisibilityHint::Visible) + ); + assert_eq!( + layer_visibilities.get(&visible_img_after_last_read_point), + Some(&LayerVisibilityHint::Visible) + ); + + // Shadow should include all the images below the last read point + let expected_shadow = KeySpace { + ranges: vec![Key::from_i128(10)..Key::from_i128(20)], + }; + assert_eq!(shadow, expected_shadow); + } + + fn fixture_path(relative: &str) -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative) + } + + #[test] + fn layer_visibility_realistic() { + // Load a large example layermap + let index_raw = std::fs::read_to_string(fixture_path( + "test_data/indices/mixed_workload/index_part.json", + )) + .unwrap(); + let index: IndexPart = serde_json::from_str::(&index_raw).unwrap(); + + let tenant_id = TenantId::generate(); + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + let timeline_id = TimelineId::generate(); + + let mut layer_map = LayerMap::default(); + let mut updates = layer_map.batch_update(); + for (layer_name, layer_metadata) in index.layer_metadata { + let layer_desc = match layer_name { + LayerName::Image(layer_name) => PersistentLayerDesc { + key_range: layer_name.key_range.clone(), + lsn_range: layer_name.lsn_as_range(), + tenant_shard_id, + timeline_id, + is_delta: false, + file_size: layer_metadata.file_size, + }, + LayerName::Delta(layer_name) => PersistentLayerDesc { + key_range: layer_name.key_range, + lsn_range: layer_name.lsn_range, + tenant_shard_id, + timeline_id, + is_delta: true, + file_size: layer_metadata.file_size, + }, + }; + updates.insert_historic(layer_desc); + } + updates.flush(); + + let read_points = vec![index.metadata.disk_consistent_lsn()]; + let (layer_visibilities, shadow) = layer_map.get_visibility(read_points); + for (layer_desc, visibility) in &layer_visibilities { + tracing::info!("{layer_desc:?}: {visibility:?}"); + eprintln!("{layer_desc:?}: {visibility:?}"); + } + + // The shadow should be non-empty, since there were some image layers + assert!(!shadow.ranges.is_empty()); + + // At least some layers should be marked covered + assert!(layer_visibilities + .iter() + .any(|i| matches!(i.1, LayerVisibilityHint::Covered))); + + let layer_visibilities = layer_visibilities.into_iter().collect::>(); + + // Brute force validation: a layer should be marked covered if and only if there are image layers above it in LSN order which cover it + for (layer_desc, visible) in &layer_visibilities { + let mut coverage = KeySpaceRandomAccum::new(); + let mut covered_by = Vec::new(); + + for other_layer in layer_map.iter_historic_layers() { + if &other_layer == layer_desc { + continue; + } + if !other_layer.is_delta() + && other_layer.image_layer_lsn() >= Lsn(layer_desc.get_lsn_range().end.0 - 1) + && other_layer.key_range.start <= layer_desc.key_range.end + && layer_desc.key_range.start <= other_layer.key_range.end + { + coverage.add_range(other_layer.get_key_range()); + covered_by.push((*other_layer).clone()); + } + } + let coverage = coverage.to_keyspace(); + + let expect_visible = if coverage.ranges.len() == 1 + && coverage.contains(&layer_desc.key_range.start) + && coverage.contains(&Key::from_i128(layer_desc.key_range.end.to_i128() - 1)) + { + LayerVisibilityHint::Covered + } else { + LayerVisibilityHint::Visible + }; + + if expect_visible != *visible { + eprintln!( + "Layer {}..{} @ {}..{} (delta={}) is {visible:?}, should be {expect_visible:?}", + layer_desc.key_range.start, + layer_desc.key_range.end, + layer_desc.lsn_range.start, + layer_desc.lsn_range.end, + layer_desc.is_delta() + ); + if expect_visible == LayerVisibilityHint::Covered { + eprintln!("Covered by:"); + for other in covered_by { + eprintln!( + " {}..{} @ {}", + other.get_key_range().start, + other.get_key_range().end, + other.image_layer_lsn() + ); + } + if let Some(range) = coverage.ranges.first() { + eprintln!( + "Total coverage from contributing layers: {}..{}", + range.start, range.end + ); + } else { + eprintln!( + "Total coverage from contributing layers: {:?}", + coverage.ranges + ); + } + } + } + assert_eq!(expect_visible, *visible); + } + + // Sanity: the layer that holds latest data for the DBDIR key should always be visible + // (just using this key as a key that will always exist for any layermap fixture) + let dbdir_layer = layer_map + .search(DBDIR_KEY, index.metadata.disk_consistent_lsn()) + .unwrap(); + assert!(matches!( + layer_visibilities.get(&dbdir_layer.layer).unwrap(), + LayerVisibilityHint::Visible + )); + } } diff --git a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs index 347490c1ba..136f68bc36 100644 --- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs +++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs @@ -521,6 +521,10 @@ impl BufferedHistoricLayerCoverage { Ok(&self.historic_coverage) } + + pub(crate) fn len(&self) -> usize { + self.layers.len() + } } #[test] diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 6ba1bdef9b..bbc070a81b 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -111,7 +111,7 @@ impl TryFrom<&TimelineMetadataBodyV2> for TimelineMetadataHeader { #[error("re-serializing for crc32 failed")] struct Crc32CalculationFailed(#[source] utils::bin_ser::SerializeError); -const METADATA_HDR_SIZE: usize = std::mem::size_of::(); +const METADATA_HDR_SIZE: usize = size_of::(); #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] struct TimelineMetadataBodyV2 { diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index f23e6ff9d6..3f592f167e 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -13,7 +13,7 @@ use pageserver_api::upcall_api::ReAttachResponseTenant; use rand::{distributions::Alphanumeric, Rng}; use std::borrow::Cow; use std::cmp::Ordering; -use std::collections::{BTreeMap, HashMap}; +use std::collections::{BTreeMap, HashMap, HashSet}; use std::ops::Deref; use std::sync::Arc; use std::time::Duration; @@ -55,7 +55,7 @@ use utils::id::{TenantId, TimelineId}; use super::remote_timeline_client::remote_tenant_path; use super::secondary::SecondaryTenant; use super::timeline::detach_ancestor::PreparedTimelineDetach; -use super::TenantSharedResources; +use super::{GlobalShutDown, TenantSharedResources}; /// For a tenant that appears in TenantsMap, it may either be /// - `Attached`: has a full Tenant object, is elegible to service @@ -116,8 +116,6 @@ pub(crate) enum ShardSelector { /// Only return the 0th shard, if it is present. If a non-0th shard is present, /// ignore it. Zero, - /// Pick the first shard we find for the TenantId - First, /// Pick the shard that holds this key Page(Key), /// The shard ID is known: pick the given shard @@ -667,17 +665,20 @@ pub async fn init_tenant_mgr( let tenant_dir_path = conf.tenant_path(&tenant_shard_id); let shard_identity = location_conf.shard; let slot = match location_conf.mode { - LocationMode::Attached(attached_conf) => TenantSlot::Attached(tenant_spawn( - conf, - tenant_shard_id, - &tenant_dir_path, - resources.clone(), - AttachedTenantConf::new(location_conf.tenant_conf, attached_conf), - shard_identity, - Some(init_order.clone()), - SpawnMode::Lazy, - &ctx, - )), + LocationMode::Attached(attached_conf) => TenantSlot::Attached( + tenant_spawn( + conf, + tenant_shard_id, + &tenant_dir_path, + resources.clone(), + AttachedTenantConf::new(location_conf.tenant_conf, attached_conf), + shard_identity, + Some(init_order.clone()), + SpawnMode::Lazy, + &ctx, + ) + .expect("global shutdown during init_tenant_mgr cannot happen"), + ), LocationMode::Secondary(secondary_conf) => { info!( tenant_id = %tenant_shard_id.tenant_id, @@ -725,7 +726,7 @@ fn tenant_spawn( init_order: Option, mode: SpawnMode, ctx: &RequestContext, -) -> Arc { +) -> Result, GlobalShutDown> { // All these conditions should have been satisfied by our caller: the tenant dir exists, is a well formed // path, and contains a configuration file. Assertions that do synchronous I/O are limited to debug mode // to avoid impacting prod runtime performance. @@ -1192,7 +1193,10 @@ impl TenantManager { None, spawn_mode, ctx, - ); + ) + .map_err(|_: GlobalShutDown| { + UpsertLocationError::Unavailable(TenantMapError::ShuttingDown) + })?; TenantSlot::Attached(tenant) } @@ -1313,7 +1317,7 @@ impl TenantManager { None, SpawnMode::Eager, ctx, - ); + )?; slot_guard.upsert(TenantSlot::Attached(tenant))?; @@ -1384,33 +1388,32 @@ impl TenantManager { tenant_shard_id: TenantShardId, ) -> Result<(), DeleteTenantError> { let remote_path = remote_tenant_path(&tenant_shard_id); - let keys = match self - .resources - .remote_storage - .list( - Some(&remote_path), - remote_storage::ListingMode::NoDelimiter, - None, - &self.cancel, - ) - .await - { - Ok(listing) => listing.keys, - Err(remote_storage::DownloadError::Cancelled) => { - return Err(DeleteTenantError::Cancelled) - } - Err(remote_storage::DownloadError::NotFound) => return Ok(()), - Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))), - }; + let mut keys_stream = self.resources.remote_storage.list_streaming( + Some(&remote_path), + remote_storage::ListingMode::NoDelimiter, + None, + &self.cancel, + ); + while let Some(chunk) = keys_stream.next().await { + let keys = match chunk { + Ok(listing) => listing.keys, + Err(remote_storage::DownloadError::Cancelled) => { + return Err(DeleteTenantError::Cancelled) + } + Err(remote_storage::DownloadError::NotFound) => return Ok(()), + Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))), + }; - if keys.is_empty() { - tracing::info!("Remote storage already deleted"); - } else { - tracing::info!("Deleting {} keys from remote storage", keys.len()); - self.resources - .remote_storage - .delete_objects(&keys, &self.cancel) - .await?; + if keys.is_empty() { + tracing::info!("Remote storage already deleted"); + } else { + tracing::info!("Deleting {} keys from remote storage", keys.len()); + let keys = keys.into_iter().map(|o| o.key).collect::>(); + self.resources + .remote_storage + .delete_objects(&keys, &self.cancel) + .await?; + } } Ok(()) @@ -1764,14 +1767,9 @@ impl TenantManager { let parent_timelines = timelines.keys().cloned().collect::>(); for timeline in timelines.values() { tracing::info!(timeline_id=%timeline.timeline_id, "Loading list of layers to hardlink"); - let timeline_layers = timeline - .layers - .read() - .await - .likely_resident_layers() - .collect::>(); + let layers = timeline.layers.read().await; - for layer in timeline_layers { + for layer in layers.likely_resident_layers() { let relative_path = layer .local_path() .strip_prefix(&parent_path) @@ -1968,7 +1966,8 @@ impl TenantManager { timeline_id: TimelineId, prepared: PreparedTimelineDetach, ctx: &RequestContext, - ) -> Result, anyhow::Error> { + ) -> Result, anyhow::Error> { + // FIXME: this is unnecessary, slotguard already has these semantics struct RevertOnDropSlot(Option); impl Drop for RevertOnDropSlot { @@ -2048,7 +2047,7 @@ impl TenantManager { None, SpawnMode::Eager, ctx, - ); + )?; slot_guard.upsert(TenantSlot::Attached(tenant))?; @@ -2089,7 +2088,6 @@ impl TenantManager { }; match selector { - ShardSelector::First => return ShardResolveResult::Found(tenant.clone()), ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => { return ShardResolveResult::Found(tenant.clone()) } @@ -2171,6 +2169,9 @@ pub(crate) enum GetActiveTenantError { /// never happen. #[error("Tenant is broken: {0}")] Broken(String), + + #[error("reconnect to switch tenant id")] + SwitchedTenant, } #[derive(Debug, thiserror::Error)] diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index c75d1eaa5e..1344fe4192 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -187,7 +187,7 @@ use camino::Utf8Path; use chrono::{NaiveDateTime, Utc}; pub(crate) use download::download_initdb_tar_zst; -use pageserver_api::models::AuxFilePolicy; +use pageserver_api::models::{AuxFilePolicy, TimelineArchivalState}; use pageserver_api::shard::{ShardIndex, TenantShardId}; use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; @@ -287,6 +287,14 @@ pub enum PersistIndexPartWithDeletedFlagError { Other(#[from] anyhow::Error), } +#[derive(Debug, thiserror::Error)] +pub enum WaitCompletionError { + #[error(transparent)] + NotInitialized(NotInitialized), + #[error("wait_completion aborted because upload queue was stopped")] + UploadQueueShutDownOrStopped, +} + /// A client for accessing a timeline's data in remote storage. /// /// This takes care of managing the number of connections, and balancing them @@ -449,6 +457,17 @@ impl RemoteTimelineClient { .unwrap_or(false) } + /// Returns whether the timeline is archived. + /// Return None if the remote index_part hasn't been downloaded yet. + pub(crate) fn is_archived(&self) -> Option { + self.upload_queue + .lock() + .unwrap() + .initialized_mut() + .map(|q| q.clean.0.archived_at.is_some()) + .ok() + } + fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) { let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part { current_remote_index_part @@ -609,7 +628,7 @@ impl RemoteTimelineClient { Ok(()) } - /// Launch an index-file upload operation in the background, with only aux_file_policy flag updated. + /// Launch an index-file upload operation in the background, with only the `aux_file_policy` flag updated. pub(crate) fn schedule_index_upload_for_aux_file_policy_update( self: &Arc, last_aux_file_policy: Option, @@ -620,6 +639,48 @@ impl RemoteTimelineClient { self.schedule_index_upload(upload_queue)?; Ok(()) } + + /// Launch an index-file upload operation in the background, with only the `archived_at` field updated. + /// + /// Returns whether it is required to wait for the queue to be empty to ensure that the change is uploaded, + /// so either if the change is already sitting in the queue, but not commited yet, or the change has not + /// been in the queue yet. + pub(crate) fn schedule_index_upload_for_timeline_archival_state( + self: &Arc, + state: TimelineArchivalState, + ) -> anyhow::Result { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + + /// Returns Some(_) if a change is needed, and Some(true) if it's a + /// change needed to set archived_at. + fn need_change( + archived_at: &Option, + state: TimelineArchivalState, + ) -> Option { + match (archived_at, state) { + (Some(_), TimelineArchivalState::Archived) + | (None, TimelineArchivalState::Unarchived) => { + // Nothing to do + tracing::info!("intended state matches present state"); + None + } + (None, TimelineArchivalState::Archived) => Some(true), + (Some(_), TimelineArchivalState::Unarchived) => Some(false), + } + } + let need_upload_scheduled = need_change(&upload_queue.dirty.archived_at, state); + + if let Some(archived_at_set) = need_upload_scheduled { + let intended_archived_at = archived_at_set.then(|| Utc::now().naive_utc()); + upload_queue.dirty.archived_at = intended_archived_at; + self.schedule_index_upload(upload_queue)?; + } + + let need_wait = need_change(&upload_queue.clean.0.archived_at, state).is_some(); + Ok(need_wait) + } + /// /// Launch an index-file upload operation in the background, if necessary. /// @@ -630,7 +691,7 @@ impl RemoteTimelineClient { /// /// Like schedule_index_upload_for_metadata_update(), this merely adds /// the upload to the upload queue and returns quickly. - pub fn schedule_index_upload_for_file_changes(self: &Arc) -> anyhow::Result<()> { + pub fn schedule_index_upload_for_file_changes(self: &Arc) -> Result<(), NotInitialized> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -645,7 +706,7 @@ impl RemoteTimelineClient { fn schedule_index_upload( self: &Arc, upload_queue: &mut UploadQueueInitialized, - ) -> anyhow::Result<()> { + ) -> Result<(), NotInitialized> { let disk_consistent_lsn = upload_queue.dirty.metadata.disk_consistent_lsn(); // fix up the duplicated field upload_queue.dirty.disk_consistent_lsn = disk_consistent_lsn; @@ -653,7 +714,7 @@ impl RemoteTimelineClient { // make sure it serializes before doing it in perform_upload_task so that it doesn't // look like a retryable error let void = std::io::sink(); - serde_json::to_writer(void, &upload_queue.dirty).context("serialize index_part.json")?; + serde_json::to_writer(void, &upload_queue.dirty).expect("serialize index_part.json"); let index_part = &upload_queue.dirty; @@ -699,7 +760,9 @@ impl RemoteTimelineClient { self.schedule_barrier0(upload_queue) }; - Self::wait_completion0(receiver).await + Self::wait_completion0(receiver) + .await + .context("wait completion") } /// Schedules uploading a new version of `index_part.json` with the given layers added, @@ -732,7 +795,126 @@ impl RemoteTimelineClient { barrier }; - Self::wait_completion0(barrier).await + Self::wait_completion0(barrier) + .await + .context("wait completion") + } + + /// Adds a gc blocking reason for this timeline if one does not exist already. + /// + /// A retryable step of timeline detach ancestor. + /// + /// Returns a future which waits until the completion of the upload. + pub(crate) fn schedule_insert_gc_block_reason( + self: &Arc, + reason: index::GcBlockingReason, + ) -> Result>, NotInitialized> + { + let maybe_barrier = { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + + if let index::GcBlockingReason::DetachAncestor = reason { + if upload_queue.dirty.metadata.ancestor_timeline().is_none() { + drop(guard); + panic!("cannot start detach ancestor if there is nothing to detach from"); + } + } + + let wanted = |x: Option<&index::GcBlocking>| x.is_some_and(|x| x.blocked_by(reason)); + + let current = upload_queue.dirty.gc_blocking.as_ref(); + let uploaded = upload_queue.clean.0.gc_blocking.as_ref(); + + match (current, uploaded) { + (x, y) if wanted(x) && wanted(y) => None, + (x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)), + // Usual case: !wanted(x) && !wanted(y) + // + // Unusual: !wanted(x) && wanted(y) which means we have two processes waiting to + // turn on and off some reason. + (x, y) => { + if !wanted(x) && wanted(y) { + // this could be avoided by having external in-memory synchronization, like + // timeline detach ancestor + warn!(?reason, op="insert", "unexpected: two racing processes to enable and disable a gc blocking reason"); + } + + // at this point, the metadata must always show that there is a parent + upload_queue.dirty.gc_blocking = current + .map(|x| x.with_reason(reason)) + .or_else(|| Some(index::GcBlocking::started_now_for(reason))); + self.schedule_index_upload(upload_queue)?; + Some(self.schedule_barrier0(upload_queue)) + } + } + }; + + Ok(async move { + if let Some(barrier) = maybe_barrier { + Self::wait_completion0(barrier).await?; + } + Ok(()) + }) + } + + /// Removes a gc blocking reason for this timeline if one exists. + /// + /// A retryable step of timeline detach ancestor. + /// + /// Returns a future which waits until the completion of the upload. + pub(crate) fn schedule_remove_gc_block_reason( + self: &Arc, + reason: index::GcBlockingReason, + ) -> Result>, NotInitialized> + { + let maybe_barrier = { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + + if let index::GcBlockingReason::DetachAncestor = reason { + if !upload_queue + .clean + .0 + .lineage + .is_detached_from_original_ancestor() + { + drop(guard); + panic!("cannot complete timeline_ancestor_detach while not detached"); + } + } + + let wanted = |x: Option<&index::GcBlocking>| { + x.is_none() || x.is_some_and(|b| !b.blocked_by(reason)) + }; + + let current = upload_queue.dirty.gc_blocking.as_ref(); + let uploaded = upload_queue.clean.0.gc_blocking.as_ref(); + + match (current, uploaded) { + (x, y) if wanted(x) && wanted(y) => None, + (x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)), + (x, y) => { + if !wanted(x) && wanted(y) { + warn!(?reason, op="remove", "unexpected: two racing processes to enable and disable a gc blocking reason (remove)"); + } + + upload_queue.dirty.gc_blocking = + current.as_ref().and_then(|x| x.without_reason(reason)); + assert!(wanted(upload_queue.dirty.gc_blocking.as_ref())); + // FIXME: bogus ? + self.schedule_index_upload(upload_queue)?; + Some(self.schedule_barrier0(upload_queue)) + } + } + }; + + Ok(async move { + if let Some(barrier) = maybe_barrier { + Self::wait_completion0(barrier).await?; + } + Ok(()) + }) } /// Launch an upload operation in the background; the file is added to be included in next @@ -740,7 +922,7 @@ impl RemoteTimelineClient { pub(crate) fn schedule_layer_file_upload( self: &Arc, layer: ResidentLayer, - ) -> anyhow::Result<()> { + ) -> Result<(), NotInitialized> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -826,7 +1008,7 @@ impl RemoteTimelineClient { self: &Arc, upload_queue: &mut UploadQueueInitialized, names: I, - ) -> anyhow::Result> + ) -> Result, NotInitialized> where I: IntoIterator, { @@ -952,7 +1134,7 @@ impl RemoteTimelineClient { self: &Arc, compacted_from: &[Layer], compacted_to: &[ResidentLayer], - ) -> anyhow::Result<()> { + ) -> Result<(), NotInitialized> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -969,10 +1151,12 @@ impl RemoteTimelineClient { } /// Wait for all previously scheduled uploads/deletions to complete - pub(crate) async fn wait_completion(self: &Arc) -> anyhow::Result<()> { + pub(crate) async fn wait_completion(self: &Arc) -> Result<(), WaitCompletionError> { let receiver = { let mut guard = self.upload_queue.lock().unwrap(); - let upload_queue = guard.initialized_mut()?; + let upload_queue = guard + .initialized_mut() + .map_err(WaitCompletionError::NotInitialized)?; self.schedule_barrier0(upload_queue) }; @@ -981,9 +1165,9 @@ impl RemoteTimelineClient { async fn wait_completion0( mut receiver: tokio::sync::watch::Receiver<()>, - ) -> anyhow::Result<()> { + ) -> Result<(), WaitCompletionError> { if receiver.changed().await.is_err() { - anyhow::bail!("wait_completion aborted because upload queue was stopped"); + return Err(WaitCompletionError::UploadQueueShutDownOrStopped); } Ok(()) @@ -1311,6 +1495,18 @@ impl RemoteTimelineClient { .dirty .layer_metadata .drain() + .filter(|(_file_name, meta)| { + // Filter out layers that belonged to an ancestor shard. Since we are deleting the whole timeline from + // all shards anyway, we _could_ delete these, but + // - it creates a potential race if other shards are still + // using the layers while this shard deletes them. + // - it means that if we rolled back the shard split, the ancestor shards would be in a state where + // these timelines are present but corrupt (their index exists but some layers don't) + // + // These layers will eventually be cleaned up by the scrubber when it does physical GC. + meta.shard.shard_number == self.tenant_shard_id.shard_number + && meta.shard.shard_count == self.tenant_shard_id.shard_count + }) .map(|(file_name, meta)| { remote_layer_path( &self.tenant_shard_id.tenant_id, @@ -1366,12 +1562,13 @@ impl RemoteTimelineClient { // marker via its deleted_at attribute let latest_index = remaining .iter() - .filter(|p| { - p.object_name() + .filter(|o| { + o.key + .object_name() .map(|n| n.starts_with(IndexPart::FILE_NAME)) .unwrap_or(false) }) - .filter_map(|path| parse_remote_index_path(path.clone()).map(|gen| (path, gen))) + .filter_map(|o| parse_remote_index_path(o.key.clone()).map(|gen| (o.key.clone(), gen))) .max_by_key(|i| i.1) .map(|i| i.0.clone()) .unwrap_or( @@ -1382,14 +1579,12 @@ impl RemoteTimelineClient { let remaining_layers: Vec = remaining .into_iter() - .filter(|p| { - if p == &latest_index { - return false; + .filter_map(|o| { + if o.key == latest_index || o.key.object_name() == Some(INITDB_PRESERVED_PATH) { + None + } else { + Some(o.key) } - if p.object_name() == Some(INITDB_PRESERVED_PATH) { - return false; - } - true }) .inspect(|path| { if let Some(name) = path.object_name() { diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index d0385e4aee..a17b32c983 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -295,10 +295,11 @@ where }; } - for key in listing.keys { - let object_name = key + for object in listing.keys { + let object_name = object + .key .object_name() - .ok_or_else(|| anyhow::anyhow!("object name for key {key}"))?; + .ok_or_else(|| anyhow::anyhow!("object name for key {}", object.key))?; other_prefixes.insert(object_name.to_string()); } @@ -459,7 +460,7 @@ pub(crate) async fn download_index_part( // is <= our own. See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md let max_previous_generation = indices .into_iter() - .filter_map(parse_remote_index_path) + .filter_map(|o| parse_remote_index_path(o.key)) .filter(|g| g <= &my_generation) .max(); diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index b439df8edb..90453b1922 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -32,6 +32,10 @@ pub struct IndexPart { #[serde(skip_serializing_if = "Option::is_none")] pub deleted_at: Option, + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub archived_at: Option, + /// Per layer file name metadata, which can be present for a present or missing layer file. /// /// Older versions of `IndexPart` will not have this property or have only a part of metadata @@ -56,6 +60,9 @@ pub struct IndexPart { #[serde(default)] pub(crate) lineage: Lineage, + #[serde(skip_serializing_if = "Option::is_none", default)] + pub(crate) gc_blocking: Option, + /// Describes the kind of aux files stored in the timeline. /// /// The value is modified during file ingestion when the latest wanted value communicated via tenant config is applied if it is acceptable. @@ -80,10 +87,12 @@ impl IndexPart { /// - 5: lineage was added /// - 6: last_aux_file_policy is added. /// - 7: metadata_bytes is no longer written, but still read - const LATEST_VERSION: usize = 7; + /// - 8: added `archived_at` + /// - 9: +gc_blocking + const LATEST_VERSION: usize = 9; // Versions we may see when reading from a bucket. - pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7]; + pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9]; pub const FILE_NAME: &'static str = "index_part.json"; @@ -94,7 +103,9 @@ impl IndexPart { disk_consistent_lsn: metadata.disk_consistent_lsn(), metadata, deleted_at: None, + archived_at: None, lineage: Default::default(), + gc_blocking: None, last_aux_file_policy: None, } } @@ -245,6 +256,64 @@ impl Lineage { } } +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub(crate) struct GcBlocking { + pub(crate) started_at: NaiveDateTime, + pub(crate) reasons: enumset::EnumSet, +} + +#[derive(Debug, enumset::EnumSetType, serde::Serialize, serde::Deserialize)] +#[enumset(serialize_repr = "list")] +pub(crate) enum GcBlockingReason { + Manual, + DetachAncestor, +} + +impl GcBlocking { + pub(super) fn started_now_for(reason: GcBlockingReason) -> Self { + GcBlocking { + started_at: chrono::Utc::now().naive_utc(), + reasons: enumset::EnumSet::only(reason), + } + } + + /// Returns true if the given reason is one of the reasons why the gc is blocked. + pub(crate) fn blocked_by(&self, reason: GcBlockingReason) -> bool { + self.reasons.contains(reason) + } + + /// Returns a version of self with the given reason. + pub(super) fn with_reason(&self, reason: GcBlockingReason) -> Self { + assert!(!self.blocked_by(reason)); + let mut reasons = self.reasons; + reasons.insert(reason); + + Self { + started_at: self.started_at, + reasons, + } + } + + /// Returns a version of self without the given reason. Assumption is that if + /// there are no more reasons, we can unblock the gc by returning `None`. + pub(super) fn without_reason(&self, reason: GcBlockingReason) -> Option { + assert!(self.blocked_by(reason)); + + if self.reasons.len() == 1 { + None + } else { + let mut reasons = self.reasons; + assert!(reasons.remove(reason)); + assert!(!reasons.is_empty()); + + Some(Self { + started_at: self.started_at, + reasons, + }) + } + } +} + #[cfg(test)] mod tests { use super::*; @@ -284,7 +353,9 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: None, + archived_at: None, lineage: Lineage::default(), + gc_blocking: None, last_aux_file_policy: None, }; @@ -326,7 +397,9 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: None, + archived_at: None, lineage: Lineage::default(), + gc_blocking: None, last_aux_file_policy: None, }; @@ -369,7 +442,9 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), + archived_at: None, lineage: Lineage::default(), + gc_blocking: None, last_aux_file_policy: None, }; @@ -415,7 +490,9 @@ mod tests { ]) .unwrap(), deleted_at: None, + archived_at: None, lineage: Lineage::default(), + gc_blocking: None, last_aux_file_policy: None, }; @@ -456,7 +533,9 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), + archived_at: None, lineage: Lineage::default(), + gc_blocking: None, last_aux_file_policy: None, }; @@ -496,11 +575,13 @@ mod tests { disk_consistent_lsn: Lsn::from_str("0/15A7618").unwrap(), metadata: TimelineMetadata::from_bytes(&[226,88,25,241,0,46,0,4,0,0,0,0,1,90,118,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,78,244,32,0,0,0,0,1,78,244,32,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: None, + archived_at: None, lineage: Lineage { reparenting_history_truncated: false, reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()], original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))), }, + gc_blocking: None, last_aux_file_policy: None, }; @@ -545,11 +626,13 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), + archived_at: None, lineage: Lineage { reparenting_history_truncated: false, reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()], original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))), }, + gc_blocking: None, last_aux_file_policy: Some(AuxFilePolicy::V2), }; @@ -603,7 +686,9 @@ mod tests { 14, ).with_recalculated_checksum().unwrap(), deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), + archived_at: None, lineage: Default::default(), + gc_blocking: None, last_aux_file_policy: Default::default(), }; @@ -611,6 +696,125 @@ mod tests { assert_eq!(part, expected); } + #[test] + fn v8_indexpart_is_parsed() { + let example = r#"{ + "version": 8, + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } + }, + "disk_consistent_lsn":"0/16960E8", + "metadata": { + "disk_consistent_lsn": "0/16960E8", + "prev_record_lsn": "0/1696070", + "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e", + "ancestor_lsn": "0/0", + "latest_gc_cutoff_lsn": "0/1696070", + "initdb_lsn": "0/1696070", + "pg_version": 14 + }, + "deleted_at": "2023-07-31T09:00:00.123", + "archived_at": "2023-04-29T09:00:00.123" + }"#; + + let expected = IndexPart { + version: 8, + layer_metadata: HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { + file_size: 25600000, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { + file_size: 9007199254741001, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata: TimelineMetadata::new( + Lsn::from_str("0/16960E8").unwrap(), + Some(Lsn::from_str("0/1696070").unwrap()), + Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()), + Lsn::INVALID, + Lsn::from_str("0/1696070").unwrap(), + Lsn::from_str("0/1696070").unwrap(), + 14, + ).with_recalculated_checksum().unwrap(), + deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), + archived_at: Some(parse_naive_datetime("2023-04-29T09:00:00.123000000")), + lineage: Default::default(), + gc_blocking: None, + last_aux_file_policy: Default::default(), + }; + + let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + + #[test] + fn v9_indexpart_is_parsed() { + let example = r#"{ + "version": 9, + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } + }, + "disk_consistent_lsn":"0/16960E8", + "metadata": { + "disk_consistent_lsn": "0/16960E8", + "prev_record_lsn": "0/1696070", + "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e", + "ancestor_lsn": "0/0", + "latest_gc_cutoff_lsn": "0/1696070", + "initdb_lsn": "0/1696070", + "pg_version": 14 + }, + "gc_blocking": { + "started_at": "2024-07-19T09:00:00.123", + "reasons": ["DetachAncestor"] + } + }"#; + + let expected = IndexPart { + version: 9, + layer_metadata: HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { + file_size: 25600000, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { + file_size: 9007199254741001, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata: TimelineMetadata::new( + Lsn::from_str("0/16960E8").unwrap(), + Some(Lsn::from_str("0/1696070").unwrap()), + Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()), + Lsn::INVALID, + Lsn::from_str("0/1696070").unwrap(), + Lsn::from_str("0/1696070").unwrap(), + 14, + ).with_recalculated_checksum().unwrap(), + deleted_at: None, + lineage: Default::default(), + gc_blocking: Some(GcBlocking { + started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"), + reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]), + }), + last_aux_file_policy: Default::default(), + archived_at: None, + }; + + let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + fn parse_naive_datetime(s: &str) -> NaiveDateTime { chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f").unwrap() } diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index f931341aca..04f89db401 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -8,6 +8,9 @@ mod layer_desc; mod layer_name; pub mod merge_iterator; +#[cfg(test)] +pub mod split_writer; + use crate::context::{AccessStatsBehavior, RequestContext}; use crate::repository::Value; use crate::walrecord::NeonWalRecord; @@ -432,39 +435,18 @@ impl ReadableLayer { } } -/// Return value from [`Layer::get_value_reconstruct_data`] -#[derive(Clone, Copy, Debug)] -pub enum ValueReconstructResult { - /// Got all the data needed to reconstruct the requested page - Complete, - /// This layer didn't contain all the required data, the caller should look up - /// the predecessor layer at the returned LSN and collect more data from there. - Continue, - - /// This layer didn't contain data needed to reconstruct the page version at - /// the returned LSN. This is usually considered an error, but might be OK - /// in some circumstances. - Missing, -} - /// Layers contain a hint indicating whether they are likely to be used for reads. This is a hint rather /// than an authoritative value, so that we do not have to update it synchronously when changing the visibility /// of layers (for example when creating a branch that makes some previously covered layers visible). It should /// be used for cache management but not for correctness-critical checks. -#[derive(Default, Debug, Clone, PartialEq, Eq)] -pub(crate) enum LayerVisibilityHint { +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum LayerVisibilityHint { /// A Visible layer might be read while serving a read, because there is not an image layer between it /// and a readable LSN (the tip of the branch or a child's branch point) Visible, /// A Covered layer probably won't be read right now, but _can_ be read in future if someone creates /// a branch or ephemeral endpoint at an LSN below the layer that covers this. - #[allow(unused)] Covered, - /// Calculating layer visibilty requires I/O, so until this has happened layers are loaded - /// in this state. Note that newly written layers may be called Visible immediately, this uninitialized - /// state is for when existing layers are constructed while loading a timeline. - #[default] - Uninitialized, } pub(crate) struct LayerAccessStats(std::sync::atomic::AtomicU64); @@ -557,19 +539,25 @@ impl LayerAccessStats { self.record_residence_event_at(SystemTime::now()) } - pub(crate) fn record_access_at(&self, now: SystemTime) { + fn record_access_at(&self, now: SystemTime) -> bool { let (mut mask, mut value) = Self::to_low_res_timestamp(Self::ATIME_SHIFT, now); // A layer which is accessed must be visible. mask |= 0x1 << Self::VISIBILITY_SHIFT; value |= 0x1 << Self::VISIBILITY_SHIFT; - self.write_bits(mask, value); + let old_bits = self.write_bits(mask, value); + !matches!( + self.decode_visibility(old_bits), + LayerVisibilityHint::Visible + ) } - pub(crate) fn record_access(&self, ctx: &RequestContext) { + /// Returns true if we modified the layer's visibility to set it to Visible implicitly + /// as a result of this access + pub(crate) fn record_access(&self, ctx: &RequestContext) -> bool { if ctx.access_stats_behavior() == AccessStatsBehavior::Skip { - return; + return false; } self.record_access_at(SystemTime::now()) @@ -626,23 +614,30 @@ impl LayerAccessStats { } } - pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) { - let value = match visibility { - LayerVisibilityHint::Visible => 0x1 << Self::VISIBILITY_SHIFT, - LayerVisibilityHint::Covered | LayerVisibilityHint::Uninitialized => 0x0, - }; - - self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value); - } - - pub(crate) fn visibility(&self) -> LayerVisibilityHint { - let read = self.0.load(std::sync::atomic::Ordering::Relaxed); - match (read >> Self::VISIBILITY_SHIFT) & 0x1 { + /// Helper for extracting the visibility hint from the literal value of our inner u64 + fn decode_visibility(&self, bits: u64) -> LayerVisibilityHint { + match (bits >> Self::VISIBILITY_SHIFT) & 0x1 { 1 => LayerVisibilityHint::Visible, 0 => LayerVisibilityHint::Covered, _ => unreachable!(), } } + + /// Returns the old value which has been replaced + pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) -> LayerVisibilityHint { + let value = match visibility { + LayerVisibilityHint::Visible => 0x1 << Self::VISIBILITY_SHIFT, + LayerVisibilityHint::Covered => 0x0, + }; + + let old_bits = self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value); + self.decode_visibility(old_bits) + } + + pub(crate) fn visibility(&self) -> LayerVisibilityHint { + let read = self.0.load(std::sync::atomic::Ordering::Relaxed); + self.decode_visibility(read) + } } /// Get a layer descriptor from a layer. diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 586a7b7836..f4e965b99a 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -36,13 +36,12 @@ use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, Fi use crate::tenant::disk_btree::{ DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection, }; -use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner, }; -use crate::tenant::{PageReconstructError, Timeline}; +use crate::tenant::PageReconstructError; use crate::virtual_file::{self, VirtualFile}; use crate::{walrecord, TEMP_FILE_SUFFIX}; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; @@ -72,10 +71,7 @@ use utils::{ lsn::Lsn, }; -use super::{ - AsLayerDesc, LayerAccessStats, LayerName, PersistentLayerDesc, ResidentLayer, - ValuesReconstructState, -}; +use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState}; /// /// Header stored in the beginning of the file @@ -200,7 +196,6 @@ impl DeltaKey { pub struct DeltaLayer { path: Utf8PathBuf, pub desc: PersistentLayerDesc, - access_stats: LayerAccessStats, inner: OnceCell>, } @@ -299,7 +294,6 @@ impl DeltaLayer { /// not loaded already. /// async fn load(&self, ctx: &RequestContext) -> Result<&Arc> { - self.access_stats.record_access(ctx); // Quick exit if already loaded self.inner .get_or_try_init(|| self.load_inner(ctx)) @@ -307,12 +301,10 @@ impl DeltaLayer { .with_context(|| format!("Failed to load delta layer {}", self.path())) } - async fn load_inner(&self, ctx: &RequestContext) -> Result> { + async fn load_inner(&self, ctx: &RequestContext) -> anyhow::Result> { let path = self.path(); - let loaded = DeltaLayerInner::load(&path, None, None, ctx) - .await - .and_then(|res| res)?; + let loaded = DeltaLayerInner::load(&path, None, None, ctx).await?; // not production code let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap(); @@ -352,7 +344,6 @@ impl DeltaLayer { summary.lsn_range, metadata.len(), ), - access_stats: Default::default(), inner: OnceCell::new(), }) } @@ -375,7 +366,6 @@ impl DeltaLayer { /// 3. Call `finish`. /// struct DeltaLayerWriterInner { - conf: &'static PageServerConf, pub path: Utf8PathBuf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, @@ -386,6 +376,9 @@ struct DeltaLayerWriterInner { tree: DiskBtreeBuilder, blob_writer: BlobWriter, + + // Number of key-lsns in the layer. + num_keys: usize, } impl DeltaLayerWriterInner { @@ -419,7 +412,6 @@ impl DeltaLayerWriterInner { let tree_builder = DiskBtreeBuilder::new(block_buf); Ok(Self { - conf, path, timeline_id, tenant_shard_id, @@ -427,6 +419,7 @@ impl DeltaLayerWriterInner { lsn_range, tree: tree_builder, blob_writer, + num_keys: 0, }) } @@ -469,7 +462,7 @@ impl DeltaLayerWriterInner { .write_blob_maybe_compressed(val, ctx, compression) .await; let off = match res { - Ok(off) => off, + Ok((off, _)) => off, Err(e) => return (val, Err(anyhow::anyhow!(e))), }; @@ -477,6 +470,9 @@ impl DeltaLayerWriterInner { let delta_key = DeltaKey::from_key_lsn(&key, lsn); let res = self.tree.append(&delta_key.0, blob_ref.0); + + self.num_keys += 1; + (val, res.map_err(|e| anyhow::anyhow!(e))) } @@ -490,11 +486,10 @@ impl DeltaLayerWriterInner { async fn finish( self, key_end: Key, - timeline: &Arc, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { let temp_path = self.path.clone(); - let result = self.finish0(key_end, timeline, ctx).await; + let result = self.finish0(key_end, ctx).await; if result.is_err() { tracing::info!(%temp_path, "cleaning up temporary file after error during writing"); if let Err(e) = std::fs::remove_file(&temp_path) { @@ -507,9 +502,8 @@ impl DeltaLayerWriterInner { async fn finish0( self, key_end: Key, - timeline: &Arc, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; @@ -574,11 +568,9 @@ impl DeltaLayerWriterInner { // fsync the file file.sync_all().await?; - let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?; + trace!("created delta layer {}", self.path); - trace!("created delta layer {}", layer.local_path()); - - Ok(layer) + Ok((desc, self.path)) } } @@ -679,14 +671,20 @@ impl DeltaLayerWriter { pub(crate) async fn finish( mut self, key_end: Key, - timeline: &Arc, ctx: &RequestContext, - ) -> anyhow::Result { - self.inner - .take() - .unwrap() - .finish(key_end, timeline, ctx) - .await + ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { + self.inner.take().unwrap().finish(key_end, ctx).await + } + + #[cfg(test)] + pub(crate) fn num_keys(&self) -> usize { + self.inner.as_ref().unwrap().num_keys + } + + #[cfg(test)] + pub(crate) fn estimated_size(&self) -> u64 { + let inner = self.inner.as_ref().unwrap(); + inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64 } } @@ -760,27 +758,24 @@ impl DeltaLayerInner { &self.layer_lsn_range } - /// Returns nested result following Result, Critical>: - /// - inner has the success or transient failure - /// - outer has the permanent failure pub(super) async fn load( path: &Utf8Path, summary: Option, max_vectored_read_bytes: Option, ctx: &RequestContext, - ) -> Result, anyhow::Error> { - let file = match VirtualFile::open(path, ctx).await { - Ok(file) => file, - Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))), - }; + ) -> anyhow::Result { + let file = VirtualFile::open(path, ctx) + .await + .context("open layer file")?; + let file_id = page_cache::next_file_id(); let block_reader = FileBlockReader::new(&file, file_id); - let summary_blk = match block_reader.read_blk(0, ctx).await { - Ok(blk) => blk, - Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))), - }; + let summary_blk = block_reader + .read_blk(0, ctx) + .await + .context("read first block")?; // TODO: this should be an assertion instead; see ImageLayerInner::load let actual_summary = @@ -802,7 +797,7 @@ impl DeltaLayerInner { } } - Ok(Ok(DeltaLayerInner { + Ok(DeltaLayerInner { file, file_id, index_start_blk: actual_summary.index_start_blk, @@ -810,96 +805,7 @@ impl DeltaLayerInner { max_vectored_read_bytes, layer_key_range: actual_summary.key_range, layer_lsn_range: actual_summary.lsn_range, - })) - } - - pub(super) async fn get_value_reconstruct_data( - &self, - key: Key, - lsn_range: Range, - reconstruct_state: &mut ValueReconstructState, - ctx: &RequestContext, - ) -> anyhow::Result { - let mut need_image = true; - // Scan the page versions backwards, starting from `lsn`. - let block_reader = FileBlockReader::new(&self.file, self.file_id); - let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( - self.index_start_blk, - self.index_root_blk, - &block_reader, - ); - let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1)); - - let mut offsets: Vec<(Lsn, u64)> = Vec::new(); - - tree_reader - .visit( - &search_key.0, - VisitDirection::Backwards, - |key, value| { - let blob_ref = BlobRef(value); - if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] { - return false; - } - let entry_lsn = DeltaKey::extract_lsn_from_buf(key); - if entry_lsn < lsn_range.start { - return false; - } - offsets.push((entry_lsn, blob_ref.pos())); - - !blob_ref.will_init() - }, - &RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::DeltaLayerBtreeNode) - .build(), - ) - .await?; - - let ctx = &RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::DeltaLayerValue) - .build(); - - // Ok, 'offsets' now contains the offsets of all the entries we need to read - let cursor = block_reader.block_cursor(); - let mut buf = Vec::new(); - for (entry_lsn, pos) in offsets { - cursor - .read_blob_into_buf(pos, &mut buf, ctx) - .await - .with_context(|| { - format!("Failed to read blob from virtual file {}", self.file.path) - })?; - let val = Value::des(&buf).with_context(|| { - format!( - "Failed to deserialize file blob from virtual file {}", - self.file.path - ) - })?; - match val { - Value::Image(img) => { - reconstruct_state.img = Some((entry_lsn, img)); - need_image = false; - break; - } - Value::WalRecord(rec) => { - let will_init = rec.will_init(); - reconstruct_state.records.push((entry_lsn, rec)); - if will_init { - // This WAL record initializes the page, so no need to go further back - need_image = false; - break; - } - } - } - } - - // If an older page image is needed to reconstruct the page, let the - // caller know. - if need_image { - Ok(ValueReconstructResult::Continue) - } else { - Ok(ValueReconstructResult::Complete) - } + }) } // Look up the keys in the provided keyspace and update @@ -1674,8 +1580,9 @@ pub(crate) mod test { use super::*; use crate::repository::Value; use crate::tenant::harness::TIMELINE_ID; + use crate::tenant::storage_layer::{Layer, ResidentLayer}; use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; - use crate::tenant::Tenant; + use crate::tenant::{Tenant, Timeline}; use crate::{ context::DownloadBehavior, task_mgr::TaskKind, @@ -1969,9 +1876,8 @@ pub(crate) mod test { res?; } - let resident = writer - .finish(entries_meta.key_range.end, &timeline, &ctx) - .await?; + let (desc, path) = writer.finish(entries_meta.key_range.end, &ctx).await?; + let resident = Layer::finish_creating(harness.conf, &timeline, desc, &path)?; let inner = resident.get_as_delta(&ctx).await?; @@ -2051,6 +1957,7 @@ pub(crate) mod test { .await .likely_resident_layers() .next() + .cloned() .unwrap(); { @@ -2125,7 +2032,8 @@ pub(crate) mod test { .read() .await .likely_resident_layers() - .find(|x| x != &initdb_layer) + .find(|&x| x != &initdb_layer) + .cloned() .unwrap(); // create a copy for the timeline, so we don't overwrite the file @@ -2160,7 +2068,8 @@ pub(crate) mod test { .await .unwrap(); - let copied_layer = writer.finish(Key::MAX, &branch, ctx).await.unwrap(); + let (desc, path) = writer.finish(Key::MAX, ctx).await.unwrap(); + let copied_layer = Layer::finish_creating(tenant.conf, &branch, desc, &path).unwrap(); copied_layer.get_as_delta(ctx).await.unwrap(); @@ -2288,7 +2197,9 @@ pub(crate) mod test { for (key, lsn, value) in deltas { writer.put_value(key, lsn, value, ctx).await?; } - let delta_layer = writer.finish(key_end, tline, ctx).await?; + + let (desc, path) = writer.finish(key_end, ctx).await?; + let delta_layer = Layer::finish_creating(tenant.conf, tline, desc, &path)?; Ok::<_, anyhow::Error>(delta_layer) } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index e5e7f71928..16ba0fda94 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -32,9 +32,6 @@ use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader}; use crate::tenant::disk_btree::{ DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection, }; -use crate::tenant::storage_layer::{ - LayerAccessStats, ValueReconstructResult, ValueReconstructState, -}; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, @@ -137,7 +134,6 @@ pub struct ImageLayer { pub desc: PersistentLayerDesc, // This entry contains an image of all pages as of this LSN, should be the same as desc.lsn pub lsn: Lsn, - access_stats: LayerAccessStats, inner: OnceCell, } @@ -255,7 +251,6 @@ impl ImageLayer { /// not loaded already. /// async fn load(&self, ctx: &RequestContext) -> Result<&ImageLayerInner> { - self.access_stats.record_access(ctx); self.inner .get_or_try_init(|| self.load_inner(ctx)) .await @@ -265,9 +260,8 @@ impl ImageLayer { async fn load_inner(&self, ctx: &RequestContext) -> Result { let path = self.path(); - let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx) - .await - .and_then(|res| res)?; + let loaded = + ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx).await?; // not production code let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap(); @@ -307,7 +301,6 @@ impl ImageLayer { metadata.len(), ), // Now we assume image layer ALWAYS covers the full range. This may change in the future. lsn: summary.lsn, - access_stats: Default::default(), inner: OnceCell::new(), }) } @@ -385,17 +378,16 @@ impl ImageLayerInner { summary: Option, max_vectored_read_bytes: Option, ctx: &RequestContext, - ) -> Result, anyhow::Error> { - let file = match VirtualFile::open(path, ctx).await { - Ok(file) => file, - Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))), - }; + ) -> anyhow::Result { + let file = VirtualFile::open(path, ctx) + .await + .context("open layer file")?; let file_id = page_cache::next_file_id(); let block_reader = FileBlockReader::new(&file, file_id); - let summary_blk = match block_reader.read_blk(0, ctx).await { - Ok(blk) => blk, - Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))), - }; + let summary_blk = block_reader + .read_blk(0, ctx) + .await + .context("read first block")?; // length is the only way how this could fail, so it's not actually likely at all unless // read_blk returns wrong sized block. @@ -420,7 +412,7 @@ impl ImageLayerInner { } } - Ok(Ok(ImageLayerInner { + Ok(ImageLayerInner { index_start_blk: actual_summary.index_start_blk, index_root_blk: actual_summary.index_root_blk, lsn, @@ -428,47 +420,7 @@ impl ImageLayerInner { file_id, max_vectored_read_bytes, key_range: actual_summary.key_range, - })) - } - - pub(super) async fn get_value_reconstruct_data( - &self, - key: Key, - reconstruct_state: &mut ValueReconstructState, - ctx: &RequestContext, - ) -> anyhow::Result { - let block_reader = FileBlockReader::new(&self.file, self.file_id); - let tree_reader = - DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader); - - let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; - key.write_to_byte_slice(&mut keybuf); - if let Some(offset) = tree_reader - .get( - &keybuf, - &RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::ImageLayerBtreeNode) - .build(), - ) - .await? - { - let blob = block_reader - .block_cursor() - .read_blob( - offset, - &RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::ImageLayerValue) - .build(), - ) - .await - .with_context(|| format!("failed to read value from offset {}", offset))?; - let value = Bytes::from(blob); - - reconstruct_state.img = Some((self.lsn, value)); - Ok(ValueReconstructResult::Complete) - } else { - Ok(ValueReconstructResult::Missing) - } + }) } // Look up the keys in the provided keyspace and update @@ -736,11 +688,29 @@ struct ImageLayerWriterInner { // Total uncompressed bytes passed into put_image uncompressed_bytes: u64, + // Like `uncompressed_bytes`, + // but only of images we might consider for compression + uncompressed_bytes_eligible: u64, + + // Like `uncompressed_bytes`, but only of images + // where we have chosen their compressed form + uncompressed_bytes_chosen: u64, + + // Number of keys in the layer. + num_keys: usize, + blob_writer: BlobWriter, tree: DiskBtreeBuilder, + + #[cfg_attr(not(feature = "testing"), allow(dead_code))] + last_written_key: Key, } impl ImageLayerWriterInner { + fn size(&self) -> u64 { + self.tree.borrow_writer().size() + self.blob_writer.size() + } + /// /// Start building a new image layer. /// @@ -792,6 +762,10 @@ impl ImageLayerWriterInner { tree: tree_builder, blob_writer, uncompressed_bytes: 0, + uncompressed_bytes_eligible: 0, + uncompressed_bytes_chosen: 0, + num_keys: 0, + last_written_key: Key::MIN, }; Ok(writer) @@ -810,18 +784,33 @@ impl ImageLayerWriterInner { ) -> anyhow::Result<()> { ensure!(self.key_range.contains(&key)); let compression = self.conf.image_compression; - self.uncompressed_bytes += img.len() as u64; + let uncompressed_len = img.len() as u64; + self.uncompressed_bytes += uncompressed_len; + self.num_keys += 1; let (_img, res) = self .blob_writer .write_blob_maybe_compressed(img, ctx, compression) .await; // TODO: re-use the buffer for `img` further upstack - let off = res?; + let (off, compression_info) = res?; + if compression_info.compressed_size.is_some() { + // The image has been considered for compression at least + self.uncompressed_bytes_eligible += uncompressed_len; + } + if compression_info.written_compressed { + // The image has been compressed + self.uncompressed_bytes_chosen += uncompressed_len; + } let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; key.write_to_byte_slice(&mut keybuf); self.tree.append(&keybuf, off)?; + #[cfg(feature = "testing")] + { + self.last_written_key = key; + } + Ok(()) } @@ -832,6 +821,7 @@ impl ImageLayerWriterInner { self, timeline: &Arc, ctx: &RequestContext, + end_key: Option, ) -> anyhow::Result { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; @@ -839,6 +829,9 @@ impl ImageLayerWriterInner { // Calculate compression ratio let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES.inc_by(self.uncompressed_bytes); + crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED + .inc_by(self.uncompressed_bytes_eligible); + crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN.inc_by(self.uncompressed_bytes_chosen); crate::metrics::COMPRESSION_IMAGE_OUTPUT_BYTES.inc_by(compressed_size); let mut file = self.blob_writer.into_inner(); @@ -879,11 +872,23 @@ impl ImageLayerWriterInner { let desc = PersistentLayerDesc::new_img( self.tenant_shard_id, self.timeline_id, - self.key_range.clone(), + if let Some(end_key) = end_key { + self.key_range.start..end_key + } else { + self.key_range.clone() + }, self.lsn, metadata.len(), ); + #[cfg(feature = "testing")] + if let Some(end_key) = end_key { + assert!( + self.last_written_key < end_key, + "written key violates end_key range" + ); + } + // Note: Because we open the file in write-only mode, we cannot // reuse the same VirtualFile for reading later. That's why we don't // set inner.file here. The first read will have to re-open it. @@ -960,6 +965,18 @@ impl ImageLayerWriter { self.inner.as_mut().unwrap().put_image(key, img, ctx).await } + #[cfg(test)] + /// Estimated size of the image layer. + pub(crate) fn estimated_size(&self) -> u64 { + let inner = self.inner.as_ref().unwrap(); + inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64 + } + + #[cfg(test)] + pub(crate) fn num_keys(&self) -> usize { + self.inner.as_ref().unwrap().num_keys + } + /// /// Finish writing the image layer. /// @@ -968,7 +985,26 @@ impl ImageLayerWriter { timeline: &Arc, ctx: &RequestContext, ) -> anyhow::Result { - self.inner.take().unwrap().finish(timeline, ctx).await + self.inner.take().unwrap().finish(timeline, ctx, None).await + } + + #[cfg(test)] + /// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive. + pub(super) async fn finish_with_end_key( + mut self, + timeline: &Arc, + end_key: Key, + ctx: &RequestContext, + ) -> anyhow::Result { + self.inner + .take() + .unwrap() + .finish(timeline, ctx, Some(end_key)) + .await + } + + pub(crate) fn size(&self) -> u64 { + self.inner.as_ref().unwrap().size() } } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index f9010ae8a6..57d93feaaf 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -10,11 +10,11 @@ use crate::page_cache::PAGE_SZ; use crate::repository::{Key, Value}; use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef}; use crate::tenant::ephemeral_file::EphemeralFile; -use crate::tenant::storage_layer::ValueReconstructResult; use crate::tenant::timeline::GetVectoredError; -use crate::tenant::{PageReconstructError, Timeline}; +use crate::tenant::PageReconstructError; use crate::{l0_flush, page_cache, walrecord}; -use anyhow::{anyhow, ensure, Result}; +use anyhow::{anyhow, Result}; +use camino::Utf8PathBuf; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; @@ -34,8 +34,7 @@ use std::sync::atomic::{AtomicU64, AtomicUsize}; use tokio::sync::{RwLock, RwLockWriteGuard}; use super::{ - DeltaLayerWriter, ResidentLayer, ValueReconstructSituation, ValueReconstructState, - ValuesReconstructState, + DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState, }; #[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)] @@ -55,9 +54,6 @@ pub struct InMemoryLayer { /// Writes are only allowed when this is `None`. pub(crate) end_lsn: OnceLock, - /// Used for traversal path. Cached representation of the in-memory layer before frozen. - local_path_str: Arc, - /// Used for traversal path. Cached representation of the in-memory layer after frozen. frozen_local_path_str: OnceLock>, @@ -248,12 +244,6 @@ impl InMemoryLayer { self.start_lsn..self.end_lsn_or_max() } - pub(crate) fn local_path_str(&self) -> &Arc { - self.frozen_local_path_str - .get() - .unwrap_or(&self.local_path_str) - } - /// debugging function to print out the contents of the layer /// /// this is likely completly unused @@ -303,60 +293,6 @@ impl InMemoryLayer { Ok(()) } - /// Look up given value in the layer. - pub(crate) async fn get_value_reconstruct_data( - &self, - key: Key, - lsn_range: Range, - reconstruct_state: &mut ValueReconstructState, - ctx: &RequestContext, - ) -> anyhow::Result { - ensure!(lsn_range.start >= self.start_lsn); - let mut need_image = true; - - let ctx = RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::InMemoryLayer) - .build(); - - let inner = self.inner.read().await; - - let reader = inner.file.block_cursor(); - - // Scan the page versions backwards, starting from `lsn`. - if let Some(vec_map) = inner.index.get(&key) { - let slice = vec_map.slice_range(lsn_range); - for (entry_lsn, pos) in slice.iter().rev() { - let buf = reader.read_blob(*pos, &ctx).await?; - let value = Value::des(&buf)?; - match value { - Value::Image(img) => { - reconstruct_state.img = Some((*entry_lsn, img)); - return Ok(ValueReconstructResult::Complete); - } - Value::WalRecord(rec) => { - let will_init = rec.will_init(); - reconstruct_state.records.push((*entry_lsn, rec)); - if will_init { - // This WAL record initializes the page, so no need to go further back - need_image = false; - break; - } - } - } - } - } - - // release lock on 'inner' - - // If an older page image is needed to reconstruct the page, let the - // caller know. - if need_image { - Ok(ValueReconstructResult::Continue) - } else { - Ok(ValueReconstructResult::Complete) - } - } - // Look up the keys in the provided keyspace and update // the reconstruct state with whatever is found. // @@ -449,20 +385,17 @@ impl InMemoryLayer { timeline_id: TimelineId, tenant_shard_id: TenantShardId, start_lsn: Lsn, + gate_guard: utils::sync::gate::GateGuard, ctx: &RequestContext, ) -> Result { trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); - let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, ctx).await?; + let file = + EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate_guard, ctx).await?; let key = InMemoryLayerFileId(file.page_cache_file_id()); Ok(InMemoryLayer { file_id: key, - local_path_str: { - let mut buf = String::new(); - inmem_layer_log_display(&mut buf, timeline_id, start_lsn, Lsn::MAX).unwrap(); - buf.into() - }, frozen_local_path_str: OnceLock::new(), conf, timeline_id, @@ -482,8 +415,7 @@ impl InMemoryLayer { /// Common subroutine of the public put_wal_record() and put_page_image() functions. /// Adds the page version to the in-memory tree - - pub(crate) async fn put_value( + pub async fn put_value( &self, key: Key, lsn: Lsn, @@ -548,8 +480,6 @@ impl InMemoryLayer { /// Records the end_lsn for non-dropped layers. /// `end_lsn` is exclusive pub async fn freeze(&self, end_lsn: Lsn) { - let inner = self.inner.write().await; - assert!( self.start_lsn < end_lsn, "{} >= {}", @@ -567,9 +497,13 @@ impl InMemoryLayer { }) .expect("frozen_local_path_str set only once"); - for vec_map in inner.index.values() { - for (lsn, _pos) in vec_map.as_slice() { - assert!(*lsn < end_lsn); + #[cfg(debug_assertions)] + { + let inner = self.inner.write().await; + for vec_map in inner.index.values() { + for (lsn, _pos) in vec_map.as_slice() { + assert!(*lsn < end_lsn); + } } } } @@ -579,12 +513,12 @@ impl InMemoryLayer { /// if there are no matching keys. /// /// Returns a new delta layer with all the same data as this in-memory layer - pub(crate) async fn write_to_disk( + pub async fn write_to_disk( &self, - timeline: &Arc, ctx: &RequestContext, key_range: Option>, - ) -> Result> { + l0_flush_global_state: &l0_flush::Inner, + ) -> Result> { // Grab the lock in read-mode. We hold it over the I/O, but because this // layer is not writeable anymore, no one should be trying to acquire the // write lock on it, so we shouldn't block anyone. There's one exception @@ -596,9 +530,8 @@ impl InMemoryLayer { // rare though, so we just accept the potential latency hit for now. let inner = self.inner.read().await; - let l0_flush_global_state = timeline.l0_flush_global_state.inner().clone(); use l0_flush::Inner; - let _concurrency_permit = match &*l0_flush_global_state { + let _concurrency_permit = match l0_flush_global_state { Inner::PageCached => None, Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await), }; @@ -628,7 +561,7 @@ impl InMemoryLayer { ) .await?; - match &*l0_flush_global_state { + match l0_flush_global_state { l0_flush::Inner::PageCached => { let ctx = RequestContextBuilder::extend(ctx) .page_content_kind(PageContentKind::InMemoryLayer) @@ -693,7 +626,7 @@ impl InMemoryLayer { } // MAX is used here because we identify L0 layers by full key range - let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, ctx).await?; + let (desc, path) = delta_layer_writer.finish(Key::MAX, ctx).await?; // Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()``. // @@ -705,6 +638,6 @@ impl InMemoryLayer { // we dirtied when writing to the filesystem have been flushed and marked !dirty. drop(_concurrency_permit); - Ok(Some(delta_layer)) + Ok(Some((desc, path))) } } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 1db3e7c675..83450d24bb 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -17,14 +17,14 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::repository::Key; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::task_mgr::TaskKind; -use crate::tenant::timeline::GetVectoredError; +use crate::tenant::timeline::{CompactionError, GetVectoredError}; use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline}; use super::delta_layer::{self, DeltaEntry}; use super::image_layer::{self}; use super::{ AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName, - PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, ValuesReconstructState, + LayerVisibilityHint, PersistentLayerDesc, ValuesReconstructState, }; use utils::generation::Generation; @@ -246,7 +246,7 @@ impl Layer { &timeline.generation, ); - let layer = LayerInner::new( + LayerInner::new( conf, timeline, local_path, @@ -254,14 +254,7 @@ impl Layer { Some(inner), timeline.generation, timeline.get_shard_index(), - ); - - // Newly created layers are marked visible by default: the usual case is that they were created to be read. - layer - .access_stats - .set_visibility(super::LayerVisibilityHint::Visible); - - layer + ) })); let downloaded = resident.expect("just initialized"); @@ -307,42 +300,6 @@ impl Layer { self.0.delete_on_drop(); } - /// Return data needed to reconstruct given page at LSN. - /// - /// It is up to the caller to collect more data from the previous layer and - /// perform WAL redo, if necessary. - /// - /// # Cancellation-Safety - /// - /// This method is cancellation-safe. - pub(crate) async fn get_value_reconstruct_data( - &self, - key: Key, - lsn_range: Range, - reconstruct_data: &mut ValueReconstructState, - ctx: &RequestContext, - ) -> anyhow::Result { - use anyhow::ensure; - - let layer = self.0.get_or_maybe_download(true, Some(ctx)).await?; - self.0.access_stats.record_access(ctx); - - if self.layer_desc().is_delta { - ensure!(lsn_range.start >= self.layer_desc().lsn_range.start); - ensure!(self.layer_desc().key_range.contains(&key)); - } else { - ensure!(self.layer_desc().key_range.contains(&key)); - ensure!(lsn_range.start >= self.layer_desc().image_layer_lsn()); - ensure!(lsn_range.end >= self.layer_desc().image_layer_lsn()); - } - - layer - .get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx) - .instrument(tracing::debug_span!("get_value_reconstruct_data", layer=%self)) - .await - .with_context(|| format!("get_value_reconstruct_data for layer {self}")) - } - pub(crate) async fn get_values_reconstruct_data( &self, keyspace: KeySpace, @@ -359,7 +316,7 @@ impl Layer { other => GetVectoredError::Other(anyhow::anyhow!(other)), })?; - self.0.access_stats.record_access(ctx); + self.record_access(ctx); layer .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx) @@ -426,7 +383,7 @@ impl Layer { } /// Downloads if necessary and creates a guard, which will keep this layer from being evicted. - pub(crate) async fn download_and_keep_resident(&self) -> anyhow::Result { + pub(crate) async fn download_and_keep_resident(&self) -> Result { let downloaded = self.0.get_or_maybe_download(true, None).await?; Ok(ResidentLayer { @@ -439,18 +396,18 @@ impl Layer { self.0.info(reset) } - pub(crate) fn access_stats(&self) -> &LayerAccessStats { - &self.0.access_stats + pub(crate) fn latest_activity(&self) -> SystemTime { + self.0.access_stats.latest_activity() + } + + pub(crate) fn visibility(&self) -> LayerVisibilityHint { + self.0.access_stats.visibility() } pub(crate) fn local_path(&self) -> &Utf8Path { &self.0.path } - pub(crate) fn debug_str(&self) -> &Arc { - &self.0.debug_str - } - pub(crate) fn metadata(&self) -> LayerFileMetadata { self.0.metadata() } @@ -493,13 +450,57 @@ impl Layer { } } } + + fn record_access(&self, ctx: &RequestContext) { + if self.0.access_stats.record_access(ctx) { + // Visibility was modified to Visible + tracing::info!( + "Layer {} became visible as a result of access", + self.0.desc.key() + ); + if let Some(tl) = self.0.timeline.upgrade() { + tl.metrics + .visible_physical_size_gauge + .add(self.0.desc.file_size) + } + } + } + + pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) { + let old_visibility = self.0.access_stats.set_visibility(visibility.clone()); + use LayerVisibilityHint::*; + match (old_visibility, visibility) { + (Visible, Covered) => { + // Subtract this layer's contribution to the visible size metric + if let Some(tl) = self.0.timeline.upgrade() { + debug_assert!( + tl.metrics.visible_physical_size_gauge.get() >= self.0.desc.file_size + ); + tl.metrics + .visible_physical_size_gauge + .sub(self.0.desc.file_size) + } + } + (Covered, Visible) => { + // Add this layer's contribution to the visible size metric + if let Some(tl) = self.0.timeline.upgrade() { + tl.metrics + .visible_physical_size_gauge + .add(self.0.desc.file_size) + } + } + (Covered, Covered) | (Visible, Visible) => { + // no change + } + } + } } /// The download-ness ([`DownloadedLayer`]) can be either resident or wanted evicted. /// /// However when we want something evicted, we cannot evict it right away as there might be current /// reads happening on it. For example: it has been searched from [`LayerMap::search`] but not yet -/// read with [`Layer::get_value_reconstruct_data`]. +/// read with [`Layer::get_values_reconstruct_data`]. /// /// [`LayerMap::search`]: crate::tenant::layer_map::LayerMap::search #[derive(Debug)] @@ -580,9 +581,6 @@ struct LayerInner { /// Full path to the file; unclear if this should exist anymore. path: Utf8PathBuf, - /// String representation of the layer, used for traversal id. - debug_str: Arc, - desc: PersistentLayerDesc, /// Timeline access is needed for remote timeline client and metrics. @@ -693,6 +691,16 @@ impl Drop for LayerInner { timeline.metrics.layer_count_image.dec(); timeline.metrics.layer_size_image.sub(self.desc.file_size); } + + if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) { + debug_assert!( + timeline.metrics.visible_physical_size_gauge.get() >= self.desc.file_size + ); + timeline + .metrics + .visible_physical_size_gauge + .sub(self.desc.file_size); + } } if !*self.wanted_deleted.get_mut() { @@ -801,11 +809,14 @@ impl LayerInner { timeline.metrics.layer_size_image.add(desc.file_size); } + // New layers are visible by default. This metric is later updated on drop or in set_visibility + timeline + .metrics + .visible_physical_size_gauge + .add(desc.file_size); + LayerInner { conf, - debug_str: { - format!("timelines/{}/{}", timeline.timeline_id, desc.layer_name()).into() - }, path: local_path, desc, timeline: Arc::downgrade(timeline), @@ -1651,8 +1662,9 @@ impl Drop for DownloadedLayer { } impl DownloadedLayer { - /// Initializes the `DeltaLayerInner` or `ImageLayerInner` within [`LayerKind`], or fails to - /// initialize it permanently. + /// Initializes the `DeltaLayerInner` or `ImageLayerInner` within [`LayerKind`]. + /// Failure to load the layer is sticky, i.e., future `get()` calls will return + /// the initial load failure immediately. /// /// `owner` parameter is a strong reference at the same `LayerInner` as the /// `DownloadedLayer::owner` would be when upgraded. Given how this method ends up called, @@ -1683,7 +1695,7 @@ impl DownloadedLayer { ctx, ) .await - .map(|res| res.map(LayerKind::Delta)) + .map(LayerKind::Delta) } else { let lsn = owner.desc.image_layer_lsn(); let summary = Some(image_layer::Summary::expected( @@ -1700,54 +1712,29 @@ impl DownloadedLayer { ctx, ) .await - .map(|res| res.map(LayerKind::Image)) + .map(LayerKind::Image) }; match res { - Ok(Ok(layer)) => Ok(Ok(layer)), - Ok(Err(transient)) => Err(transient), - Err(permanent) => { + Ok(layer) => Ok(layer), + Err(err) => { LAYER_IMPL_METRICS.inc_permanent_loading_failures(); - // TODO(#5815): we are not logging all errors, so temporarily log them **once** - // here as well - let permanent = permanent.context("load layer"); - tracing::error!("layer loading failed permanently: {permanent:#}"); - Ok(Err(permanent)) + // We log this message once over the lifetime of `Self` + // => Ok and good to log backtrace and path here. + tracing::error!( + "layer load failed, assuming permanent failure: {}: {err:?}", + owner.path + ); + Err(err) } } }; self.kind - .get_or_try_init(init) - // return transient errors using `?` - .await? + .get_or_init(init) + .await .as_ref() - .map_err(|e| { - // errors are not clonabled, cannot but stringify - // test_broken_timeline matches this string - anyhow::anyhow!("layer loading failed: {e:#}") - }) - } - - async fn get_value_reconstruct_data( - &self, - key: Key, - lsn_range: Range, - reconstruct_data: &mut ValueReconstructState, - owner: &Arc, - ctx: &RequestContext, - ) -> anyhow::Result { - use LayerKind::*; - - match self.get(owner, ctx).await? { - Delta(d) => { - d.get_value_reconstruct_data(key, lsn_range, reconstruct_data, ctx) - .await - } - Image(i) => { - i.get_value_reconstruct_data(key, reconstruct_data, ctx) - .await - } - } + // We already logged the full backtrace above, once. Don't repeat that here. + .map_err(|e| anyhow::anyhow!("layer load failed earlier: {e}")) } async fn get_values_reconstruct_data( @@ -1760,7 +1747,11 @@ impl DownloadedLayer { ) -> Result<(), GetVectoredError> { use LayerKind::*; - match self.get(owner, ctx).await.map_err(GetVectoredError::from)? { + match self + .get(owner, ctx) + .await + .map_err(GetVectoredError::Other)? + { Delta(d) => { d.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, ctx) .await @@ -1844,7 +1835,7 @@ impl ResidentLayer { // this is valid because the DownloadedLayer::kind is a OnceCell, not a // Mutex, so we cannot go and deinitialize the value with OnceCell::take // while it's being held. - owner.access_stats.record_access(ctx); + self.owner.record_access(ctx); delta_layer::DeltaLayerInner::load_keys(d, ctx) .await @@ -1862,12 +1853,24 @@ impl ResidentLayer { shard_identity: &ShardIdentity, writer: &mut ImageLayerWriter, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { use LayerKind::*; - match self.downloaded.get(&self.owner.0, ctx).await? { - Delta(_) => anyhow::bail!(format!("cannot filter() on a delta layer {self}")), - Image(i) => i.filter(shard_identity, writer, ctx).await, + match self + .downloaded + .get(&self.owner.0, ctx) + .await + .map_err(CompactionError::Other)? + { + Delta(_) => { + return Err(CompactionError::Other(anyhow::anyhow!(format!( + "cannot filter() on a delta layer {self}" + )))); + } + Image(i) => i + .filter(shard_identity, writer, ctx) + .await + .map_err(CompactionError::Other), } } diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index d5d2f748a9..bffd2db800 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -39,7 +39,7 @@ async fn smoke_test() { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.likely_resident_layers().collect::>() + layers.likely_resident_layers().cloned().collect::>() }; assert_eq!(layers.len(), 1); @@ -50,13 +50,26 @@ async fn smoke_test() { // all layers created at pageserver are like `layer`, initialized with strong // Arc. + let controlfile_keyspace = KeySpace { + ranges: vec![CONTROLFILE_KEY..CONTROLFILE_KEY.next()], + }; + let img_before = { - let mut data = ValueReconstructState::default(); + let mut data = ValuesReconstructState::default(); layer - .get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx) + .get_values_reconstruct_data( + controlfile_keyspace.clone(), + Lsn(0x10)..Lsn(0x11), + &mut data, + &ctx, + ) .await .unwrap(); - data.img + data.keys + .remove(&CONTROLFILE_KEY) + .expect("must be present") + .expect("should not error") + .img .take() .expect("tenant harness writes the control file") }; @@ -74,13 +87,24 @@ async fn smoke_test() { // on accesses when the layer is evicted, it will automatically be downloaded. let img_after = { - let mut data = ValueReconstructState::default(); + let mut data = ValuesReconstructState::default(); layer - .get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx) + .get_values_reconstruct_data( + controlfile_keyspace.clone(), + Lsn(0x10)..Lsn(0x11), + &mut data, + &ctx, + ) .instrument(download_span.clone()) .await .unwrap(); - data.img.take().unwrap() + data.keys + .remove(&CONTROLFILE_KEY) + .expect("must be present") + .expect("should not error") + .img + .take() + .expect("tenant harness writes the control file") }; assert_eq!(img_before, img_after); @@ -152,7 +176,7 @@ async fn smoke_test() { { let layers = &[layer]; let mut g = timeline.layers.write().await; - g.finish_gc_timeline(layers); + g.open_mut().unwrap().finish_gc_timeline(layers); // this just updates the remote_physical_size for demonstration purposes rtc.schedule_gc_update(layers).unwrap(); } @@ -192,7 +216,7 @@ async fn evict_and_wait_on_wanted_deleted() { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.likely_resident_layers().collect::>() + layers.likely_resident_layers().cloned().collect::>() }; assert_eq!(layers.len(), 1); @@ -236,7 +260,7 @@ async fn evict_and_wait_on_wanted_deleted() { // the deletion of the layer in remote_storage happens. { let mut layers = timeline.layers.write().await; - layers.finish_gc_timeline(&[layer]); + layers.open_mut().unwrap().finish_gc_timeline(&[layer]); } SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await; @@ -277,7 +301,7 @@ fn read_wins_pending_eviction() { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.likely_resident_layers().collect::>() + layers.likely_resident_layers().cloned().collect::>() }; assert_eq!(layers.len(), 1); @@ -409,7 +433,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.likely_resident_layers().collect::>() + layers.likely_resident_layers().cloned().collect::>() }; assert_eq!(layers.len(), 1); @@ -578,7 +602,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.likely_resident_layers().collect::>() + layers.likely_resident_layers().cloned().collect::>() }; assert_eq!(layers.len(), 1); @@ -658,7 +682,7 @@ async fn evict_and_wait_does_not_wait_for_download() { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.likely_resident_layers().collect::>() + layers.likely_resident_layers().cloned().collect::>() }; assert_eq!(layers.len(), 1); @@ -777,9 +801,9 @@ async fn eviction_cancellation_on_drop() { let (evicted_layer, not_evicted) = { let mut layers = { let mut guard = timeline.layers.write().await; - let layers = guard.likely_resident_layers().collect::>(); + let layers = guard.likely_resident_layers().cloned().collect::>(); // remove the layers from layermap - guard.finish_gc_timeline(&layers); + guard.open_mut().unwrap().finish_gc_timeline(&layers); layers }; @@ -828,9 +852,9 @@ async fn eviction_cancellation_on_drop() { #[test] #[cfg(target_arch = "x86_64")] fn layer_size() { - assert_eq!(std::mem::size_of::(), 8); - assert_eq!(std::mem::size_of::(), 104); - assert_eq!(std::mem::size_of::(), 312); + assert_eq!(size_of::(), 8); + assert_eq!(size_of::(), 104); + assert_eq!(size_of::(), 296); // it also has the utf8 path } diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs index bd765560e4..cbd18e650f 100644 --- a/pageserver/src/tenant/storage_layer/layer_desc.rs +++ b/pageserver/src/tenant/storage_layer/layer_desc.rs @@ -41,6 +41,20 @@ pub struct PersistentLayerKey { pub is_delta: bool, } +impl std::fmt::Display for PersistentLayerKey { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}..{} {}..{} is_delta={}", + self.key_range.start, + self.key_range.end, + self.lsn_range.start, + self.lsn_range.end, + self.is_delta + ) + } +} + impl PersistentLayerDesc { pub fn key(&self) -> PersistentLayerKey { PersistentLayerKey { diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index eb4a1f28a1..b4bd976033 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -204,9 +204,11 @@ impl<'a> IteratorWrapper<'a> { /// A merge iterator over delta/image layer iterators. When duplicated records are /// found, the iterator will not perform any deduplication, and the caller should handle /// these situation. By saying duplicated records, there are many possibilities: +/// /// * Two same delta at the same LSN. /// * Two same image at the same LSN. /// * Delta/image at the same LSN where the image has already applied the delta. +/// /// The iterator will always put the image before the delta. pub struct MergeIterator<'a> { heap: BinaryHeap>, diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs new file mode 100644 index 0000000000..d7bfe48c60 --- /dev/null +++ b/pageserver/src/tenant/storage_layer/split_writer.rs @@ -0,0 +1,454 @@ +use std::{ops::Range, sync::Arc}; + +use bytes::Bytes; +use pageserver_api::key::{Key, KEY_SIZE}; +use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId}; + +use crate::tenant::storage_layer::Layer; +use crate::{config::PageServerConf, context::RequestContext, repository::Value, tenant::Timeline}; + +use super::{DeltaLayerWriter, ImageLayerWriter, ResidentLayer}; + +/// An image writer that takes images and produces multiple image layers. The interface does not +/// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files +/// to be cleaned up) +#[must_use] +pub struct SplitImageLayerWriter { + inner: ImageLayerWriter, + target_layer_size: u64, + generated_layers: Vec, + conf: &'static PageServerConf, + timeline_id: TimelineId, + tenant_shard_id: TenantShardId, + lsn: Lsn, +} + +impl SplitImageLayerWriter { + pub async fn new( + conf: &'static PageServerConf, + timeline_id: TimelineId, + tenant_shard_id: TenantShardId, + start_key: Key, + lsn: Lsn, + target_layer_size: u64, + ctx: &RequestContext, + ) -> anyhow::Result { + Ok(Self { + target_layer_size, + inner: ImageLayerWriter::new( + conf, + timeline_id, + tenant_shard_id, + &(start_key..Key::MAX), + lsn, + ctx, + ) + .await?, + generated_layers: Vec::new(), + conf, + timeline_id, + tenant_shard_id, + lsn, + }) + } + + pub async fn put_image( + &mut self, + key: Key, + img: Bytes, + tline: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + // The current estimation is an upper bound of the space that the key/image could take + // because we did not consider compression in this estimation. The resulting image layer + // could be smaller than the target size. + let addition_size_estimation = KEY_SIZE as u64 + img.len() as u64; + if self.inner.num_keys() >= 1 + && self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size + { + let next_image_writer = ImageLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + &(key..Key::MAX), + self.lsn, + ctx, + ) + .await?; + let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer); + self.generated_layers.push( + prev_image_writer + .finish_with_end_key(tline, key, ctx) + .await?, + ); + } + self.inner.put_image(key, img, ctx).await + } + + pub(crate) async fn finish( + self, + tline: &Arc, + ctx: &RequestContext, + end_key: Key, + ) -> anyhow::Result> { + let Self { + mut generated_layers, + inner, + .. + } = self; + generated_layers.push(inner.finish_with_end_key(tline, end_key, ctx).await?); + Ok(generated_layers) + } + + /// When split writer fails, the caller should call this function and handle partially generated layers. + #[allow(dead_code)] + pub(crate) async fn take(self) -> anyhow::Result<(Vec, ImageLayerWriter)> { + Ok((self.generated_layers, self.inner)) + } +} + +/// A delta writer that takes key-lsn-values and produces multiple delta layers. The interface does not +/// guarantee atomicity (i.e., if the delta layer generation fails, there might be leftover files +/// to be cleaned up). +#[must_use] +pub struct SplitDeltaLayerWriter { + inner: DeltaLayerWriter, + target_layer_size: u64, + generated_layers: Vec, + conf: &'static PageServerConf, + timeline_id: TimelineId, + tenant_shard_id: TenantShardId, + lsn_range: Range, +} + +impl SplitDeltaLayerWriter { + pub async fn new( + conf: &'static PageServerConf, + timeline_id: TimelineId, + tenant_shard_id: TenantShardId, + start_key: Key, + lsn_range: Range, + target_layer_size: u64, + ctx: &RequestContext, + ) -> anyhow::Result { + Ok(Self { + target_layer_size, + inner: DeltaLayerWriter::new( + conf, + timeline_id, + tenant_shard_id, + start_key, + lsn_range.clone(), + ctx, + ) + .await?, + generated_layers: Vec::new(), + conf, + timeline_id, + tenant_shard_id, + lsn_range, + }) + } + + pub async fn put_value( + &mut self, + key: Key, + lsn: Lsn, + val: Value, + tline: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + // The current estimation is key size plus LSN size plus value size estimation. This is not an accurate + // number, and therefore the final layer size could be a little bit larger or smaller than the target. + let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */; + if self.inner.num_keys() >= 1 + && self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size + { + let next_delta_writer = DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + key, + self.lsn_range.clone(), + ctx, + ) + .await?; + let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer); + let (desc, path) = prev_delta_writer.finish(key, ctx).await?; + let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?; + self.generated_layers.push(delta_layer); + } + self.inner.put_value(key, lsn, val, ctx).await + } + + pub(crate) async fn finish( + self, + tline: &Arc, + ctx: &RequestContext, + end_key: Key, + ) -> anyhow::Result> { + let Self { + mut generated_layers, + inner, + .. + } = self; + + let (desc, path) = inner.finish(end_key, ctx).await?; + let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?; + generated_layers.push(delta_layer); + Ok(generated_layers) + } + + /// When split writer fails, the caller should call this function and handle partially generated layers. + #[allow(dead_code)] + pub(crate) async fn take(self) -> anyhow::Result<(Vec, DeltaLayerWriter)> { + Ok((self.generated_layers, self.inner)) + } +} + +#[cfg(test)] +mod tests { + use crate::{ + tenant::{ + harness::{TenantHarness, TIMELINE_ID}, + storage_layer::AsLayerDesc, + }, + DEFAULT_PG_VERSION, + }; + + use super::*; + + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + fn get_img(id: u32) -> Bytes { + format!("{id:064}").into() + } + + fn get_large_img() -> Bytes { + vec![0; 8192].into() + } + + #[tokio::test] + async fn write_one_image() { + let harness = TenantHarness::create("split_writer_write_one_image") + .await + .unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + let mut image_writer = SplitImageLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + get_key(0), + Lsn(0x18), + 4 * 1024 * 1024, + &ctx, + ) + .await + .unwrap(); + + let mut delta_writer = SplitDeltaLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + get_key(0), + Lsn(0x18)..Lsn(0x20), + 4 * 1024 * 1024, + &ctx, + ) + .await + .unwrap(); + + image_writer + .put_image(get_key(0), get_img(0), &tline, &ctx) + .await + .unwrap(); + let layers = image_writer + .finish(&tline, &ctx, get_key(10)) + .await + .unwrap(); + assert_eq!(layers.len(), 1); + + delta_writer + .put_value( + get_key(0), + Lsn(0x18), + Value::Image(get_img(0)), + &tline, + &ctx, + ) + .await + .unwrap(); + let layers = delta_writer + .finish(&tline, &ctx, get_key(10)) + .await + .unwrap(); + assert_eq!(layers.len(), 1); + } + + #[tokio::test] + async fn write_split() { + let harness = TenantHarness::create("split_writer_write_split") + .await + .unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + let mut image_writer = SplitImageLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + get_key(0), + Lsn(0x18), + 4 * 1024 * 1024, + &ctx, + ) + .await + .unwrap(); + let mut delta_writer = SplitDeltaLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + get_key(0), + Lsn(0x18)..Lsn(0x20), + 4 * 1024 * 1024, + &ctx, + ) + .await + .unwrap(); + const N: usize = 2000; + for i in 0..N { + let i = i as u32; + image_writer + .put_image(get_key(i), get_large_img(), &tline, &ctx) + .await + .unwrap(); + delta_writer + .put_value( + get_key(i), + Lsn(0x20), + Value::Image(get_large_img()), + &tline, + &ctx, + ) + .await + .unwrap(); + } + let image_layers = image_writer + .finish(&tline, &ctx, get_key(N as u32)) + .await + .unwrap(); + let delta_layers = delta_writer + .finish(&tline, &ctx, get_key(N as u32)) + .await + .unwrap(); + assert_eq!(image_layers.len(), N / 512 + 1); + assert_eq!(delta_layers.len(), N / 512 + 1); + for idx in 0..image_layers.len() { + assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN); + assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX); + assert_ne!(delta_layers[idx].layer_desc().key_range.start, Key::MIN); + assert_ne!(delta_layers[idx].layer_desc().key_range.end, Key::MAX); + if idx > 0 { + assert_eq!( + image_layers[idx - 1].layer_desc().key_range.end, + image_layers[idx].layer_desc().key_range.start + ); + assert_eq!( + delta_layers[idx - 1].layer_desc().key_range.end, + delta_layers[idx].layer_desc().key_range.start + ); + } + } + } + + #[tokio::test] + async fn write_large_img() { + let harness = TenantHarness::create("split_writer_write_large_img") + .await + .unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + let mut image_writer = SplitImageLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + get_key(0), + Lsn(0x18), + 4 * 1024, + &ctx, + ) + .await + .unwrap(); + + let mut delta_writer = SplitDeltaLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + get_key(0), + Lsn(0x18)..Lsn(0x20), + 4 * 1024, + &ctx, + ) + .await + .unwrap(); + + image_writer + .put_image(get_key(0), get_img(0), &tline, &ctx) + .await + .unwrap(); + image_writer + .put_image(get_key(1), get_large_img(), &tline, &ctx) + .await + .unwrap(); + let layers = image_writer + .finish(&tline, &ctx, get_key(10)) + .await + .unwrap(); + assert_eq!(layers.len(), 2); + + delta_writer + .put_value( + get_key(0), + Lsn(0x18), + Value::Image(get_img(0)), + &tline, + &ctx, + ) + .await + .unwrap(); + delta_writer + .put_value( + get_key(1), + Lsn(0x1A), + Value::Image(get_large_img()), + &tline, + &ctx, + ) + .await + .unwrap(); + let layers = delta_writer + .finish(&tline, &ctx, get_key(10)) + .await + .unwrap(); + assert_eq!(layers.len(), 2); + } +} diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 7f59e54eb7..b4706ea59d 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -210,24 +210,28 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { Duration::from_secs(10) } else { // Run compaction - if let Err(e) = tenant.compaction_iteration(&cancel, &ctx).await { - let wait_duration = backoff::exponential_backoff_duration_seconds( - error_run_count + 1, - 1.0, - MAX_BACKOFF_SECS, - ); - error_run_count += 1; - let wait_duration = Duration::from_secs_f64(wait_duration); - log_compaction_error( - &e, - error_run_count, - &wait_duration, - cancel.is_cancelled(), - ); - wait_duration - } else { - error_run_count = 0; - period + match tenant.compaction_iteration(&cancel, &ctx).await { + Err(e) => { + let wait_duration = backoff::exponential_backoff_duration_seconds( + error_run_count + 1, + 1.0, + MAX_BACKOFF_SECS, + ); + error_run_count += 1; + let wait_duration = Duration::from_secs_f64(wait_duration); + log_compaction_error( + &e, + error_run_count, + &wait_duration, + cancel.is_cancelled(), + ); + wait_duration + } + Ok(has_pending_task) => { + error_run_count = 0; + // schedule the next compaction immediately in case there is a pending compaction task + if has_pending_task { Duration::from_secs(0) } else { period } + } } }; @@ -403,9 +407,16 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { error_run_count += 1; let wait_duration = Duration::from_secs_f64(wait_duration); - error!( - "Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}", - ); + if matches!(e, crate::tenant::GcError::TimelineCancelled) { + // Timeline was cancelled during gc. We might either be in an event + // that affects the entire tenant (tenant deletion, pageserver shutdown), + // or in one that affects the timeline only (timeline deletion). + // Therefore, don't exit the loop. + info!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}"); + } else { + error!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}"); + } + wait_duration } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 178b707aa7..76dcb5645f 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3,6 +3,7 @@ pub(crate) mod compaction; pub mod delete; pub(crate) mod detach_ancestor; mod eviction_task; +pub(crate) mod handle; mod init; pub mod layer_manager; pub(crate) mod logical_size; @@ -17,11 +18,12 @@ use camino::Utf8Path; use chrono::{DateTime, Utc}; use enumset::EnumSet; use fail::fail_point; +use handle::ShardTimelineId; use once_cell::sync::Lazy; use pageserver_api::{ key::{ - AUX_FILES_KEY, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, - NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE, + KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, + NON_INHERITED_SPARSE_RANGE, }, keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}, models::{ @@ -57,10 +59,7 @@ use std::{ collections::{BTreeMap, HashMap, HashSet}, sync::atomic::AtomicU64, }; -use std::{ - cmp::{max, min, Ordering}, - ops::ControlFlow, -}; +use std::{cmp::min, ops::ControlFlow}; use std::{ collections::btree_map::Entry, ops::{Deref, Range}, @@ -74,6 +73,7 @@ use crate::{ metadata::TimelineMetadata, storage_layer::PersistentLayerDesc, }, + walredo, }; use crate::{ context::{DownloadBehavior, RequestContext}, @@ -84,8 +84,8 @@ use crate::{ disk_usage_eviction_task::finite_f32, tenant::storage_layer::{ AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer, - LayerAccessStatsReset, LayerName, ResidentLayer, ValueReconstructResult, - ValueReconstructState, ValuesReconstructState, + LayerAccessStatsReset, LayerName, ResidentLayer, ValueReconstructState, + ValuesReconstructState, }, }; use crate::{ @@ -137,10 +137,13 @@ use self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; -use super::config::TenantConf; +use super::{config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized}; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; -use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer}; +use super::{ + remote_timeline_client::RemoteTimelineClient, remote_timeline_client::WaitCompletionError, + storage_layer::ReadableLayer, +}; use super::{ secondary::heatmap::{HeatMapLayer, HeatMapTimeline}, GcError, @@ -177,25 +180,6 @@ impl std::fmt::Display for ImageLayerCreationMode { } } -/// Wrapper for key range to provide reverse ordering by range length for BinaryHeap -#[derive(Debug, Clone, PartialEq, Eq)] -pub(crate) struct Hole { - key_range: Range, - coverage_size: usize, -} - -impl Ord for Hole { - fn cmp(&self, other: &Self) -> Ordering { - other.coverage_size.cmp(&self.coverage_size) // inverse order - } -} - -impl PartialOrd for Hole { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - /// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things. /// Can be removed after all refactors are done. fn drop_rlock(rlock: tokio::sync::RwLockReadGuard) { @@ -443,6 +427,8 @@ pub struct Timeline { pub(crate) extra_test_dense_keyspace: ArcSwap, pub(crate) l0_flush_global_state: L0FlushGlobalState, + + pub(crate) handles: handle::PerTimelineState, } pub struct WalReceiverInfo { @@ -541,6 +527,12 @@ pub(crate) enum PageReconstructError { MissingKey(MissingKeyError), } +impl From for PageReconstructError { + fn from(_: layer_manager::Shutdown) -> Self { + PageReconstructError::Cancelled + } +} + impl GetVectoredError { #[cfg(test)] pub(crate) fn is_missing_key_error(&self) -> bool { @@ -548,17 +540,27 @@ impl GetVectoredError { } } -#[derive(Debug)] +impl From for GetVectoredError { + fn from(_: layer_manager::Shutdown) -> Self { + GetVectoredError::Cancelled + } +} + pub struct MissingKeyError { key: Key, shard: ShardNumber, cont_lsn: Lsn, request_lsn: Lsn, ancestor_lsn: Option, - traversal_path: Vec, backtrace: Option, } +impl std::fmt::Debug for MissingKeyError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self) + } +} + impl std::fmt::Display for MissingKeyError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( @@ -570,18 +572,6 @@ impl std::fmt::Display for MissingKeyError { write!(f, ", ancestor {}", ancestor_lsn)?; } - if !self.traversal_path.is_empty() { - writeln!(f)?; - } - - for (r, c, l) in &self.traversal_path { - writeln!( - f, - "layer traversal: result {:?}, cont_lsn {}, layer: {}", - r, c, l, - )?; - } - if let Some(ref backtrace) = self.backtrace { write!(f, "\n{}", backtrace)?; } @@ -619,6 +609,12 @@ pub(crate) enum CreateImageLayersError { Other(#[from] anyhow::Error), } +impl From for CreateImageLayersError { + fn from(_: layer_manager::Shutdown) -> Self { + CreateImageLayersError::Cancelled + } +} + #[derive(thiserror::Error, Debug, Clone)] pub(crate) enum FlushLayerError { /// Timeline cancellation token was cancelled @@ -642,7 +638,13 @@ impl FlushLayerError { // When crossing from generic anyhow errors to this error type, we explicitly check // for timeline cancellation to avoid logging inoffensive shutdown errors as warn/err. fn from_anyhow(timeline: &Timeline, err: anyhow::Error) -> Self { - if timeline.cancel.is_cancelled() { + let cancelled = timeline.cancel.is_cancelled() + // The upload queue might have been shut down before the official cancellation of the timeline. + || err + .downcast_ref::() + .map(NotInitialized::is_stopping) + .unwrap_or_default(); + if cancelled { Self::Cancelled } else { Self::Other(Arc::new(err)) @@ -650,6 +652,12 @@ impl FlushLayerError { } } +impl From for FlushLayerError { + fn from(_: layer_manager::Shutdown) -> Self { + FlushLayerError::Cancelled + } +} + #[derive(thiserror::Error, Debug)] pub(crate) enum GetVectoredError { #[error("timeline shutting down")] @@ -704,6 +712,7 @@ pub(crate) enum CompactFlags { ForceRepartition, ForceImageLayerCreation, EnhancedGcBottomMostCompaction, + DryRun, } impl std::fmt::Debug for Timeline { @@ -917,116 +926,44 @@ impl Timeline { self.timeline_get_throttle.throttle(ctx, 1).await; - match self.conf.get_impl { - GetImpl::Legacy => { - let reconstruct_state = ValueReconstructState { - records: Vec::new(), - img: None, - }; + let keyspace = KeySpace { + ranges: vec![key..key.next()], + }; - self.get_impl(key, lsn, reconstruct_state, ctx).await - } - GetImpl::Vectored => { - let keyspace = KeySpace { - ranges: vec![key..key.next()], - }; + // Initialise the reconstruct state for the key with the cache + // entry returned above. + let mut reconstruct_state = ValuesReconstructState::new(); - // Initialise the reconstruct state for the key with the cache - // entry returned above. - let mut reconstruct_state = ValuesReconstructState::new(); + let vectored_res = self + .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx) + .await; - let vectored_res = self - .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx) - .await; - - if self.conf.validate_vectored_get { - self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx) - .await; - } - - let key_value = vectored_res?.pop_first(); - match key_value { - Some((got_key, value)) => { - if got_key != key { - error!( - "Expected {}, but singular vectored get returned {}", - key, got_key - ); - Err(PageReconstructError::Other(anyhow!( - "Singular vectored get returned wrong key" - ))) - } else { - value - } - } - None => Err(PageReconstructError::MissingKey(MissingKeyError { - key, - shard: self.shard_identity.get_shard_number(&key), - cont_lsn: Lsn(0), - request_lsn: lsn, - ancestor_lsn: None, - traversal_path: Vec::new(), - backtrace: None, - })), + let key_value = vectored_res?.pop_first(); + match key_value { + Some((got_key, value)) => { + if got_key != key { + error!( + "Expected {}, but singular vectored get returned {}", + key, got_key + ); + Err(PageReconstructError::Other(anyhow!( + "Singular vectored get returned wrong key" + ))) + } else { + value } } + None => Err(PageReconstructError::MissingKey(MissingKeyError { + key, + shard: self.shard_identity.get_shard_number(&key), + cont_lsn: Lsn(0), + request_lsn: lsn, + ancestor_lsn: None, + backtrace: None, + })), } } - /// Not subject to [`Self::timeline_get_throttle`]. - async fn get_impl( - &self, - key: Key, - lsn: Lsn, - mut reconstruct_state: ValueReconstructState, - ctx: &RequestContext, - ) -> Result { - // XXX: structured stats collection for layer eviction here. - trace!( - "get page request for {}@{} from task kind {:?}", - key, - lsn, - ctx.task_kind() - ); - - let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME - .for_get_kind(GetKind::Singular) - .start_timer(); - let path = self - .get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx) - .await?; - timer.stop_and_record(); - - let start = Instant::now(); - let res = self.reconstruct_value(key, lsn, reconstruct_state).await; - let elapsed = start.elapsed(); - crate::metrics::RECONSTRUCT_TIME - .for_get_kind(GetKind::Singular) - .observe(elapsed.as_secs_f64()); - - if cfg!(feature = "testing") && res.is_err() { - // it can only be walredo issue - use std::fmt::Write; - - let mut msg = String::new(); - - path.into_iter().for_each(|(res, cont_lsn, layer)| { - writeln!( - msg, - "- layer traversal: result {res:?}, cont_lsn {cont_lsn}, layer: {}", - layer, - ) - .expect("string grows") - }); - - // this is to rule out or provide evidence that we could in some cases read a duplicate - // walrecord - tracing::info!("walredo failed, path:\n{msg}"); - } - - res - } - pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32; pub(crate) const VEC_GET_LAYERS_VISITED_WARN_THRESH: f64 = 512.0; @@ -1076,28 +1013,14 @@ impl Timeline { .throttle(ctx, key_count as usize) .await; - let res = match self.conf.get_vectored_impl { - GetVectoredImpl::Sequential => { - self.get_vectored_sequential_impl(keyspace, lsn, ctx).await - } - GetVectoredImpl::Vectored => { - let vectored_res = self - .get_vectored_impl( - keyspace.clone(), - lsn, - &mut ValuesReconstructState::new(), - ctx, - ) - .await; - - if self.conf.validate_vectored_get { - self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx) - .await; - } - - vectored_res - } - }; + let res = self + .get_vectored_impl( + keyspace.clone(), + lsn, + &mut ValuesReconstructState::new(), + ctx, + ) + .await; if let Some((metric, start)) = start { let elapsed = start.elapsed(); @@ -1186,65 +1109,6 @@ impl Timeline { vectored_res } - /// Not subject to [`Self::timeline_get_throttle`]. - pub(super) async fn get_vectored_sequential_impl( - &self, - keyspace: KeySpace, - lsn: Lsn, - ctx: &RequestContext, - ) -> Result>, GetVectoredError> { - let mut values = BTreeMap::new(); - - for range in keyspace.ranges { - let mut key = range.start; - while key != range.end { - let block = self - .get_impl(key, lsn, ValueReconstructState::default(), ctx) - .await; - - use PageReconstructError::*; - match block { - Err(Cancelled) => return Err(GetVectoredError::Cancelled), - Err(MissingKey(_)) - if NON_INHERITED_RANGE.contains(&key) - || NON_INHERITED_SPARSE_RANGE.contains(&key) => - { - // Ignore missing key error for aux key range. TODO: currently, we assume non_inherited_range == aux_key_range. - // When we add more types of keys into the page server, we should revisit this part of code and throw errors - // accordingly. - key = key.next(); - } - Err(MissingKey(err)) => { - return Err(GetVectoredError::MissingKey(err)); - } - Err(Other(err)) - if err - .to_string() - .contains("downloading evicted layer file failed") => - { - return Err(GetVectoredError::Other(err)) - } - Err(Other(err)) - if err - .chain() - .any(|cause| cause.to_string().contains("layer loading failed")) => - { - // The intent here is to achieve error parity with the vectored read path. - // When vectored read fails to load a layer it fails the whole read, hence - // we mimic this behaviour here to keep the validation happy. - return Err(GetVectoredError::Other(err)); - } - _ => { - values.insert(key, block); - key = key.next(); - } - } - } - } - - Ok(values) - } - pub(super) async fn get_vectored_impl( &self, keyspace: KeySpace, @@ -1315,113 +1179,6 @@ impl Timeline { Ok(results) } - /// Not subject to [`Self::timeline_get_throttle`]. - pub(super) async fn validate_get_vectored_impl( - &self, - vectored_res: &Result>, GetVectoredError>, - keyspace: KeySpace, - lsn: Lsn, - ctx: &RequestContext, - ) { - if keyspace.overlaps(&Key::metadata_key_range()) { - // skip validation for metadata key range - return; - } - - let sequential_res = self - .get_vectored_sequential_impl(keyspace.clone(), lsn, ctx) - .await; - - fn errors_match(lhs: &GetVectoredError, rhs: &GetVectoredError) -> bool { - use GetVectoredError::*; - match (lhs, rhs) { - (Oversized(l), Oversized(r)) => l == r, - (InvalidLsn(l), InvalidLsn(r)) => l == r, - (MissingKey(l), MissingKey(r)) => l.key == r.key, - (GetReadyAncestorError(_), GetReadyAncestorError(_)) => true, - (Other(_), Other(_)) => true, - _ => false, - } - } - - match (&sequential_res, vectored_res) { - (Err(GetVectoredError::Cancelled), _) => {}, - (_, Err(GetVectoredError::Cancelled)) => {}, - (Err(seq_err), Ok(_)) => { - panic!(concat!("Sequential get failed with {}, but vectored get did not", - " - keyspace={:?} lsn={}"), - seq_err, keyspace, lsn) }, - (Ok(_), Err(GetVectoredError::GetReadyAncestorError(GetReadyAncestorError::AncestorLsnTimeout(_)))) => { - // Sequential get runs after vectored get, so it is possible for the later - // to time out while waiting for its ancestor's Lsn to become ready and for the - // former to succeed (it essentially has a doubled wait time). - }, - (Ok(_), Err(vec_err)) => { - panic!(concat!("Vectored get failed with {}, but sequential get did not", - " - keyspace={:?} lsn={}"), - vec_err, keyspace, lsn) }, - (Err(seq_err), Err(vec_err)) => { - assert!(errors_match(seq_err, vec_err), - "Mismatched errors: {seq_err} != {vec_err} - keyspace={keyspace:?} lsn={lsn}")}, - (Ok(seq_values), Ok(vec_values)) => { - seq_values.iter().zip(vec_values.iter()).for_each(|((seq_key, seq_res), (vec_key, vec_res))| { - assert_eq!(seq_key, vec_key); - match (seq_res, vec_res) { - (Ok(seq_blob), Ok(vec_blob)) => { - Self::validate_key_equivalence(seq_key, &keyspace, lsn, seq_blob, vec_blob); - }, - (Err(err), Ok(_)) => { - panic!( - concat!("Sequential get failed with {} for key {}, but vectored get did not", - " - keyspace={:?} lsn={}"), - err, seq_key, keyspace, lsn) }, - (Ok(_), Err(err)) => { - panic!( - concat!("Vectored get failed with {} for key {}, but sequential get did not", - " - keyspace={:?} lsn={}"), - err, seq_key, keyspace, lsn) }, - (Err(_), Err(_)) => {} - } - }) - } - } - } - - fn validate_key_equivalence( - key: &Key, - keyspace: &KeySpace, - lsn: Lsn, - seq: &Bytes, - vec: &Bytes, - ) { - if *key == AUX_FILES_KEY { - // The value reconstruct of AUX_FILES_KEY from records is not deterministic - // since it uses a hash map under the hood. Hence, deserialise both results - // before comparing. - let seq_aux_dir_res = AuxFilesDirectory::des(seq); - let vec_aux_dir_res = AuxFilesDirectory::des(vec); - match (&seq_aux_dir_res, &vec_aux_dir_res) { - (Ok(seq_aux_dir), Ok(vec_aux_dir)) => { - assert_eq!( - seq_aux_dir, vec_aux_dir, - "Mismatch for key {} - keyspace={:?} lsn={}", - key, keyspace, lsn - ); - } - (Err(_), Err(_)) => {} - _ => { - panic!("Mismatch for {key}: {seq_aux_dir_res:?} != {vec_aux_dir_res:?}"); - } - } - } else { - // All other keys should reconstruct deterministically, so we simply compare the blobs. - assert_eq!( - seq, vec, - "Image mismatch for key {key} - keyspace={keyspace:?} lsn={lsn}" - ); - } - } - /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev. pub(crate) fn get_last_record_lsn(&self) -> Lsn { self.last_record_lsn.load().last @@ -1465,12 +1222,7 @@ impl Timeline { /// Hence, the result **does not represent local filesystem usage**. pub(crate) async fn layer_size_sum(&self) -> u64 { let guard = self.layers.read().await; - let layer_map = guard.layer_map(); - let mut size = 0; - for l in layer_map.iter_historic_layers() { - size += l.file_size; - } - size + guard.layer_size_sum() } pub(crate) fn resident_physical_size(&self) -> u64 { @@ -1637,16 +1389,15 @@ impl Timeline { // This exists to provide a non-span creating version of `freeze_and_flush` we can call without // polluting the span hierarchy. pub(crate) async fn freeze_and_flush0(&self) -> Result<(), FlushLayerError> { - let to_lsn = { + let token = { // Freeze the current open in-memory layer. It will be written to disk on next // iteration. let mut g = self.write_lock.lock().await; let to_lsn = self.get_last_record_lsn(); - self.freeze_inmem_layer_at(to_lsn, &mut g).await; - to_lsn + self.freeze_inmem_layer_at(to_lsn, &mut g).await? }; - self.flush_frozen_layers_and_wait(to_lsn).await + self.wait_flush_completion(token).await } // Check if an open ephemeral layer should be closed: this provides @@ -1660,12 +1411,20 @@ impl Timeline { return; }; + // FIXME: why not early exit? because before #7927 the state would had been cleared every + // time, and this was missed. + // if write_guard.is_none() { return; } + let Ok(layers_guard) = self.layers.try_read() else { // Don't block if the layer lock is busy return; }; - let Some(open_layer) = &layers_guard.layer_map().open_layer else { + let Ok(lm) = layers_guard.layer_map() else { + return; + }; + + let Some(open_layer) = &lm.open_layer else { // If there is no open layer, we have no layer freezing to do. However, we might need to generate // some updates to disk_consistent_lsn and remote_consistent_lsn, in case we ingested some WAL regions // that didn't result in writes to this shard. @@ -1691,9 +1450,16 @@ impl Timeline { ); // The flush loop will update remote consistent LSN as well as disk consistent LSN. - self.flush_frozen_layers_and_wait(last_record_lsn) - .await - .ok(); + // We know there is no open layer, so we can request freezing without actually + // freezing anything. This is true even if we have dropped the layers_guard, we + // still hold the write_guard. + let _ = async { + let token = self + .freeze_inmem_layer_at(last_record_lsn, &mut write_guard) + .await?; + self.wait_flush_completion(token).await + } + .await; } } @@ -1731,45 +1497,39 @@ impl Timeline { self.last_freeze_at.load(), open_layer.get_opened_at(), ) { - let at_lsn = match open_layer.info() { + match open_layer.info() { InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => { // We may reach this point if the layer was already frozen by not yet flushed: flushing // happens asynchronously in the background. tracing::debug!( "Not freezing open layer, it's already frozen ({lsn_start}..{lsn_end})" ); - None } InMemoryLayerInfo::Open { .. } => { // Upgrade to a write lock and freeze the layer drop(layers_guard); - let mut layers_guard = self.layers.write().await; - let froze = layers_guard - .try_freeze_in_memory_layer( - current_lsn, - &self.last_freeze_at, - &mut write_guard, - ) + let res = self + .freeze_inmem_layer_at(current_lsn, &mut write_guard) .await; - Some(current_lsn).filter(|_| froze) - } - }; - if let Some(lsn) = at_lsn { - let res: Result = self.flush_frozen_layers(lsn); - if let Err(e) = res { - tracing::info!("failed to flush frozen layer after background freeze: {e:#}"); + + if let Err(e) = res { + tracing::info!( + "failed to flush frozen layer after background freeze: {e:#}" + ); + } } } } } - /// Outermost timeline compaction operation; downloads needed layers. + /// Outermost timeline compaction operation; downloads needed layers. Returns whether we have pending + /// compaction tasks. pub(crate) async fn compact( self: &Arc, cancel: &CancellationToken, flags: EnumSet, ctx: &RequestContext, - ) -> Result<(), CompactionError> { + ) -> Result { // most likely the cancellation token is from background task, but in tests it could be the // request task as well. @@ -1789,8 +1549,8 @@ impl Timeline { // compaction task goes over it's period (20s) which is quite often in production. let (_guard, _permit) = tokio::select! { tuple = prepare => { tuple }, - _ = self.cancel.cancelled() => return Ok(()), - _ = cancel.cancelled() => return Ok(()), + _ = self.cancel.cancelled() => return Ok(false), + _ = cancel.cancelled() => return Ok(false), }; let last_record_lsn = self.get_last_record_lsn(); @@ -1798,11 +1558,14 @@ impl Timeline { // Last record Lsn could be zero in case the timeline was just created if !last_record_lsn.is_valid() { warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}"); - return Ok(()); + return Ok(false); } match self.get_compaction_algorithm_settings().kind { - CompactionAlgorithm::Tiered => self.compact_tiered(cancel, ctx).await, + CompactionAlgorithm::Tiered => { + self.compact_tiered(cancel, ctx).await?; + Ok(false) + } CompactionAlgorithm::Legacy => self.compact_legacy(cancel, flags, ctx).await, } } @@ -1907,6 +1670,11 @@ impl Timeline { // about corner cases like s3 suddenly hanging up? self.remote_client.shutdown().await; } + Err(FlushLayerError::Cancelled) => { + // this is likely the second shutdown, ignore silently. + // TODO: this can be removed once https://github.com/neondatabase/neon/issues/5080 + debug_assert!(self.cancel.is_cancelled()); + } Err(e) => { // Non-fatal. Shutdown is infallible. Failures to flush just mean that // we have some extra WAL replay to do next time the timeline starts. @@ -1919,9 +1687,13 @@ impl Timeline { tracing::debug!("Cancelling CancellationToken"); self.cancel.cancel(); + // Ensure Prevent new page service requests from starting. + self.handles.shutdown(); + // Transition the remote_client into a state where it's only useful for timeline deletion. // (The deletion use case is why we can't just hook up remote_client to Self::cancel).) self.remote_client.stop(); + // As documented in remote_client.stop()'s doc comment, it's our responsibility // to shut down the upload queue tasks. // TODO: fix that, task management should be encapsulated inside remote_client. @@ -1932,10 +1704,17 @@ impl Timeline { ) .await; - // TODO: work toward making this a no-op. See this funciton's doc comment for more context. + // TODO: work toward making this a no-op. See this function's doc comment for more context. tracing::debug!("Waiting for tasks..."); task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), Some(self.timeline_id)).await; + { + // Allow any remaining in-memory layers to do cleanup -- until that, they hold the gate + // open. + let mut write_guard = self.write_lock.lock().await; + self.layers.write().await.shutdown(&mut write_guard); + } + // Finally wait until any gate-holders are complete. // // TODO: once above shutdown_tasks is a no-op, we can close the gate before calling shutdown_tasks @@ -1991,6 +1770,11 @@ impl Timeline { self.current_state() == TimelineState::Active } + #[allow(unused)] + pub(crate) fn is_archived(&self) -> Option { + self.remote_client.is_archived() + } + pub(crate) fn is_stopping(&self) -> bool { self.current_state() == TimelineState::Stopping } @@ -2024,9 +1808,12 @@ impl Timeline { } } - pub(crate) async fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo { + pub(crate) async fn layer_map_info( + &self, + reset: LayerAccessStatsReset, + ) -> Result { let guard = self.layers.read().await; - let layer_map = guard.layer_map(); + let layer_map = guard.layer_map()?; let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1); if let Some(open_layer) = &layer_map.open_layer { in_memory_layers.push(open_layer.info()); @@ -2035,16 +1822,15 @@ impl Timeline { in_memory_layers.push(frozen_layer.info()); } - let mut historic_layers = Vec::new(); - for historic_layer in layer_map.iter_historic_layers() { - let historic_layer = guard.get_from_desc(&historic_layer); - historic_layers.push(historic_layer.info(reset)); - } + let historic_layers = layer_map + .iter_historic_layers() + .map(|desc| guard.get_from_desc(&desc).info(reset)) + .collect(); - LayerMapInfo { + Ok(LayerMapInfo { in_memory_layers, historic_layers, - } + }) } #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))] @@ -2052,7 +1838,7 @@ impl Timeline { &self, layer_file_name: &LayerName, ) -> anyhow::Result> { - let Some(layer) = self.find_layer(layer_file_name).await else { + let Some(layer) = self.find_layer(layer_file_name).await? else { return Ok(None); }; @@ -2073,7 +1859,7 @@ impl Timeline { .enter() .map_err(|_| anyhow::anyhow!("Shutting down"))?; - let Some(local_layer) = self.find_layer(layer_file_name).await else { + let Some(local_layer) = self.find_layer(layer_file_name).await? else { return Ok(None); }; @@ -2439,6 +2225,8 @@ impl Timeline { extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())), l0_flush_global_state: resources.l0_flush_global_state, + + handles: Default::default(), }; result.repartition_threshold = result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE; @@ -2557,7 +2345,10 @@ impl Timeline { let mut layers = self.layers.try_write().expect( "in the context where we call this function, no other task has access to the object", ); - layers.initialize_empty(Lsn(start_lsn.0)); + layers + .open_mut() + .expect("in this context the LayerManager must still be open") + .initialize_empty(Lsn(start_lsn.0)); } /// Scan the timeline directory, cleanup, populate the layer map, and schedule uploads for local-only @@ -2689,7 +2480,10 @@ impl Timeline { let num_layers = loaded_layers.len(); - guard.initialize_local_layers(loaded_layers, disk_consistent_lsn + 1); + guard + .open_mut() + .expect("layermanager must be open during init") + .initialize_local_layers(loaded_layers, disk_consistent_lsn + 1); self.remote_client .schedule_layer_file_deletion(&needs_cleanup)?; @@ -2722,6 +2516,10 @@ impl Timeline { // Tenant::create_timeline will wait for these uploads to happen before returning, or // on retry. + // Now that we have the full layer map, we may calculate the visibility of layers within it (a global scan) + drop(guard); // drop write lock, update_layer_visibility will take a read lock. + self.update_layer_visibility().await?; + info!( "loaded layer map with {} layers at {}, total physical size: {}", num_layers, disk_consistent_lsn, total_physical_size @@ -3142,16 +2940,17 @@ impl Timeline { } } - async fn find_layer(&self, layer_name: &LayerName) -> Option { + async fn find_layer( + &self, + layer_name: &LayerName, + ) -> Result, layer_manager::Shutdown> { let guard = self.layers.read().await; - for historic_layer in guard.layer_map().iter_historic_layers() { - let historic_layer_name = historic_layer.layer_name(); - if layer_name == &historic_layer_name { - return Some(guard.get_from_desc(&historic_layer)); - } - } - - None + let layer = guard + .layer_map()? + .iter_historic_layers() + .find(|l| &l.layer_name() == layer_name) + .map(|found| guard.get_from_desc(&found)); + Ok(layer) } /// The timeline heatmap is a hint to secondary locations from the primary location, @@ -3168,14 +2967,22 @@ impl Timeline { let guard = self.layers.read().await; - let resident = guard.likely_resident_layers().map(|layer| { - let last_activity_ts = layer.access_stats().latest_activity(); - - HeatMapLayer::new( - layer.layer_desc().layer_name(), - layer.metadata(), - last_activity_ts, - ) + let resident = guard.likely_resident_layers().filter_map(|layer| { + match layer.visibility() { + LayerVisibilityHint::Visible => { + // Layer is visible to one or more read LSNs: elegible for inclusion in layer map + let last_activity_ts = layer.latest_activity(); + Some(HeatMapLayer::new( + layer.layer_desc().layer_name(), + layer.metadata(), + last_activity_ts, + )) + } + LayerVisibilityHint::Covered => { + // Layer is resident but unlikely to be read: not elegible for inclusion in heatmap. + None + } + } }); let layers = resident.collect(); @@ -3193,228 +3000,7 @@ impl Timeline { } } -type TraversalId = Arc; - -trait TraversalLayerExt { - fn traversal_id(&self) -> TraversalId; -} - -impl TraversalLayerExt for Layer { - fn traversal_id(&self) -> TraversalId { - Arc::clone(self.debug_str()) - } -} - -impl TraversalLayerExt for Arc { - fn traversal_id(&self) -> TraversalId { - Arc::clone(self.local_path_str()) - } -} - impl Timeline { - /// - /// Get a handle to a Layer for reading. - /// - /// The returned Layer might be from an ancestor timeline, if the - /// segment hasn't been updated on this timeline yet. - /// - /// This function takes the current timeline's locked LayerMap as an argument, - /// so callers can avoid potential race conditions. - /// - /// # Cancel-Safety - /// - /// This method is cancellation-safe. - async fn get_reconstruct_data( - &self, - key: Key, - request_lsn: Lsn, - reconstruct_state: &mut ValueReconstructState, - ctx: &RequestContext, - ) -> Result, PageReconstructError> { - // Start from the current timeline. - let mut timeline_owned; - let mut timeline = self; - - let mut read_count = scopeguard::guard(0, |cnt| { - crate::metrics::READ_NUM_LAYERS_VISITED.observe(cnt as f64) - }); - - // For debugging purposes, collect the path of layers that we traversed - // through. It's included in the error message if we fail to find the key. - let mut traversal_path = Vec::::new(); - - let cached_lsn = if let Some((cached_lsn, _)) = &reconstruct_state.img { - *cached_lsn - } else { - Lsn(0) - }; - - // 'prev_lsn' tracks the last LSN that we were at in our search. It's used - // to check that each iteration make some progress, to break infinite - // looping if something goes wrong. - let mut prev_lsn = None; - - let mut result = ValueReconstructResult::Continue; - let mut cont_lsn = Lsn(request_lsn.0 + 1); - - 'outer: loop { - if self.cancel.is_cancelled() { - return Err(PageReconstructError::Cancelled); - } - - // The function should have updated 'state' - //info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn); - match result { - ValueReconstructResult::Complete => return Ok(traversal_path), - ValueReconstructResult::Continue => { - // If we reached an earlier cached page image, we're done. - if cont_lsn == cached_lsn + 1 { - return Ok(traversal_path); - } - if let Some(prev) = prev_lsn { - if prev <= cont_lsn { - // Didn't make any progress in last iteration. Error out to avoid - // getting stuck in the loop. - return Err(PageReconstructError::MissingKey(MissingKeyError { - key, - shard: self.shard_identity.get_shard_number(&key), - cont_lsn: Lsn(cont_lsn.0 - 1), - request_lsn, - ancestor_lsn: Some(timeline.ancestor_lsn), - traversal_path, - backtrace: None, - })); - } - } - prev_lsn = Some(cont_lsn); - } - ValueReconstructResult::Missing => { - return Err(PageReconstructError::MissingKey(MissingKeyError { - key, - shard: self.shard_identity.get_shard_number(&key), - cont_lsn, - request_lsn, - ancestor_lsn: None, - traversal_path, - backtrace: if cfg!(test) { - Some(std::backtrace::Backtrace::force_capture()) - } else { - None - }, - })); - } - } - - // Recurse into ancestor if needed - if let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() { - if key.is_inherited_key() && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn { - trace!( - "going into ancestor {}, cont_lsn is {}", - timeline.ancestor_lsn, - cont_lsn - ); - - timeline_owned = timeline - .get_ready_ancestor_timeline(ancestor_timeline, ctx) - .await?; - timeline = &*timeline_owned; - prev_lsn = None; - continue 'outer; - } - } - - let guard = timeline.layers.read().await; - let layers = guard.layer_map(); - - // Check the open and frozen in-memory layers first, in order from newest - // to oldest. - if let Some(open_layer) = &layers.open_layer { - let start_lsn = open_layer.get_lsn_range().start; - if cont_lsn > start_lsn { - //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.layer_name().display()); - // Get all the data needed to reconstruct the page version from this layer. - // But if we have an older cached page image, no need to go past that. - let lsn_floor = max(cached_lsn + 1, start_lsn); - - let open_layer = open_layer.clone(); - drop(guard); - - result = match open_layer - .get_value_reconstruct_data( - key, - lsn_floor..cont_lsn, - reconstruct_state, - ctx, - ) - .await - { - Ok(result) => result, - Err(e) => return Err(PageReconstructError::from(e)), - }; - cont_lsn = lsn_floor; - *read_count += 1; - traversal_path.push((result, cont_lsn, open_layer.traversal_id())); - continue 'outer; - } - } - for frozen_layer in layers.frozen_layers.iter().rev() { - let start_lsn = frozen_layer.get_lsn_range().start; - if cont_lsn > start_lsn { - //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.layer_name().display()); - let lsn_floor = max(cached_lsn + 1, start_lsn); - - let frozen_layer = frozen_layer.clone(); - drop(guard); - - result = match frozen_layer - .get_value_reconstruct_data( - key, - lsn_floor..cont_lsn, - reconstruct_state, - ctx, - ) - .await - { - Ok(result) => result, - Err(e) => return Err(PageReconstructError::from(e)), - }; - cont_lsn = lsn_floor; - *read_count += 1; - traversal_path.push((result, cont_lsn, frozen_layer.traversal_id())); - continue 'outer; - } - } - - if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) { - let layer = guard.get_from_desc(&layer); - drop(guard); - // Get all the data needed to reconstruct the page version from this layer. - // But if we have an older cached page image, no need to go past that. - let lsn_floor = max(cached_lsn + 1, lsn_floor); - result = match layer - .get_value_reconstruct_data(key, lsn_floor..cont_lsn, reconstruct_state, ctx) - .await - { - Ok(result) => result, - Err(e) => return Err(PageReconstructError::from(e)), - }; - cont_lsn = lsn_floor; - *read_count += 1; - traversal_path.push((result, cont_lsn, layer.traversal_id())); - continue 'outer; - } else if timeline.ancestor_timeline.is_some() { - // Nothing on this timeline. Traverse to parent - result = ValueReconstructResult::Continue; - cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1); - continue 'outer; - } else { - // Nothing found - result = ValueReconstructResult::Missing; - continue 'outer; - } - } - } - #[allow(unknown_lints)] // doc_lazy_continuation is still a new lint #[allow(clippy::doc_lazy_continuation)] /// Get the data needed to reconstruct all keys in the provided keyspace @@ -3509,7 +3095,6 @@ impl Timeline { cont_lsn, request_lsn, ancestor_lsn: Some(timeline.ancestor_lsn), - traversal_path: vec![], backtrace: None, })); } @@ -3568,7 +3153,7 @@ impl Timeline { // which turns out to be a perf bottleneck in some cases. if !unmapped_keyspace.is_empty() { let guard = timeline.layers.read().await; - let layers = guard.layer_map(); + let layers = guard.layer_map()?; let in_memory_layer = layers.find_in_memory_layer(|l| { let start_lsn = l.get_lsn_range().start; @@ -3701,30 +3286,50 @@ impl Timeline { Ok(ancestor.clone()) } - pub(crate) fn get_ancestor_timeline(&self) -> Option> { - self.ancestor_timeline.clone() - } - pub(crate) fn get_shard_identity(&self) -> &ShardIdentity { &self.shard_identity } + #[inline(always)] + pub(crate) fn shard_timeline_id(&self) -> ShardTimelineId { + ShardTimelineId { + shard_index: ShardIndex { + shard_number: self.shard_identity.number, + shard_count: self.shard_identity.count, + }, + timeline_id: self.timeline_id, + } + } + + /// Returns a non-frozen open in-memory layer for ingestion. /// - /// Get a handle to the latest layer for appending. - /// + /// Takes a witness of timeline writer state lock being held, because it makes no sense to call + /// this function without holding the mutex. async fn get_layer_for_write( &self, lsn: Lsn, + _guard: &tokio::sync::MutexGuard<'_, Option>, ctx: &RequestContext, ) -> anyhow::Result> { let mut guard = self.layers.write().await; + let gate_guard = self.gate.enter().context("enter gate for inmem layer")?; + + let last_record_lsn = self.get_last_record_lsn(); + ensure!( + lsn > last_record_lsn, + "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})", + lsn, + last_record_lsn, + ); + let layer = guard + .open_mut()? .get_layer_for_write( lsn, - self.get_last_record_lsn(), self.conf, self.timeline_id, self.tenant_shard_id, + gate_guard, ctx, ) .await?; @@ -3738,21 +3343,48 @@ impl Timeline { self.last_record_lsn.advance(new_lsn); } + /// Freeze any existing open in-memory layer and unconditionally notify the flush loop. + /// + /// Unconditional flush loop notification is given because in sharded cases we will want to + /// leave an Lsn gap. Unsharded tenants do not have Lsn gaps. async fn freeze_inmem_layer_at( &self, at: Lsn, write_lock: &mut tokio::sync::MutexGuard<'_, Option>, - ) { + ) -> Result { let frozen = { let mut guard = self.layers.write().await; guard + .open_mut()? .try_freeze_in_memory_layer(at, &self.last_freeze_at, write_lock) .await }; + if frozen { let now = Instant::now(); *(self.last_freeze_ts.write().unwrap()) = now; } + + // Increment the flush cycle counter and wake up the flush task. + // Remember the new value, so that when we listen for the flush + // to finish, we know when the flush that we initiated has + // finished, instead of some other flush that was started earlier. + let mut my_flush_request = 0; + + let flush_loop_state = { *self.flush_loop_state.lock().unwrap() }; + if !matches!(flush_loop_state, FlushLoopState::Running { .. }) { + return Err(FlushLayerError::NotRunning(flush_loop_state)); + } + + self.layer_flush_start_tx.send_modify(|(counter, lsn)| { + my_flush_request = *counter + 1; + *counter = my_flush_request; + *lsn = std::cmp::max(at, *lsn); + }); + + assert_ne!(my_flush_request, 0); + + Ok(my_flush_request) } /// Layer flusher task's main loop. @@ -3789,7 +3421,11 @@ impl Timeline { let layer_to_flush = { let guard = self.layers.read().await; - guard.layer_map().frozen_layers.front().cloned() + let Ok(lm) = guard.layer_map() else { + info!("dropping out of flush loop for timeline shutdown"); + return; + }; + lm.frozen_layers.front().cloned() // drop 'layers' lock to allow concurrent reads and writes }; let Some(layer_to_flush) = layer_to_flush else { @@ -3846,34 +3482,7 @@ impl Timeline { } } - /// Request the flush loop to write out all frozen layers up to `at_lsn` as Delta L0 files to disk. - /// The caller is responsible for the freezing, e.g., [`Self::freeze_inmem_layer_at`]. - /// - /// `at_lsn` may be higher than the highest LSN of a frozen layer: if this is the - /// case, it means no data will be written between the top of the highest frozen layer and - /// to_lsn, e.g. because this tenant shard has ingested up to to_lsn and not written any data - /// locally for that part of the WAL. - fn flush_frozen_layers(&self, at_lsn: Lsn) -> Result { - // Increment the flush cycle counter and wake up the flush task. - // Remember the new value, so that when we listen for the flush - // to finish, we know when the flush that we initiated has - // finished, instead of some other flush that was started earlier. - let mut my_flush_request = 0; - - let flush_loop_state = { *self.flush_loop_state.lock().unwrap() }; - if !matches!(flush_loop_state, FlushLoopState::Running { .. }) { - return Err(FlushLayerError::NotRunning(flush_loop_state)); - } - - self.layer_flush_start_tx.send_modify(|(counter, lsn)| { - my_flush_request = *counter + 1; - *counter = my_flush_request; - *lsn = std::cmp::max(at_lsn, *lsn); - }); - - Ok(my_flush_request) - } - + /// Waits any flush request created by [`Self::freeze_inmem_layer_at`] to complete. async fn wait_flush_completion(&self, request: u64) -> Result<(), FlushLayerError> { let mut rx = self.layer_flush_done_tx.subscribe(); loop { @@ -3906,11 +3515,6 @@ impl Timeline { } } - async fn flush_frozen_layers_and_wait(&self, at_lsn: Lsn) -> Result<(), FlushLayerError> { - let token = self.flush_frozen_layers(at_lsn)?; - self.wait_flush_completion(token).await - } - /// Flush one frozen in-memory layer to disk, as a new delta layer. /// /// Return value is the last lsn (inclusive) of the layer that was frozen. @@ -4047,11 +3651,11 @@ impl Timeline { { let mut guard = self.layers.write().await; - if self.cancel.is_cancelled() { - return Err(FlushLayerError::Cancelled); - } - - guard.finish_flush_l0_layer(delta_layer_to_add.as_ref(), &frozen_layer, &self.metrics); + guard.open_mut()?.finish_flush_l0_layer( + delta_layer_to_add.as_ref(), + &frozen_layer, + &self.metrics, + ); if self.set_disk_consistent_lsn(disk_consistent_lsn) { // Schedule remote uploads that will reflect our new disk_consistent_lsn @@ -4061,6 +3665,21 @@ impl Timeline { // release lock on 'layers' }; + // Backpressure mechanism: wait with continuation of the flush loop until we have uploaded all layer files. + // This makes us refuse ingest until the new layers have been persisted to the remote. + self.remote_client + .wait_completion() + .await + .map_err(|e| match e { + WaitCompletionError::UploadQueueShutDownOrStopped + | WaitCompletionError::NotInitialized( + NotInitialized::ShuttingDown | NotInitialized::Stopped, + ) => FlushLayerError::Cancelled, + WaitCompletionError::NotInitialized(NotInitialized::Uninitialized) => { + FlushLayerError::Other(anyhow!(e).into()) + } + })?; + // FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`, // a compaction can delete the file and then it won't be available for uploads any more. // We still schedule the upload, resulting in an error, but ideally we'd somehow avoid this @@ -4076,17 +3695,11 @@ impl Timeline { /// Return true if the value changed /// - /// This function must only be used from the layer flush task, and may not be called concurrently. + /// This function must only be used from the layer flush task. fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool { - // We do a simple load/store cycle: that's why this function isn't safe for concurrent use. - let old_value = self.disk_consistent_lsn.load(); - if new_value != old_value { - assert!(new_value >= old_value); - self.disk_consistent_lsn.store(new_value); - true - } else { - false - } + let old_value = self.disk_consistent_lsn.fetch_max(new_value); + assert!(new_value >= old_value, "disk_consistent_lsn must be growing monotonously at runtime; current {old_value}, offered {new_value}"); + new_value != old_value } /// Update metadata file @@ -4153,12 +3766,14 @@ impl Timeline { let frozen_layer = Arc::clone(frozen_layer); let ctx = ctx.attached_child(); let work = async move { - let Some(new_delta) = frozen_layer - .write_to_disk(&self_clone, &ctx, key_range) + let Some((desc, path)) = frozen_layer + .write_to_disk(&ctx, key_range, self_clone.l0_flush_global_state.inner()) .await? else { return Ok(None); }; + let new_delta = Layer::finish_creating(self_clone.conf, &self_clone, desc, &path)?; + // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes. // We just need to fsync the directory in which these inodes are linked, // which we know to be the timeline directory. @@ -4248,7 +3863,9 @@ impl Timeline { let threshold = self.get_image_creation_threshold(); let guard = self.layers.read().await; - let layers = guard.layer_map(); + let Ok(layers) = guard.layer_map() else { + return false; + }; let mut max_deltas = 0; for part_range in &partition.ranges { @@ -4653,35 +4270,20 @@ impl Timeline { } } - // The writer.finish() above already did the fsync of the inodes. - // We just need to fsync the directory in which these inodes are linked, - // which we know to be the timeline directory. - if !image_layers.is_empty() { - // We use fatal_err() below because the after writer.finish() returns with success, - // the in-memory state of the filesystem already has the layer file in its final place, - // and subsequent pageserver code could think it's durable while it really isn't. - let timeline_dir = VirtualFile::open( - &self - .conf - .timeline_path(&self.tenant_shard_id, &self.timeline_id), - ctx, - ) - .await - .fatal_err("VirtualFile::open for timeline dir fsync"); - timeline_dir - .sync_all() - .await - .fatal_err("VirtualFile::sync_all timeline dir"); - } - let mut guard = self.layers.write().await; // FIXME: we could add the images to be uploaded *before* returning from here, but right - // now they are being scheduled outside of write lock - guard.track_new_image_layers(&image_layers, &self.metrics); + // now they are being scheduled outside of write lock; current way is inconsistent with + // compaction lock order. + guard + .open_mut()? + .track_new_image_layers(&image_layers, &self.metrics); drop_wlock(guard); timer.stop_and_record(); + // Creating image layers may have caused some previously visible layers to be covered + self.update_layer_visibility().await?; + Ok(image_layers) } @@ -4699,6 +4301,12 @@ impl Timeline { return; } + if self.current_logical_size.current_size().is_exact() { + // root timelines are initialized with exact count, but never start the background + // calculation + return; + } + if let Some(await_bg_cancel) = self .current_logical_size .cancel_wait_for_background_loop_concurrency_limit_semaphore @@ -4754,7 +4362,7 @@ impl Timeline { tenant: &crate::tenant::Tenant, prepared: detach_ancestor::PreparedTimelineDetach, ctx: &RequestContext, - ) -> Result, anyhow::Error> { + ) -> Result, anyhow::Error> { detach_ancestor::complete(self, tenant, prepared, ctx).await } @@ -4786,7 +4394,7 @@ pub(crate) enum CompactionError { ShuttingDown, /// Compaction cannot be done right now; page reconstruction and so on. #[error(transparent)] - Other(#[from] anyhow::Error), + Other(anyhow::Error), } impl From for CompactionError { @@ -4801,6 +4409,44 @@ impl From for CompactionError { } } +impl From for CompactionError { + fn from(value: super::upload_queue::NotInitialized) -> Self { + match value { + super::upload_queue::NotInitialized::Uninitialized + | super::upload_queue::NotInitialized::Stopped => { + CompactionError::Other(anyhow::anyhow!(value)) + } + super::upload_queue::NotInitialized::ShuttingDown => CompactionError::ShuttingDown, + } + } +} + +impl CompactionError { + /// We cannot do compaction because we could not download a layer that is input to the compaction. + pub(crate) fn input_layer_download_failed( + e: super::storage_layer::layer::DownloadError, + ) -> Self { + match e { + super::storage_layer::layer::DownloadError::TimelineShutdown | + /* TODO DownloadCancelled correct here? */ + super::storage_layer::layer::DownloadError::DownloadCancelled => CompactionError::ShuttingDown, + super::storage_layer::layer::DownloadError::ContextAndConfigReallyDeniesDownloads | + super::storage_layer::layer::DownloadError::DownloadRequired | + super::storage_layer::layer::DownloadError::NotFile(_) | + super::storage_layer::layer::DownloadError::DownloadFailed | + super::storage_layer::layer::DownloadError::PreStatFailed(_)=>CompactionError::Other(anyhow::anyhow!(e)), + #[cfg(test)] + super::storage_layer::layer::DownloadError::Failpoint(_) => CompactionError::Other(anyhow::anyhow!(e)), + } + } +} + +impl From for CompactionError { + fn from(_: layer_manager::Shutdown) -> Self { + CompactionError::ShuttingDown + } +} + #[serde_as] #[derive(serde::Serialize)] struct RecordedDuration(#[serde_as(as = "serde_with::DurationMicroSeconds")] Duration); @@ -4874,7 +4520,7 @@ impl Timeline { new_deltas: &[ResidentLayer], new_images: &[ResidentLayer], layers_to_remove: &[Layer], - ) -> anyhow::Result<()> { + ) -> Result<(), CompactionError> { let mut guard = self.layers.write().await; let mut duplicated_layers = HashSet::new(); @@ -4892,7 +4538,7 @@ impl Timeline { // because we have not implemented L0 => L0 compaction. duplicated_layers.insert(l.layer_desc().key()); } else if LayerMap::is_l0(&l.layer_desc().key_range) { - bail!("compaction generates a L0 layer file as output, which will cause infinite compaction."); + return Err(CompactionError::Other(anyhow::anyhow!("compaction generates a L0 layer file as output, which will cause infinite compaction."))); } else { insert_layers.push(l.clone()); } @@ -4906,11 +4552,14 @@ impl Timeline { .collect(); if !new_images.is_empty() { - guard.track_new_image_layers(new_images, &self.metrics); + guard + .open_mut()? + .track_new_image_layers(new_images, &self.metrics); } - // deletion will happen later, the layer file manager calls garbage_collect_on_drop - guard.finish_compact_l0(&remove_layers, &insert_layers, &self.metrics); + guard + .open_mut()? + .finish_compact_l0(&remove_layers, &insert_layers, &self.metrics); self.remote_client .schedule_compaction_update(&remove_layers, new_deltas)?; @@ -4924,7 +4573,7 @@ impl Timeline { self: &Arc, mut replace_layers: Vec<(Layer, ResidentLayer)>, mut drop_layers: Vec, - ) -> anyhow::Result<()> { + ) -> Result<(), CompactionError> { let mut guard = self.layers.write().await; // Trim our lists in case our caller (compaction) raced with someone else (GC) removing layers: we want @@ -4932,7 +4581,9 @@ impl Timeline { replace_layers.retain(|(l, _)| guard.contains(l)); drop_layers.retain(|l| guard.contains(l)); - guard.rewrite_layers(&replace_layers, &drop_layers, &self.metrics); + guard + .open_mut()? + .rewrite_layers(&replace_layers, &drop_layers, &self.metrics); let upload_layers: Vec<_> = replace_layers.into_iter().map(|r| r.1).collect(); @@ -4946,7 +4597,7 @@ impl Timeline { fn upload_new_image_layers( self: &Arc, new_images: impl IntoIterator, - ) -> anyhow::Result<()> { + ) -> Result<(), super::upload_queue::NotInitialized> { for layer in new_images { self.remote_client.schedule_layer_file_upload(layer)?; } @@ -5221,7 +4872,7 @@ impl Timeline { // // TODO holding a write lock is too agressive and avoidable let mut guard = self.layers.write().await; - let layers = guard.layer_map(); + let layers = guard.layer_map()?; 'outer: for l in layers.iter_historic_layers() { result.layers_total += 1; @@ -5349,7 +5000,7 @@ impl Timeline { } })?; - guard.finish_gc_timeline(&gc_layers); + guard.open_mut()?.finish_gc_timeline(&gc_layers); #[cfg(feature = "testing")] { @@ -5414,20 +5065,22 @@ impl Timeline { } else { trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn); }; - - let img = match self + let res = self .walredo_mgr .as_ref() .context("timeline has no walredo manager") .map_err(PageReconstructError::WalRedo)? .request_redo(key, request_lsn, data.img, data.records, self.pg_version) - .await - .context("reconstruct a page image") - { + .await; + let img = match res { Ok(img) => img, - Err(e) => return Err(PageReconstructError::WalRedo(e)), + Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled), + Err(walredo::Error::Other(e)) => { + return Err(PageReconstructError::WalRedo( + e.context("reconstruct a page image"), + )) + } }; - Ok(img) } } @@ -5503,9 +5156,13 @@ impl Timeline { let remaining = { let guard = self.layers.read().await; - guard - .layer_map() - .iter_historic_layers() + let Ok(lm) = guard.layer_map() else { + // technically here we could look into iterating accessible layers, but downloading + // all layers of a shutdown timeline makes no sense regardless. + tracing::info!("attempted to download all layers of shutdown timeline"); + return; + }; + lm.iter_historic_layers() .map(|desc| guard.get_from_desc(&desc)) .collect::>() }; @@ -5612,10 +5269,10 @@ impl Timeline { let file_size = layer.layer_desc().file_size; max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size))); - let last_activity_ts = layer.access_stats().latest_activity(); + let last_activity_ts = layer.latest_activity(); EvictionCandidate { - layer: layer.into(), + layer: layer.to_owned().into(), last_activity_ts, relative_last_activity: finite_f32::FiniteF32::ZERO, } @@ -5635,6 +5292,22 @@ impl Timeline { } } + /// Persistently blocks gc for `Manual` reason. + /// + /// Returns true if no such block existed before, false otherwise. + pub(crate) async fn block_gc(&self, tenant: &super::Tenant) -> anyhow::Result { + use crate::tenant::remote_timeline_client::index::GcBlockingReason; + assert_eq!(self.tenant_shard_id, tenant.tenant_shard_id); + tenant.gc_block.insert(self, GcBlockingReason::Manual).await + } + + /// Persistently unblocks gc for `Manual` reason. + pub(crate) async fn unblock_gc(&self, tenant: &super::Tenant) -> anyhow::Result<()> { + use crate::tenant::remote_timeline_client::index::GcBlockingReason; + assert_eq!(self.tenant_shard_id, tenant.tenant_shard_id); + tenant.gc_block.remove(self, GcBlockingReason::Manual).await + } + #[cfg(test)] pub(super) fn force_advance_lsn(self: &Arc, new_lsn: Lsn) { self.last_record_lsn.advance(new_lsn); @@ -5684,7 +5357,7 @@ impl Timeline { { let mut guard = self.layers.write().await; - guard.force_insert_layer(image_layer); + guard.open_mut().unwrap().force_insert_layer(image_layer); } Ok(()) @@ -5728,7 +5401,7 @@ impl Timeline { } let guard = self.layers.read().await; - for layer in guard.layer_map().iter_historic_layers() { + for layer in guard.layer_map()?.iter_historic_layers() { if layer.is_delta() && overlaps_with(&layer.lsn_range, &deltas.lsn_range) && layer.lsn_range != deltas.lsn_range @@ -5753,13 +5426,12 @@ impl Timeline { for (key, lsn, val) in deltas.data { delta_layer_writer.put_value(key, lsn, val, ctx).await?; } - let delta_layer = delta_layer_writer - .finish(deltas.key_range.end, self, ctx) - .await?; + let (desc, path) = delta_layer_writer.finish(deltas.key_range.end, ctx).await?; + let delta_layer = Layer::finish_creating(self.conf, self, desc, &path)?; { let mut guard = self.layers.write().await; - guard.force_insert_layer(delta_layer); + guard.open_mut().unwrap().force_insert_layer(delta_layer); } Ok(()) @@ -5774,7 +5446,7 @@ impl Timeline { ) -> anyhow::Result> { let mut all_data = Vec::new(); let guard = self.layers.read().await; - for layer in guard.layer_map().iter_historic_layers() { + for layer in guard.layer_map()?.iter_historic_layers() { if !layer.is_delta() && layer.image_layer_lsn() == lsn { let layer = guard.get_from_desc(&layer); let mut reconstruct_data = ValuesReconstructState::default(); @@ -5802,7 +5474,7 @@ impl Timeline { ) -> anyhow::Result> { let mut layers = Vec::new(); let guard = self.layers.read().await; - for layer in guard.layer_map().iter_historic_layers() { + for layer in guard.layer_map()?.iter_historic_layers() { layers.push(layer.key()); } Ok(layers) @@ -5816,12 +5488,10 @@ impl Timeline { } } -type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId); - /// Tracking writes ingestion does to a particular in-memory layer. /// /// Cleared upon freezing a layer. -struct TimelineWriterState { +pub(crate) struct TimelineWriterState { open_layer: Arc, current_size: u64, // Previous Lsn which passed through @@ -5929,7 +5599,10 @@ impl<'a> TimelineWriter<'a> { } async fn open_layer(&mut self, at: Lsn, ctx: &RequestContext) -> anyhow::Result<()> { - let layer = self.tl.get_layer_for_write(at, ctx).await?; + let layer = self + .tl + .get_layer_for_write(at, &self.write_guard, ctx) + .await?; let initial_size = layer.size().await?; let last_freeze_at = self.last_freeze_at.load(); @@ -5942,15 +5615,15 @@ impl<'a> TimelineWriter<'a> { Ok(()) } - async fn roll_layer(&mut self, freeze_at: Lsn) -> anyhow::Result<()> { + async fn roll_layer(&mut self, freeze_at: Lsn) -> Result<(), FlushLayerError> { let current_size = self.write_guard.as_ref().unwrap().current_size; // self.write_guard will be taken by the freezing self.tl .freeze_inmem_layer_at(freeze_at, &mut self.write_guard) - .await; + .await?; - self.tl.flush_frozen_layers(freeze_at)?; + assert!(self.write_guard.is_none()); if current_size >= self.get_checkpoint_distance() * 2 { warn!("Flushed oversized open layer with size {}", current_size) @@ -6115,6 +5788,7 @@ mod tests { let layers = timeline.layers.read().await; let desc = layers .layer_map() + .unwrap() .iter_historic_layers() .next() .expect("must find one layer to evict"); diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index d0a74e3924..87ec46c0b5 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -4,7 +4,7 @@ //! //! The old legacy algorithm is implemented directly in `timeline.rs`. -use std::collections::BinaryHeap; +use std::collections::{BinaryHeap, HashSet}; use std::ops::{Deref, Range}; use std::sync::Arc; @@ -15,11 +15,14 @@ use super::{ }; use anyhow::{anyhow, Context}; +use bytes::Bytes; use enumset::EnumSet; use fail::fail_point; use itertools::Itertools; +use pageserver_api::key::KEY_SIZE; use pageserver_api::keyspace::ShardedRange; use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; +use serde::Serialize; use tokio_util::sync::CancellationToken; use tracing::{debug, info, info_span, trace, warn, Instrument}; use utils::id::TimelineId; @@ -27,16 +30,20 @@ use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD}; +use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::storage_layer::merge_iterator::MergeIterator; -use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc, ValueReconstructState}; +use crate::tenant::storage_layer::{ + AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState, +}; +use crate::tenant::timeline::ImageLayerCreationOutcome; use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter}; -use crate::tenant::timeline::{Hole, ImageLayerCreationOutcome}; use crate::tenant::timeline::{Layer, ResidentLayer}; use crate::tenant::DeltaLayer; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; use crate::keyspace::KeySpace; use crate::repository::{Key, Value}; +use crate::walrecord::NeonWalRecord; use utils::lsn::Lsn; @@ -68,47 +75,151 @@ impl KeyHistoryRetention { self, key: Key, delta_writer: &mut Vec<(Key, Lsn, Value)>, - image_writer: &mut ImageLayerWriter, + mut image_writer: Option<&mut ImageLayerWriter>, + stat: &mut CompactionStatistics, ctx: &RequestContext, ) -> anyhow::Result<()> { let mut first_batch = true; - for (_, KeyLogAtLsn(logs)) in self.below_horizon { + for (cutoff_lsn, KeyLogAtLsn(logs)) in self.below_horizon { if first_batch { if logs.len() == 1 && logs[0].1.is_image() { let Value::Image(img) = &logs[0].1 else { unreachable!() }; - image_writer.put_image(key, img.clone(), ctx).await?; + stat.produce_image_key(img); + if let Some(image_writer) = image_writer.as_mut() { + image_writer.put_image(key, img.clone(), ctx).await?; + } else { + delta_writer.push((key, cutoff_lsn, Value::Image(img.clone()))); + } } else { for (lsn, val) in logs { + stat.produce_key(&val); delta_writer.push((key, lsn, val)); } } first_batch = false; } else { for (lsn, val) in logs { + stat.produce_key(&val); delta_writer.push((key, lsn, val)); } } } let KeyLogAtLsn(above_horizon_logs) = self.above_horizon; for (lsn, val) in above_horizon_logs { + stat.produce_key(&val); delta_writer.push((key, lsn, val)); } Ok(()) } } +#[derive(Debug, Serialize, Default)] +struct CompactionStatisticsNumSize { + num: u64, + size: u64, +} + +#[derive(Debug, Serialize, Default)] +pub struct CompactionStatistics { + delta_layer_visited: CompactionStatisticsNumSize, + image_layer_visited: CompactionStatisticsNumSize, + delta_layer_produced: CompactionStatisticsNumSize, + image_layer_produced: CompactionStatisticsNumSize, + num_delta_layer_discarded: usize, + num_image_layer_discarded: usize, + num_unique_keys_visited: usize, + wal_keys_visited: CompactionStatisticsNumSize, + image_keys_visited: CompactionStatisticsNumSize, + wal_produced: CompactionStatisticsNumSize, + image_produced: CompactionStatisticsNumSize, +} + +impl CompactionStatistics { + fn estimated_size_of_value(val: &Value) -> usize { + match val { + Value::Image(img) => img.len(), + Value::WalRecord(NeonWalRecord::Postgres { rec, .. }) => rec.len(), + _ => std::mem::size_of::(), + } + } + fn estimated_size_of_key() -> usize { + KEY_SIZE // TODO: distinguish image layer and delta layer (count LSN in delta layer) + } + fn visit_delta_layer(&mut self, size: u64) { + self.delta_layer_visited.num += 1; + self.delta_layer_visited.size += size; + } + fn visit_image_layer(&mut self, size: u64) { + self.image_layer_visited.num += 1; + self.image_layer_visited.size += size; + } + fn on_unique_key_visited(&mut self) { + self.num_unique_keys_visited += 1; + } + fn visit_wal_key(&mut self, val: &Value) { + self.wal_keys_visited.num += 1; + self.wal_keys_visited.size += + Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64; + } + fn visit_image_key(&mut self, val: &Value) { + self.image_keys_visited.num += 1; + self.image_keys_visited.size += + Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64; + } + fn produce_key(&mut self, val: &Value) { + match val { + Value::Image(img) => self.produce_image_key(img), + Value::WalRecord(_) => self.produce_wal_key(val), + } + } + fn produce_wal_key(&mut self, val: &Value) { + self.wal_produced.num += 1; + self.wal_produced.size += + Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64; + } + fn produce_image_key(&mut self, val: &Bytes) { + self.image_produced.num += 1; + self.image_produced.size += val.len() as u64 + Self::estimated_size_of_key() as u64; + } + fn discard_delta_layer(&mut self) { + self.num_delta_layer_discarded += 1; + } + fn discard_image_layer(&mut self) { + self.num_image_layer_discarded += 1; + } + fn produce_delta_layer(&mut self, size: u64) { + self.delta_layer_produced.num += 1; + self.delta_layer_produced.size += size; + } + fn produce_image_layer(&mut self, size: u64) { + self.image_layer_produced.num += 1; + self.image_layer_produced.size += size; + } +} + impl Timeline { /// TODO: cancellation + /// + /// Returns whether the compaction has pending tasks. pub(crate) async fn compact_legacy( self: &Arc, cancel: &CancellationToken, flags: EnumSet, ctx: &RequestContext, - ) -> Result<(), CompactionError> { + ) -> Result { if flags.contains(CompactFlags::EnhancedGcBottomMostCompaction) { - return self.compact_with_gc(cancel, ctx).await; + self.compact_with_gc(cancel, flags, ctx) + .await + .map_err(CompactionError::Other)?; + return Ok(false); + } + + if flags.contains(CompactFlags::DryRun) { + return Err(CompactionError::Other(anyhow!( + "dry-run mode is not supported for legacy compaction for now" + ))); } // High level strategy for compaction / image creation: @@ -156,7 +267,7 @@ impl Timeline { // Define partitioning schema if needed // FIXME: the match should only cover repartitioning, not the next steps - let partition_count = match self + let (partition_count, has_pending_tasks) = match self .repartition( self.get_last_record_lsn(), self.get_compaction_target_size(), @@ -173,30 +284,35 @@ impl Timeline { // 2. Compact let timer = self.metrics.compact_time_histo.start_timer(); - self.compact_level0(target_file_size, ctx).await?; + let fully_compacted = self.compact_level0(target_file_size, ctx).await?; timer.stop_and_record(); - // 3. Create new image layers for partitions that have been modified - // "enough". let mut partitioning = dense_partitioning; partitioning .parts .extend(sparse_partitioning.into_dense().parts); - let image_layers = self - .create_image_layers( - &partitioning, - lsn, - if flags.contains(CompactFlags::ForceImageLayerCreation) { - ImageLayerCreationMode::Force - } else { - ImageLayerCreationMode::Try - }, - &image_ctx, - ) - .await?; - self.upload_new_image_layers(image_layers)?; - partitioning.parts.len() + // 3. Create new image layers for partitions that have been modified + // "enough". Skip image layer creation if L0 compaction cannot keep up. + if fully_compacted { + let image_layers = self + .create_image_layers( + &partitioning, + lsn, + if flags.contains(CompactFlags::ForceImageLayerCreation) { + ImageLayerCreationMode::Force + } else { + ImageLayerCreationMode::Try + }, + &image_ctx, + ) + .await?; + + self.upload_new_image_layers(image_layers)?; + } else { + info!("skipping image layer generation due to L0 compaction did not include all layers."); + } + (partitioning.parts.len(), !fully_compacted) } Err(err) => { // no partitioning? This is normal, if the timeline was just created @@ -208,7 +324,7 @@ impl Timeline { if !self.cancel.is_cancelled() { tracing::error!("could not compact, repartitioning keyspace failed: {err:?}"); } - 1 + (1, false) } }; @@ -221,7 +337,7 @@ impl Timeline { self.compact_shard_ancestors(rewrite_max, ctx).await?; } - Ok(()) + Ok(has_pending_tasks) } /// Check for layers that are elegible to be rewritten: @@ -236,7 +352,7 @@ impl Timeline { self: &Arc, rewrite_max: usize, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), CompactionError> { let mut drop_layers = Vec::new(); let mut layers_to_rewrite: Vec = Vec::new(); @@ -255,7 +371,7 @@ impl Timeline { ); let layers = self.layers.read().await; - for layer_desc in layers.layer_map().iter_historic_layers() { + for layer_desc in layers.layer_map()?.iter_historic_layers() { let layer = layers.get_from_desc(&layer_desc); if layer.metadata().shard.shard_count == self.shard_identity.count { // This layer does not belong to a historic ancestor, no need to re-image it. @@ -357,7 +473,8 @@ impl Timeline { layer.layer_desc().image_layer_lsn(), ctx, ) - .await?; + .await + .map_err(CompactionError::Other)?; // Safety of layer rewrites: // - We are writing to a different local file path than we are reading from, so the old Layer @@ -372,14 +489,20 @@ impl Timeline { // - We do not run concurrently with other kinds of compaction, so the only layer map writes we race with are: // - GC, which at worst witnesses us "undelete" a layer that they just deleted. // - ingestion, which only inserts layers, therefore cannot collide with us. - let resident = layer.download_and_keep_resident().await?; + let resident = layer + .download_and_keep_resident() + .await + .map_err(CompactionError::input_layer_download_failed)?; let keys_written = resident .filter(&self.shard_identity, &mut image_layer_writer, ctx) .await?; if keys_written > 0 { - let new_layer = image_layer_writer.finish(self, ctx).await?; + let new_layer = image_layer_writer + .finish(self, ctx) + .await + .map_err(CompactionError::Other)?; tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes", layer.metadata().file_size, new_layer.metadata().file_size); @@ -407,23 +530,72 @@ impl Timeline { // necessary for correctness, but it simplifies testing, and avoids proceeding with another // Timeline's compaction while this timeline's uploads may be generating lots of disk I/O // load. - self.remote_client.wait_completion().await?; + match self.remote_client.wait_completion().await { + Ok(()) => (), + Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)), + Err(WaitCompletionError::UploadQueueShutDownOrStopped) => { + return Err(CompactionError::ShuttingDown) + } + } fail::fail_point!("compact-shard-ancestors-persistent"); Ok(()) } + /// Update the LayerVisibilityHint of layers covered by image layers, based on whether there is + /// an image layer between them and the most recent readable LSN (branch point or tip of timeline). The + /// purpose of the visibility hint is to record which layers need to be available to service reads. + /// + /// The result may be used as an input to eviction and secondary downloads to de-prioritize layers + /// that we know won't be needed for reads. + pub(super) async fn update_layer_visibility( + &self, + ) -> Result<(), super::layer_manager::Shutdown> { + let head_lsn = self.get_last_record_lsn(); + + // We will sweep through layers in reverse-LSN order. We only do historic layers. L0 deltas + // are implicitly left visible, because LayerVisibilityHint's default is Visible, and we never modify it here. + // Note that L0 deltas _can_ be covered by image layers, but we consider them 'visible' because we anticipate that + // they will be subject to L0->L1 compaction in the near future. + let layer_manager = self.layers.read().await; + let layer_map = layer_manager.layer_map()?; + + let readable_points = { + let children = self.gc_info.read().unwrap().retain_lsns.clone(); + + let mut readable_points = Vec::with_capacity(children.len() + 1); + for (child_lsn, _child_timeline_id) in &children { + readable_points.push(*child_lsn); + } + readable_points.push(head_lsn); + readable_points + }; + + let (layer_visibility, covered) = layer_map.get_visibility(readable_points); + for (layer_desc, visibility) in layer_visibility { + // FIXME: a more efficiency bulk zip() through the layers rather than NlogN getting each one + let layer = layer_manager.get_from_desc(&layer_desc); + layer.set_visibility(visibility); + } + + // TODO: publish our covered KeySpace to our parent, so that when they update their visibility, they can + // avoid assuming that everything at a branch point is visible. + drop(covered); + Ok(()) + } + /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as - /// as Level 1 files. + /// as Level 1 files. Returns whether the L0 layers are fully compacted. async fn compact_level0( self: &Arc, target_file_size: u64, ctx: &RequestContext, - ) -> Result<(), CompactionError> { + ) -> Result { let CompactLevel0Phase1Result { new_layers, deltas_to_compact, + fully_compacted, } = { let phase1_span = info_span!("compact_level0_phase1"); let ctx = ctx.attached_child(); @@ -446,12 +618,12 @@ impl Timeline { if new_layers.is_empty() && deltas_to_compact.is_empty() { // nothing to do - return Ok(()); + return Ok(true); } self.finish_compact_batch(&new_layers, &Vec::new(), &deltas_to_compact) .await?; - Ok(()) + Ok(fully_compacted) } /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment. @@ -464,12 +636,8 @@ impl Timeline { ) -> Result { stats.read_lock_held_spawn_blocking_startup_micros = stats.read_lock_acquisition_micros.till_now(); // set by caller - let layers = guard.layer_map(); - let level0_deltas = layers.get_level0_deltas()?; - let mut level0_deltas = level0_deltas - .into_iter() - .map(|x| guard.get_from_desc(&x)) - .collect_vec(); + let layers = guard.layer_map()?; + let level0_deltas = layers.level0_deltas(); stats.level0_deltas_count = Some(level0_deltas.len()); // Only compact if enough layers have accumulated. @@ -482,6 +650,11 @@ impl Timeline { return Ok(CompactLevel0Phase1Result::default()); } + let mut level0_deltas = level0_deltas + .iter() + .map(|x| guard.get_from_desc(x)) + .collect::>(); + // Gather the files to compact in this iteration. // // Start with the oldest Level 0 delta file, and collect any other @@ -518,14 +691,25 @@ impl Timeline { ) as u64 * std::cmp::max(self.get_checkpoint_distance(), DEFAULT_CHECKPOINT_DISTANCE); - deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?); + let mut fully_compacted = true; + + deltas_to_compact.push( + first_level0_delta + .download_and_keep_resident() + .await + .map_err(CompactionError::input_layer_download_failed)?, + ); for l in level0_deltas_iter { let lsn_range = &l.layer_desc().lsn_range; if lsn_range.start != prev_lsn_end { break; } - deltas_to_compact.push(l.download_and_keep_resident().await?); + deltas_to_compact.push( + l.download_and_keep_resident() + .await + .map_err(CompactionError::input_layer_download_failed)?, + ); deltas_to_compact_bytes += l.metadata().file_size; prev_lsn_end = lsn_range.end; @@ -536,6 +720,7 @@ impl Timeline { "L0 compaction picker hit max delta layer size limit: {}", delta_size_limit ); + fully_compacted = false; // Proceed with compaction, but only a subset of L0s break; @@ -571,66 +756,230 @@ impl Timeline { .read_lock_held_spawn_blocking_startup_micros .till_now(); - // Determine N largest holes where N is number of compacted layers. - let max_holes = deltas_to_compact.len(); - let last_record_lsn = self.get_last_record_lsn(); - let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128; - let min_hole_coverage_size = 3; // TODO: something more flexible? - - // min-heap (reserve space for one more element added before eviction) - let mut heap: BinaryHeap = BinaryHeap::with_capacity(max_holes + 1); - let mut prev: Option = None; - - let mut all_keys = Vec::new(); - - for l in deltas_to_compact.iter() { - all_keys.extend(l.load_keys(ctx).await?); - } - - // FIXME: should spawn_blocking the rest of this function - - // The current stdlib sorting implementation is designed in a way where it is - // particularly fast where the slice is made up of sorted sub-ranges. - all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn)); + // TODO: replace with streaming k-merge + let all_keys = { + let mut all_keys = Vec::new(); + for l in deltas_to_compact.iter() { + all_keys.extend(l.load_keys(ctx).await.map_err(CompactionError::Other)?); + } + // The current stdlib sorting implementation is designed in a way where it is + // particularly fast where the slice is made up of sorted sub-ranges. + all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn)); + all_keys + }; stats.read_lock_held_key_sort_micros = stats.read_lock_held_prerequisites_micros.till_now(); - for &DeltaEntry { key: next_key, .. } in all_keys.iter() { - if let Some(prev_key) = prev { - // just first fast filter, do not create hole entries for metadata keys. The last hole in the - // compaction is the gap between data key and metadata keys. - if next_key.to_i128() - prev_key.to_i128() >= min_hole_range - && !Key::is_metadata_key(&prev_key) - { - let key_range = prev_key..next_key; - // Measuring hole by just subtraction of i128 representation of key range boundaries - // has not so much sense, because largest holes will corresponds field1/field2 changes. - // But we are mostly interested to eliminate holes which cause generation of excessive image layers. - // That is why it is better to measure size of hole as number of covering image layers. - let coverage_size = layers.image_coverage(&key_range, last_record_lsn).len(); - if coverage_size >= min_hole_coverage_size { - heap.push(Hole { - key_range, - coverage_size, - }); - if heap.len() > max_holes { - heap.pop(); // remove smallest hole + // Determine N largest holes where N is number of compacted layers. The vec is sorted by key range start. + // + // A hole is a key range for which this compaction doesn't have any WAL records. + // Our goal in this compaction iteration is to avoid creating L1s that, in terms of their key range, + // cover the hole, but actually don't contain any WAL records for that key range. + // The reason is that the mere stack of L1s (`count_deltas`) triggers image layer creation (`create_image_layers`). + // That image layer creation would be useless for a hole range covered by L1s that don't contain any WAL records. + // + // The algorithm chooses holes as follows. + // - Slide a 2-window over the keys in key orde to get the hole range (=distance between two keys). + // - Filter: min threshold on range length + // - Rank: by coverage size (=number of image layers required to reconstruct each key in the range for which we have any data) + // + // For more details, intuition, and some ASCII art see https://github.com/neondatabase/neon/pull/3597#discussion_r1112704451 + #[derive(PartialEq, Eq)] + struct Hole { + key_range: Range, + coverage_size: usize, + } + let holes: Vec = { + use std::cmp::Ordering; + impl Ord for Hole { + fn cmp(&self, other: &Self) -> Ordering { + self.coverage_size.cmp(&other.coverage_size).reverse() + } + } + impl PartialOrd for Hole { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } + } + let max_holes = deltas_to_compact.len(); + let last_record_lsn = self.get_last_record_lsn(); + let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128; + let min_hole_coverage_size = 3; // TODO: something more flexible? + // min-heap (reserve space for one more element added before eviction) + let mut heap: BinaryHeap = BinaryHeap::with_capacity(max_holes + 1); + let mut prev: Option = None; + + for &DeltaEntry { key: next_key, .. } in all_keys.iter() { + if let Some(prev_key) = prev { + // just first fast filter, do not create hole entries for metadata keys. The last hole in the + // compaction is the gap between data key and metadata keys. + if next_key.to_i128() - prev_key.to_i128() >= min_hole_range + && !Key::is_metadata_key(&prev_key) + { + let key_range = prev_key..next_key; + // Measuring hole by just subtraction of i128 representation of key range boundaries + // has not so much sense, because largest holes will corresponds field1/field2 changes. + // But we are mostly interested to eliminate holes which cause generation of excessive image layers. + // That is why it is better to measure size of hole as number of covering image layers. + let coverage_size = + layers.image_coverage(&key_range, last_record_lsn).len(); + if coverage_size >= min_hole_coverage_size { + heap.push(Hole { + key_range, + coverage_size, + }); + if heap.len() > max_holes { + heap.pop(); // remove smallest hole + } } } } + prev = Some(next_key.next()); } - prev = Some(next_key.next()); - } + let mut holes = heap.into_vec(); + holes.sort_unstable_by_key(|hole| hole.key_range.start); + holes + }; stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now(); drop_rlock(guard); stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now(); - let mut holes = heap.into_vec(); - holes.sort_unstable_by_key(|hole| hole.key_range.start); - let mut next_hole = 0; // index of next hole in holes vector // This iterator walks through all key-value pairs from all the layers // we're compacting, in key, LSN order. - let all_values_iter = all_keys.iter(); + // If there's both a Value::Image and Value::WalRecord for the same (key,lsn), + // then the Value::Image is ordered before Value::WalRecord. + // + // TODO(https://github.com/neondatabase/neon/issues/8184): remove the page cached blob_io + // option and validation code once we've reached confidence. + enum AllValuesIter<'a> { + PageCachedBlobIo { + all_keys_iter: VecIter<'a>, + }, + StreamingKmergeBypassingPageCache { + merge_iter: MergeIterator<'a>, + }, + ValidatingStreamingKmergeBypassingPageCache { + mode: CompactL0BypassPageCacheValidation, + merge_iter: MergeIterator<'a>, + all_keys_iter: VecIter<'a>, + }, + } + type VecIter<'a> = std::slice::Iter<'a, DeltaEntry<'a>>; // TODO: distinguished lifetimes + impl AllValuesIter<'_> { + async fn next_all_keys_iter( + iter: &mut VecIter<'_>, + ctx: &RequestContext, + ) -> anyhow::Result> { + let Some(DeltaEntry { + key, + lsn, + val: value_ref, + .. + }) = iter.next() + else { + return Ok(None); + }; + let value = value_ref.load(ctx).await?; + Ok(Some((*key, *lsn, value))) + } + async fn next( + &mut self, + ctx: &RequestContext, + ) -> anyhow::Result> { + match self { + AllValuesIter::PageCachedBlobIo { all_keys_iter: iter } => { + Self::next_all_keys_iter(iter, ctx).await + } + AllValuesIter::StreamingKmergeBypassingPageCache { merge_iter } => merge_iter.next().await, + AllValuesIter::ValidatingStreamingKmergeBypassingPageCache { mode, merge_iter, all_keys_iter } => async { + // advance both iterators + let all_keys_iter_item = Self::next_all_keys_iter(all_keys_iter, ctx).await; + let merge_iter_item = merge_iter.next().await; + // compare results & log warnings as needed + macro_rules! rate_limited_warn { + ($($arg:tt)*) => {{ + if cfg!(debug_assertions) || cfg!(feature = "testing") { + warn!($($arg)*); + panic!("CompactL0BypassPageCacheValidation failure, check logs"); + } + use once_cell::sync::Lazy; + use utils::rate_limit::RateLimit; + use std::sync::Mutex; + use std::time::Duration; + static LOGGED: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut rate_limit = LOGGED.lock().unwrap(); + rate_limit.call(|| { + warn!($($arg)*); + }); + }} + } + match (&all_keys_iter_item, &merge_iter_item) { + (Err(_), Err(_)) => { + // don't bother asserting equivality of the errors + } + (Err(all_keys), Ok(merge)) => { + rate_limited_warn!(?merge, "all_keys_iter returned an error where merge did not: {all_keys:?}"); + }, + (Ok(all_keys), Err(merge)) => { + rate_limited_warn!(?all_keys, "merge returned an error where all_keys_iter did not: {merge:?}"); + }, + (Ok(None), Ok(None)) => { } + (Ok(Some(all_keys)), Ok(None)) => { + rate_limited_warn!(?all_keys, "merge returned None where all_keys_iter returned Some"); + } + (Ok(None), Ok(Some(merge))) => { + rate_limited_warn!(?merge, "all_keys_iter returned None where merge returned Some"); + } + (Ok(Some((all_keys_key, all_keys_lsn, all_keys_value))), Ok(Some((merge_key, merge_lsn, merge_value)))) => { + match mode { + // TODO: in this mode, we still load the value from disk for both iterators, even though we only need the all_keys_iter one + CompactL0BypassPageCacheValidation::KeyLsn => { + let all_keys = (all_keys_key, all_keys_lsn); + let merge = (merge_key, merge_lsn); + if all_keys != merge { + rate_limited_warn!(?all_keys, ?merge, "merge returned a different (Key,LSN) than all_keys_iter"); + } + } + CompactL0BypassPageCacheValidation::KeyLsnValue => { + let all_keys = (all_keys_key, all_keys_lsn, all_keys_value); + let merge = (merge_key, merge_lsn, merge_value); + if all_keys != merge { + rate_limited_warn!(?all_keys, ?merge, "merge returned a different (Key,LSN,Value) than all_keys_iter"); + } + } + } + } + } + // in case of mismatch, trust the legacy all_keys_iter_item + all_keys_iter_item + }.instrument(info_span!("next")).await + } + } + } + let mut all_values_iter = match &self.conf.compact_level0_phase1_value_access { + CompactL0Phase1ValueAccess::PageCachedBlobIo => AllValuesIter::PageCachedBlobIo { + all_keys_iter: all_keys.iter(), + }, + CompactL0Phase1ValueAccess::StreamingKmerge { validate } => { + let merge_iter = { + let mut deltas = Vec::with_capacity(deltas_to_compact.len()); + for l in deltas_to_compact.iter() { + let l = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?; + deltas.push(l); + } + MergeIterator::create(&deltas, &[], ctx) + }; + match validate { + None => AllValuesIter::StreamingKmergeBypassingPageCache { merge_iter }, + Some(validate) => AllValuesIter::ValidatingStreamingKmergeBypassingPageCache { + mode: validate.clone(), + merge_iter, + all_keys_iter: all_keys.iter(), + }, + } + } + }; // This iterator walks through all keys and is needed to calculate size used by each key let mut all_keys_iter = all_keys @@ -701,12 +1050,13 @@ impl Timeline { let mut key_values_total_size = 0u64; let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key + let mut next_hole = 0; // index of next hole in holes vector - for &DeltaEntry { - key, lsn, ref val, .. - } in all_values_iter + while let Some((key, lsn, value)) = all_values_iter + .next(ctx) + .await + .map_err(CompactionError::Other)? { - let value = val.load(ctx).await?; let same_key = prev_key.map_or(false, |prev_key| prev_key == key); // We need to check key boundaries once we reach next key or end of layer with the same key if !same_key || lsn == dup_end_lsn { @@ -758,13 +1108,16 @@ impl Timeline { || contains_hole { // ... if so, flush previous layer and prepare to write new one - new_layers.push( - writer - .take() - .unwrap() - .finish(prev_key.unwrap().next(), self, ctx) - .await?, - ); + let (desc, path) = writer + .take() + .unwrap() + .finish(prev_key.unwrap().next(), ctx) + .await + .map_err(CompactionError::Other)?; + let new_delta = Layer::finish_creating(self.conf, self, desc, &path) + .map_err(CompactionError::Other)?; + + new_layers.push(new_delta); writer = None; if contains_hole { @@ -801,7 +1154,8 @@ impl Timeline { }, ctx, ) - .await?, + .await + .map_err(CompactionError::Other)?, ); } @@ -809,7 +1163,8 @@ impl Timeline { .as_mut() .unwrap() .put_value(key, lsn, value, ctx) - .await?; + .await + .map_err(CompactionError::Other)?; } else { debug!( "Dropping key {} during compaction (it belongs on shard {:?})", @@ -825,7 +1180,13 @@ impl Timeline { prev_key = Some(key); } if let Some(writer) = writer { - new_layers.push(writer.finish(prev_key.unwrap().next(), self, ctx).await?); + let (desc, path) = writer + .finish(prev_key.unwrap().next(), ctx) + .await + .map_err(CompactionError::Other)?; + let new_delta = Layer::finish_creating(self.conf, self, desc, &path) + .map_err(CompactionError::Other)?; + new_layers.push(new_delta); } // Sync layers @@ -883,12 +1244,17 @@ impl Timeline { } } + // Without this, rustc complains about deltas_to_compact still + // being borrowed when we `.into_iter()` below. + drop(all_values_iter); + Ok(CompactLevel0Phase1Result { new_layers, deltas_to_compact: deltas_to_compact .into_iter() .map(|x| x.drop_eviction_guard()) .collect::>(), + fully_compacted, }) } } @@ -897,6 +1263,9 @@ impl Timeline { struct CompactLevel0Phase1Result { new_layers: Vec, deltas_to_compact: Vec, + // Whether we have included all L0 layers, or selected only part of them due to the + // L0 compaction size limit. + fully_compacted: bool, } #[derive(Default)] @@ -986,6 +1355,43 @@ impl TryFrom for CompactLevel0Phase1Stats { } } +#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)] +#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] +pub enum CompactL0Phase1ValueAccess { + /// The old way. + PageCachedBlobIo, + /// The new way. + StreamingKmerge { + /// If set, we run both the old way and the new way, validate that + /// they are identical (=> [`CompactL0BypassPageCacheValidation`]), + /// and if the validation fails, + /// - in tests: fail them with a panic or + /// - in prod, log a rate-limited warning and use the old way's results. + /// + /// If not set, we only run the new way and trust its results. + validate: Option, + }, +} + +/// See [`CompactL0Phase1ValueAccess::StreamingKmerge`]. +#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)] +#[serde(rename_all = "kebab-case")] +pub enum CompactL0BypassPageCacheValidation { + /// Validate that the series of (key, lsn) pairs are the same. + KeyLsn, + /// Validate that the entire output of old and new way is identical. + KeyLsnValue, +} + +impl Default for CompactL0Phase1ValueAccess { + fn default() -> Self { + CompactL0Phase1ValueAccess::StreamingKmerge { + // TODO(https://github.com/neondatabase/neon/issues/8184): change to None once confident + validate: Some(CompactL0BypassPageCacheValidation::KeyLsnValue), + } + } +} + impl Timeline { /// Entry point for new tiered compaction algorithm. /// @@ -1005,10 +1411,9 @@ impl Timeline { // Find the top of the historical layers let end_lsn = { let guard = self.layers.read().await; - let layers = guard.layer_map(); + let layers = guard.layer_map()?; - let l0_deltas = layers.get_level0_deltas()?; - drop(guard); + let l0_deltas = layers.level0_deltas(); // As an optimization, if we find that there are too few L0 layers, // bail out early. We know that the compaction algorithm would do @@ -1037,7 +1442,9 @@ impl Timeline { fanout, ctx, ) - .await?; + .await + // TODO: compact_tiered needs to return CompactionError + .map_err(CompactionError::Other)?; adaptor.flush_updates().await?; Ok(()) @@ -1067,28 +1474,30 @@ impl Timeline { pub(crate) async fn generate_key_retention( self: &Arc, key: Key, - history: &[(Key, Lsn, Value)], + full_history: &[(Key, Lsn, Value)], horizon: Lsn, retain_lsn_below_horizon: &[Lsn], delta_threshold_cnt: usize, + base_img_from_ancestor: Option<(Key, Lsn, Bytes)>, ) -> anyhow::Result { // Pre-checks for the invariants if cfg!(debug_assertions) { - for (log_key, _, _) in history { + for (log_key, _, _) in full_history { assert_eq!(log_key, &key, "mismatched key"); } - for i in 1..history.len() { - assert!(history[i - 1].1 <= history[i].1, "unordered LSN"); - if history[i - 1].1 == history[i].1 { + for i in 1..full_history.len() { + assert!(full_history[i - 1].1 <= full_history[i].1, "unordered LSN"); + if full_history[i - 1].1 == full_history[i].1 { assert!( - matches!(history[i - 1].2, Value::Image(_)), + matches!(full_history[i - 1].2, Value::Image(_)), "unordered delta/image, or duplicated delta" ); } } - if let Value::WalRecord(rec) = &history[0].2 { - assert!(rec.will_init(), "no base image"); - } + // There was an assertion for no base image that checks if the first + // record in the history is `will_init` before, but it was removed. + // This is explained in the test cases for generate_key_retention. + // Search "incomplete history" for more information. for lsn in retain_lsn_below_horizon { assert!(lsn < &horizon, "retain lsn must be below horizon") } @@ -1099,6 +1508,7 @@ impl Timeline { ); } } + let has_ancestor = base_img_from_ancestor.is_some(); // Step 1: split history into len(retain_lsn_below_horizon) + 2 buckets, where the last bucket is for all deltas above the horizon, // and the second-to-last bucket is for the horizon. Each bucket contains lsn_last_bucket < deltas <= lsn_this_bucket. let (mut split_history, lsn_split_points) = { @@ -1110,7 +1520,7 @@ impl Timeline { } lsn_split_points.push(horizon); let mut current_idx = 0; - for item @ (_, lsn, _) in history { + for item @ (_, lsn, _) in full_history { while current_idx < lsn_split_points.len() && *lsn > lsn_split_points[current_idx] { current_idx += 1; } @@ -1132,6 +1542,9 @@ impl Timeline { // For example, we have delta layer key1@0x10, key1@0x20, and image layer key1@0x10, we will // keep the image for key1@0x10 and the delta for key1@0x20. key1@0x10 delta will be simply // dropped. + // + // TODO: in case we have both delta + images for a given LSN and it does not exceed the delta + // threshold, we could have kept delta instead to save space. This is an optimization for the future. continue; } } @@ -1149,9 +1562,75 @@ impl Timeline { "should have at least below + above horizon batches" ); let mut replay_history: Vec<(Key, Lsn, Value)> = Vec::new(); + if let Some((key, lsn, img)) = base_img_from_ancestor { + replay_history.push((key, lsn, Value::Image(img))); + } + + /// Generate debug information for the replay history + fn generate_history_trace(replay_history: &[(Key, Lsn, Value)]) -> String { + use std::fmt::Write; + let mut output = String::new(); + if let Some((key, _, _)) = replay_history.first() { + write!(output, "key={} ", key).unwrap(); + let mut cnt = 0; + for (_, lsn, val) in replay_history { + if val.is_image() { + write!(output, "i@{} ", lsn).unwrap(); + } else if val.will_init() { + write!(output, "di@{} ", lsn).unwrap(); + } else { + write!(output, "d@{} ", lsn).unwrap(); + } + cnt += 1; + if cnt >= 128 { + write!(output, "... and more").unwrap(); + break; + } + } + } else { + write!(output, "").unwrap(); + } + output + } + + fn generate_debug_trace( + replay_history: Option<&[(Key, Lsn, Value)]>, + full_history: &[(Key, Lsn, Value)], + lsns: &[Lsn], + horizon: Lsn, + ) -> String { + use std::fmt::Write; + let mut output = String::new(); + if let Some(replay_history) = replay_history { + writeln!( + output, + "replay_history: {}", + generate_history_trace(replay_history) + ) + .unwrap(); + } else { + writeln!(output, "replay_history: ",).unwrap(); + } + writeln!( + output, + "full_history: {}", + generate_history_trace(full_history) + ) + .unwrap(); + writeln!( + output, + "when processing: [{}] horizon={}", + lsns.iter().map(|l| format!("{l}")).join(","), + horizon + ) + .unwrap(); + output + } + for (i, split_for_lsn) in split_history.into_iter().enumerate() { + // TODO: there could be image keys inside the splits, and we can compute records_since_last_image accordingly. records_since_last_image += split_for_lsn.len(); - let generate_image = if i == 0 { + let generate_image = if i == 0 && !has_ancestor { // We always generate images for the first batch (below horizon / lowest retain_lsn) true } else if i == batch_cnt - 1 { @@ -1164,9 +1643,6 @@ impl Timeline { false }; replay_history.extend(split_for_lsn.iter().map(|x| (*x).clone())); - if let Some((_, _, val)) = replay_history.first() { - assert!(val.will_init(), "invalid history, no base image"); - } // Only retain the items after the last image record for idx in (0..replay_history.len()).rev() { if replay_history[idx].2.will_init() { @@ -1174,8 +1650,28 @@ impl Timeline { break; } } + if let Some((_, _, val)) = replay_history.first() { + if !val.will_init() { + return Err(anyhow::anyhow!("invalid history, no base image")).with_context( + || { + generate_debug_trace( + Some(&replay_history), + full_history, + retain_lsn_below_horizon, + horizon, + ) + }, + ); + } + } if generate_image && records_since_last_image > 0 { records_since_last_image = 0; + let replay_history_for_debug = if cfg!(debug_assertions) { + Some(replay_history.clone()) + } else { + None + }; + let replay_history_for_debug_ref = replay_history_for_debug.as_deref(); let history = std::mem::take(&mut replay_history); let mut img = None; let mut records = Vec::with_capacity(history.len()); @@ -1183,14 +1679,30 @@ impl Timeline { img = Some((*lsn, val.clone())); for (_, lsn, val) in history.into_iter().skip(1) { let Value::WalRecord(rec) = val else { - panic!("invalid record") + return Err(anyhow::anyhow!( + "invalid record, first record is image, expect walrecords" + )) + .with_context(|| { + generate_debug_trace( + replay_history_for_debug_ref, + full_history, + retain_lsn_below_horizon, + horizon, + ) + }); }; records.push((lsn, rec)); } } else { for (_, lsn, val) in history.into_iter() { let Value::WalRecord(rec) = val else { - panic!("invalid record") + return Err(anyhow::anyhow!("invalid record, first record is walrecord, expect rest are walrecord")) + .with_context(|| generate_debug_trace( + replay_history_for_debug_ref, + full_history, + retain_lsn_below_horizon, + horizon, + )); }; records.push((lsn, rec)); } @@ -1202,12 +1714,11 @@ impl Timeline { replay_history.push((key, request_lsn, Value::Image(img.clone()))); retention.push(vec![(request_lsn, Value::Image(img))]); } else { - retention.push( - split_for_lsn - .iter() - .map(|(_, lsn, value)| (*lsn, value.clone())) - .collect(), - ); + let deltas = split_for_lsn + .iter() + .map(|(_, lsn, value)| (*lsn, value.clone())) + .collect_vec(); + retention.push(deltas); } } let mut result = Vec::with_capacity(retention.len()); @@ -1222,7 +1733,7 @@ impl Timeline { result.push((lsn_split_points[idx], KeyLogAtLsn(logs))); } } - unreachable!() + unreachable!("key retention is empty") } /// An experimental compaction building block that combines compaction with garbage collection. @@ -1233,24 +1744,48 @@ impl Timeline { /// and create delta layers with all deltas >= gc horizon. pub(crate) async fn compact_with_gc( self: &Arc, - _cancel: &CancellationToken, + cancel: &CancellationToken, + flags: EnumSet, ctx: &RequestContext, - ) -> Result<(), CompactionError> { + ) -> anyhow::Result<()> { use std::collections::BTreeSet; - info!("running enhanced gc bottom-most compaction"); + // Block other compaction/GC tasks from running for now. GC-compaction could run along + // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc. + // Note that we already acquired the compaction lock when the outer `compact` function gets called. + + let gc_lock = async { + tokio::select! { + guard = self.gc_lock.lock() => Ok(guard), + // TODO: refactor to CompactionError to correctly pass cancelled error + _ = cancel.cancelled() => Err(anyhow!("cancelled")), + } + }; + + let gc_lock = crate::timed( + gc_lock, + "acquires gc lock", + std::time::Duration::from_secs(5), + ) + .await?; + + let dry_run = flags.contains(CompactFlags::DryRun); + + info!("running enhanced gc bottom-most compaction, dry_run={dry_run}"); scopeguard::defer! { info!("done enhanced gc bottom-most compaction"); }; + let mut stat = CompactionStatistics::default(); + // Step 0: pick all delta layers + image layers below/intersect with the GC horizon. // The layer selection has the following properties: // 1. If a layer is in the selection, all layers below it are in the selection. // 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection. let (layer_selection, gc_cutoff, retain_lsns_below_horizon) = { let guard = self.layers.read().await; - let layers = guard.layer_map(); + let layers = guard.layer_map()?; let gc_info = self.gc_info.read().unwrap(); let mut retain_lsns_below_horizon = Vec::new(); let gc_cutoff = gc_info.cutoffs.select_min(); @@ -1274,20 +1809,25 @@ impl Timeline { retain_lsns_below_horizon.sort(); (selected_layers, gc_cutoff, retain_lsns_below_horizon) }; - let lowest_retain_lsn = retain_lsns_below_horizon - .first() - .copied() - .unwrap_or(gc_cutoff); - if cfg!(debug_assertions) { - assert_eq!( - lowest_retain_lsn, - retain_lsns_below_horizon - .iter() - .min() - .copied() - .unwrap_or(gc_cutoff) - ); - } + let lowest_retain_lsn = if self.ancestor_timeline.is_some() { + Lsn(self.ancestor_lsn.0 + 1) + } else { + let res = retain_lsns_below_horizon + .first() + .copied() + .unwrap_or(gc_cutoff); + if cfg!(debug_assertions) { + assert_eq!( + res, + retain_lsns_below_horizon + .iter() + .min() + .copied() + .unwrap_or(gc_cutoff) + ); + } + res + }; info!( "picked {} layers for compaction with gc_cutoff={} lowest_retain_lsn={}", layer_selection.len(), @@ -1309,6 +1849,9 @@ impl Timeline { let key_range = desc.get_key_range(); delta_split_points.insert(key_range.start); delta_split_points.insert(key_range.end); + stat.visit_delta_layer(desc.file_size()); + } else { + stat.visit_image_layer(desc.file_size()); } } let mut delta_layers = Vec::new(); @@ -1328,6 +1871,14 @@ impl Timeline { let mut accumulated_values = Vec::new(); let mut last_key: Option = None; + enum FlushDeltaResult { + /// Create a new resident layer + CreateResidentLayer(ResidentLayer), + /// Keep an original delta layer + KeepLayer(PersistentLayerKey), + } + + #[allow(clippy::too_many_arguments)] async fn flush_deltas( deltas: &mut Vec<(Key, Lsn, crate::repository::Value)>, last_key: Key, @@ -1336,7 +1887,10 @@ impl Timeline { tline: &Arc, lowest_retain_lsn: Lsn, ctx: &RequestContext, - ) -> anyhow::Result> { + stats: &mut CompactionStatistics, + dry_run: bool, + last_batch: bool, + ) -> anyhow::Result> { // Check if we need to split the delta layer. We split at the original delta layer boundary to avoid // overlapping layers. // @@ -1356,46 +1910,176 @@ impl Timeline { *current_delta_split_point += 1; need_split = true; } - if !need_split { + if !need_split && !last_batch { return Ok(None); } - let deltas = std::mem::take(deltas); + let deltas: Vec<(Key, Lsn, Value)> = std::mem::take(deltas); if deltas.is_empty() { return Ok(None); } let end_lsn = deltas.iter().map(|(_, lsn, _)| lsn).max().copied().unwrap() + 1; + let delta_key = PersistentLayerKey { + key_range: { + let key_start = deltas.first().unwrap().0; + let key_end = deltas.last().unwrap().0.next(); + key_start..key_end + }, + lsn_range: lowest_retain_lsn..end_lsn, + is_delta: true, + }; + { + // Hack: skip delta layer if we need to produce a layer of a same key-lsn. + // + // This can happen if we have removed some deltas in "the middle" of some existing layer's key-lsn-range. + // For example, consider the case where a single delta with range [0x10,0x50) exists. + // And we have branches at LSN 0x10, 0x20, 0x30. + // Then we delete branch @ 0x20. + // Bottom-most compaction may now delete the delta [0x20,0x30). + // And that wouldnt' change the shape of the layer. + // + // Note that bottom-most-gc-compaction never _adds_ new data in that case, only removes. + // That's why it's safe to skip. + let guard = tline.layers.read().await; + + if guard.contains_key(&delta_key) { + let layer_generation = guard.get_from_key(&delta_key).metadata().generation; + drop(guard); + if layer_generation == tline.generation { + stats.discard_delta_layer(); + // TODO: depending on whether we design this compaction process to run along with + // other compactions, there could be layer map modifications after we drop the + // layer guard, and in case it creates duplicated layer key, we will still error + // in the end. + info!( + key=%delta_key, + ?layer_generation, + "discard delta layer due to duplicated layer in the same generation" + ); + return Ok(Some(FlushDeltaResult::KeepLayer(delta_key))); + } + } + } + let mut delta_layer_writer = DeltaLayerWriter::new( tline.conf, tline.timeline_id, tline.tenant_shard_id, - deltas.first().unwrap().0, + delta_key.key_range.start, lowest_retain_lsn..end_lsn, ctx, ) .await?; - let key_end = deltas.last().unwrap().0.next(); for (key, lsn, val) in deltas { delta_layer_writer.put_value(key, lsn, val, ctx).await?; } - let delta_layer = delta_layer_writer.finish(key_end, tline, ctx).await?; - Ok(Some(delta_layer)) + + stats.produce_delta_layer(delta_layer_writer.size()); + if dry_run { + return Ok(None); + } + + let (desc, path) = delta_layer_writer + .finish(delta_key.key_range.end, ctx) + .await?; + let delta_layer = Layer::finish_creating(tline.conf, tline, desc, &path)?; + Ok(Some(FlushDeltaResult::CreateResidentLayer(delta_layer))) } - let mut image_layer_writer = ImageLayerWriter::new( - self.conf, - self.timeline_id, - self.tenant_shard_id, - &(Key::MIN..Key::MAX), // covers the full key range - lowest_retain_lsn, - ctx, - ) - .await?; + // Hack the key range to be min..(max-1). Otherwise, the image layer will be + // interpreted as an L0 delta layer. + let hack_image_layer_range = { + let mut end_key = Key::MAX; + end_key.field6 -= 1; + Key::MIN..end_key + }; + + // Only create image layers when there is no ancestor branches. TODO: create covering image layer + // when some condition meet. + let mut image_layer_writer = if self.ancestor_timeline.is_none() { + Some( + ImageLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + &hack_image_layer_range, // covers the full key range + lowest_retain_lsn, + ctx, + ) + .await?, + ) + } else { + None + }; + + /// Returns None if there is no ancestor branch. Throw an error when the key is not found. + /// + /// Currently, we always get the ancestor image for each key in the child branch no matter whether the image + /// is needed for reconstruction. This should be fixed in the future. + /// + /// Furthermore, we should do vectored get instead of a single get, or better, use k-merge for ancestor + /// images. + async fn get_ancestor_image( + tline: &Arc, + key: Key, + ctx: &RequestContext, + ) -> anyhow::Result> { + if tline.ancestor_timeline.is_none() { + return Ok(None); + }; + // This function is implemented as a get of the current timeline at ancestor LSN, therefore reusing + // as much existing code as possible. + let img = tline.get(key, tline.ancestor_lsn, ctx).await?; + Ok(Some((key, tline.ancestor_lsn, img))) + } + let image_layer_key = PersistentLayerKey { + key_range: hack_image_layer_range, + lsn_range: PersistentLayerDesc::image_layer_lsn_range(lowest_retain_lsn), + is_delta: false, + }; + + // Like with delta layers, it can happen that we re-produce an already existing image layer. + // This could happen when a user triggers force compaction and image generation. In this case, + // it's always safe to rewrite the layer. + let discard_image_layer = { + let guard = self.layers.read().await; + if guard.contains_key(&image_layer_key) { + let layer_generation = guard.get_from_key(&image_layer_key).metadata().generation; + drop(guard); + if layer_generation == self.generation { + // TODO: depending on whether we design this compaction process to run along with + // other compactions, there could be layer map modifications after we drop the + // layer guard, and in case it creates duplicated layer key, we will still error + // in the end. + info!( + key=%image_layer_key, + ?layer_generation, + "discard image layer due to duplicated layer key in the same generation", + ); + true + } else { + false + } + } else { + false + } + }; + + // Actually, we can decide not to write to the image layer at all at this point because + // the key and LSN range are determined. However, to keep things simple here, we still + // create this writer, and discard the writer in the end. let mut delta_values = Vec::new(); let delta_split_points = delta_split_points.into_iter().collect_vec(); let mut current_delta_split_point = 0; let mut delta_layers = Vec::new(); while let Some((key, lsn, val)) = merge_iter.next().await? { + if cancel.is_cancelled() { + return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error + } + match val { + Value::Image(_) => stat.visit_image_key(&val), + Value::WalRecord(_) => stat.visit_wal_key(&val), + } if last_key.is_none() || last_key.as_ref() == Some(&key) { if last_key.is_none() { last_key = Some(key); @@ -1403,6 +2087,7 @@ impl Timeline { accumulated_values.push((key, lsn, val)); } else { let last_key = last_key.as_mut().unwrap(); + stat.on_unique_key_visited(); let retention = self .generate_key_retention( *last_key, @@ -1410,11 +2095,18 @@ impl Timeline { gc_cutoff, &retain_lsns_below_horizon, COMPACTION_DELTA_THRESHOLD, + get_ancestor_image(self, *last_key, ctx).await?, ) .await?; // Put the image into the image layer. Currently we have a single big layer for the compaction. retention - .pipe_to(*last_key, &mut delta_values, &mut image_layer_writer, ctx) + .pipe_to( + *last_key, + &mut delta_values, + image_layer_writer.as_mut(), + &mut stat, + ctx, + ) .await?; delta_layers.extend( flush_deltas( @@ -1425,6 +2117,9 @@ impl Timeline { self, lowest_retain_lsn, ctx, + &mut stat, + dry_run, + false, ) .await?, ); @@ -1436,6 +2131,7 @@ impl Timeline { let last_key = last_key.expect("no keys produced during compaction"); // TODO: move this part to the loop body + stat.on_unique_key_visited(); let retention = self .generate_key_retention( last_key, @@ -1443,11 +2139,18 @@ impl Timeline { gc_cutoff, &retain_lsns_below_horizon, COMPACTION_DELTA_THRESHOLD, + get_ancestor_image(self, last_key, ctx).await?, ) .await?; // Put the image into the image layer. Currently we have a single big layer for the compaction. retention - .pipe_to(last_key, &mut delta_values, &mut image_layer_writer, ctx) + .pipe_to( + last_key, + &mut delta_values, + image_layer_writer.as_mut(), + &mut stat, + ctx, + ) .await?; delta_layers.extend( flush_deltas( @@ -1458,27 +2161,73 @@ impl Timeline { self, lowest_retain_lsn, ctx, + &mut stat, + dry_run, + true, ) .await?, ); + assert!(delta_values.is_empty(), "unprocessed keys"); + + let image_layer = if discard_image_layer { + stat.discard_image_layer(); + None + } else if let Some(writer) = image_layer_writer { + stat.produce_image_layer(writer.size()); + if !dry_run { + Some(writer.finish(self, ctx).await?) + } else { + None + } + } else { + None + }; + + info!( + "gc-compaction statistics: {}", + serde_json::to_string(&stat)? + ); + + if dry_run { + return Ok(()); + } - let image_layer = image_layer_writer.finish(self, ctx).await?; info!( "produced {} delta layers and {} image layers", delta_layers.len(), - 1 + if image_layer.is_some() { 1 } else { 0 } ); let mut compact_to = Vec::new(); - compact_to.extend(delta_layers); - compact_to.push(image_layer); + let mut keep_layers = HashSet::new(); + for action in delta_layers { + match action { + FlushDeltaResult::CreateResidentLayer(layer) => { + compact_to.push(layer); + } + FlushDeltaResult::KeepLayer(l) => { + keep_layers.insert(l); + } + } + } + if discard_image_layer { + keep_layers.insert(image_layer_key); + } + let mut layer_selection = layer_selection; + layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key())); + compact_to.extend(image_layer); + // Step 3: Place back to the layer map. { let mut guard = self.layers.write().await; - guard.finish_gc_compaction(&layer_selection, &compact_to, &self.metrics) + guard + .open_mut()? + .finish_gc_compaction(&layer_selection, &compact_to, &self.metrics) }; - self.remote_client .schedule_compaction_update(&layer_selection, &compact_to)?; + + drop(gc_lock); + Ok(()) } } @@ -1504,7 +2253,7 @@ impl TimelineAdaptor { } } - pub async fn flush_updates(&mut self) -> anyhow::Result<()> { + pub async fn flush_updates(&mut self) -> Result<(), CompactionError> { let layers_to_delete = { let guard = self.timeline.layers.read().await; self.layers_to_delete @@ -1552,7 +2301,7 @@ impl CompactionJobExecutor for TimelineAdaptor { self.flush_updates().await?; let guard = self.timeline.layers.read().await; - let layer_map = guard.layer_map(); + let layer_map = guard.layer_map()?; let result = layer_map .iter_historic_layers() @@ -1675,9 +2424,9 @@ impl CompactionJobExecutor for TimelineAdaptor { )) }); - let new_delta_layer = writer - .finish(prev.unwrap().0.next(), &self.timeline, ctx) - .await?; + let (desc, path) = writer.finish(prev.unwrap().0.next(), ctx).await?; + let new_delta_layer = + Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?; self.new_deltas.push(new_delta_layer); Ok(()) diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index ab6a5f20ba..b03dbb092e 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -63,10 +63,19 @@ pub(super) async fn delete_local_timeline_directory( tenant_shard_id: TenantShardId, timeline: &Timeline, ) -> anyhow::Result<()> { - let guards = async { tokio::join!(timeline.gc_lock.lock(), timeline.compaction_lock.lock()) }; - let guards = crate::timed( - guards, - "acquire gc and compaction locks", + // Always ensure the lock order is compaction -> gc. + let compaction_lock = timeline.compaction_lock.lock(); + let compaction_lock = crate::timed( + compaction_lock, + "acquires compaction lock", + std::time::Duration::from_secs(5), + ) + .await; + + let gc_lock = timeline.gc_lock.lock(); + let gc_lock = crate::timed( + gc_lock, + "acquires gc lock", std::time::Duration::from_secs(5), ) .await; @@ -107,7 +116,8 @@ pub(super) async fn delete_local_timeline_directory( .context("fsync_pre_mark_remove")?; info!("finished deleting layer files, releasing locks"); - drop(guards); + drop(gc_lock); + drop(compaction_lock); fail::fail_point!("timeline-delete-after-rm", |_| { Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))? @@ -206,11 +216,10 @@ impl DeleteTimelineFlow { // NB: If this fails half-way through, and is retried, the retry will go through // all the same steps again. Make sure the code here is idempotent, and don't // error out if some of the shutdown tasks have already been completed! - #[instrument(skip_all, fields(%inplace))] + #[instrument(skip_all)] pub async fn run( tenant: &Arc, timeline_id: TimelineId, - inplace: bool, ) -> Result<(), DeleteTimelineError> { super::debug_assert_current_span_has_tenant_and_timeline_id(); @@ -221,6 +230,8 @@ impl DeleteTimelineFlow { // Now that the Timeline is in Stopping state, request all the related tasks to shut down. timeline.shutdown(super::ShutdownMode::Hard).await; + tenant.gc_block.before_delete(&timeline); + fail::fail_point!("timeline-delete-before-index-deleted-at", |_| { Err(anyhow::anyhow!( "failpoint: timeline-delete-before-index-deleted-at" @@ -235,11 +246,7 @@ impl DeleteTimelineFlow { ))? }); - if inplace { - Self::background(guard, tenant.conf, tenant, &timeline).await? - } else { - Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline); - } + Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline); Ok(()) } diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 49ce3db3e6..3b52adc77b 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -1,4 +1,4 @@ -use std::sync::Arc; +use std::{collections::HashSet, sync::Arc}; use super::{layer_manager::LayerManager, FlushLayerError, Timeline}; use crate::{ @@ -26,7 +26,7 @@ pub(crate) enum Error { #[error("flushing failed")] FlushAncestor(#[source] FlushLayerError), #[error("layer download failed")] - RewrittenDeltaDownloadFailed(#[source] anyhow::Error), + RewrittenDeltaDownloadFailed(#[source] crate::tenant::storage_layer::layer::DownloadError), #[error("copying LSN prefix locally failed")] CopyDeltaPrefix(#[source] anyhow::Error), #[error("upload rewritten layer")] @@ -74,6 +74,11 @@ impl From for Error { Error::ShuttingDown } } +impl From for Error { + fn from(_: super::layer_manager::Shutdown) -> Self { + Error::ShuttingDown + } +} impl From for Error { fn from(value: FlushLayerError) -> Self { @@ -141,50 +146,9 @@ pub(super) async fn prepare( } } - // detached has previously been detached; let's inspect each of the current timelines and - // report back the timelines which have been reparented by our detach - let mut all_direct_children = tenant - .timelines - .lock() - .unwrap() - .values() - .filter(|tl| matches!(tl.ancestor_timeline.as_ref(), Some(ancestor) if Arc::ptr_eq(ancestor, detached))) - .map(|tl| (tl.ancestor_lsn, tl.clone())) - .collect::>(); - - let mut any_shutdown = false; - - all_direct_children.retain( - |(_, tl)| match tl.remote_client.initialized_upload_queue() { - Ok(accessor) => accessor - .latest_uploaded_index_part() - .lineage - .is_reparented(), - Err(_shutdownalike) => { - // not 100% a shutdown, but let's bail early not to give inconsistent results in - // sharded enviroment. - any_shutdown = true; - true - } - }, - ); - - if any_shutdown { - // it could be one or many being deleted; have client retry - return Err(Error::ShuttingDown); - } - - let mut reparented = all_direct_children; - // why this instead of hashset? there is a reason, but I've forgotten it many times. - // - // maybe if this was a hashset we would not be able to distinguish some race condition. - reparented.sort_unstable_by_key(|(lsn, tl)| (*lsn, tl.timeline_id)); - + let reparented_timelines = reparented_direct_children(detached, tenant)?; return Ok(Progress::Done(AncestorDetached { - reparented_timelines: reparented - .into_iter() - .map(|(_, tl)| tl.timeline_id) - .collect(), + reparented_timelines, })); }; @@ -277,7 +241,7 @@ pub(super) async fn prepare( // between retries, these can change if compaction or gc ran in between. this will mean // we have to redo work. - partition_work(ancestor_lsn, &layers) + partition_work(ancestor_lsn, &layers)? }; // TODO: layers are already sorted by something: use that to determine how much of remote @@ -381,16 +345,67 @@ pub(super) async fn prepare( Ok(Progress::Prepared(guard, prepared)) } +fn reparented_direct_children( + detached: &Arc, + tenant: &Tenant, +) -> Result, Error> { + let mut all_direct_children = tenant + .timelines + .lock() + .unwrap() + .values() + .filter_map(|tl| { + let is_direct_child = matches!(tl.ancestor_timeline.as_ref(), Some(ancestor) if Arc::ptr_eq(ancestor, detached)); + + if is_direct_child { + Some(tl.clone()) + } else { + if let Some(timeline) = tl.ancestor_timeline.as_ref() { + assert_ne!(timeline.timeline_id, detached.timeline_id, "we cannot have two timelines with the same timeline_id live"); + } + None + } + }) + // Collect to avoid lock taking order problem with Tenant::timelines and + // Timeline::remote_client + .collect::>(); + + let mut any_shutdown = false; + + all_direct_children.retain(|tl| match tl.remote_client.initialized_upload_queue() { + Ok(accessor) => accessor + .latest_uploaded_index_part() + .lineage + .is_reparented(), + Err(_shutdownalike) => { + // not 100% a shutdown, but let's bail early not to give inconsistent results in + // sharded enviroment. + any_shutdown = true; + true + } + }); + + if any_shutdown { + // it could be one or many being deleted; have client retry + return Err(Error::ShuttingDown); + } + + Ok(all_direct_children + .into_iter() + .map(|tl| tl.timeline_id) + .collect()) +} + fn partition_work( ancestor_lsn: Lsn, - source_layermap: &LayerManager, -) -> (usize, Vec, Vec) { + source: &LayerManager, +) -> Result<(usize, Vec, Vec), Error> { let mut straddling_branchpoint = vec![]; let mut rest_of_historic = vec![]; let mut later_by_lsn = 0; - for desc in source_layermap.layer_map().iter_historic_layers() { + for desc in source.layer_map()?.iter_historic_layers() { // off by one chances here: // - start is inclusive // - end is exclusive @@ -409,10 +424,10 @@ fn partition_work( &mut rest_of_historic }; - target.push(source_layermap.get_from_desc(&desc)); + target.push(source.get_from_desc(&desc)); } - (later_by_lsn, straddling_branchpoint, rest_of_historic) + Ok((later_by_lsn, straddling_branchpoint, rest_of_historic)) } async fn upload_rewritten_layer( @@ -488,10 +503,12 @@ async fn copy_lsn_prefix( // reuse the key instead of adding more holes between layers by using the real // highest key in the layer. let reused_highest_key = layer.layer_desc().key_range.end; - let copied = writer - .finish(reused_highest_key, target_timeline, ctx) + let (desc, path) = writer + .finish(reused_highest_key, ctx) .await .map_err(CopyDeltaPrefix)?; + let copied = Layer::finish_creating(target_timeline.conf, target_timeline, desc, &path) + .map_err(CopyDeltaPrefix)?; tracing::debug!(%layer, %copied, "new layer produced"); @@ -537,11 +554,12 @@ pub(super) async fn complete( tenant: &Tenant, prepared: PreparedTimelineDetach, _ctx: &RequestContext, -) -> Result, anyhow::Error> { +) -> Result, anyhow::Error> { let PreparedTimelineDetach { layers } = prepared; let ancestor = detached - .get_ancestor_timeline() + .ancestor_timeline + .as_ref() .expect("must still have a ancestor"); let ancestor_lsn = detached.get_ancestor_lsn(); @@ -581,7 +599,7 @@ pub(super) async fn complete( } let tl_ancestor = tl.ancestor_timeline.as_ref()?; - let is_same = Arc::ptr_eq(&ancestor, tl_ancestor); + let is_same = Arc::ptr_eq(ancestor, tl_ancestor); let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn; let is_deleting = tl @@ -622,13 +640,18 @@ pub(super) async fn complete( }); let reparenting_candidates = tasks.len(); - let mut reparented = Vec::with_capacity(tasks.len()); + let mut reparented = HashSet::with_capacity(tasks.len()); while let Some(res) = tasks.join_next().await { match res { Ok(Some(timeline)) => { tracing::info!(reparented=%timeline.timeline_id, "reparenting done"); - reparented.push((timeline.ancestor_lsn, timeline.timeline_id)); + + assert!( + reparented.insert(timeline.timeline_id), + "duplicate reparenting? timeline_id={}", + timeline.timeline_id + ); } Ok(None) => { // lets just ignore this for now. one or all reparented timelines could had @@ -650,12 +673,5 @@ pub(super) async fn complete( tracing::info!("failed to reparent some candidates"); } - reparented.sort_unstable(); - - let reparented = reparented - .into_iter() - .map(|(_, timeline_id)| timeline_id) - .collect(); - Ok(reparented) } diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index fec66aabc1..07d860eb80 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -213,51 +213,45 @@ impl Timeline { let mut js = tokio::task::JoinSet::new(); { let guard = self.layers.read().await; - let layers = guard.layer_map(); - for layer in layers.iter_historic_layers() { - let layer = guard.get_from_desc(&layer); - // guard against eviction while we inspect it; it might be that eviction_task and - // disk_usage_eviction_task both select the same layers to be evicted, and - // seemingly free up double the space. both succeeding is of no consequence. + guard + .likely_resident_layers() + .filter(|layer| { + let last_activity_ts = layer.latest_activity(); - if !layer.is_likely_resident() { - continue; - } + let no_activity_for = match now.duration_since(last_activity_ts) { + Ok(d) => d, + Err(_e) => { + // We reach here if `now` < `last_activity_ts`, which can legitimately + // happen if there is an access between us getting `now`, and us getting + // the access stats from the layer. + // + // The other reason why it can happen is system clock skew because + // SystemTime::now() is not monotonic, so, even if there is no access + // to the layer after we get `now` at the beginning of this function, + // it could be that `now` < `last_activity_ts`. + // + // To distinguish the cases, we would need to record `Instant`s in the + // access stats (i.e., monotonic timestamps), but then, the timestamps + // values in the access stats would need to be `Instant`'s, and hence + // they would be meaningless outside of the pageserver process. + // At the time of writing, the trade-off is that access stats are more + // valuable than detecting clock skew. + return false; + } + }; - let last_activity_ts = layer.access_stats().latest_activity(); - - let no_activity_for = match now.duration_since(last_activity_ts) { - Ok(d) => d, - Err(_e) => { - // We reach here if `now` < `last_activity_ts`, which can legitimately - // happen if there is an access between us getting `now`, and us getting - // the access stats from the layer. - // - // The other reason why it can happen is system clock skew because - // SystemTime::now() is not monotonic, so, even if there is no access - // to the layer after we get `now` at the beginning of this function, - // it could be that `now` < `last_activity_ts`. - // - // To distinguish the cases, we would need to record `Instant`s in the - // access stats (i.e., monotonic timestamps), but then, the timestamps - // values in the access stats would need to be `Instant`'s, and hence - // they would be meaningless outside of the pageserver process. - // At the time of writing, the trade-off is that access stats are more - // valuable than detecting clock skew. - continue; - } - }; - - if no_activity_for > p.threshold { + no_activity_for > p.threshold + }) + .cloned() + .for_each(|layer| { js.spawn(async move { layer .evict_and_wait(std::time::Duration::from_secs(5)) .await }); stats.candidates += 1; - } - } + }); }; let join_all = async move { diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs new file mode 100644 index 0000000000..e82559b8b3 --- /dev/null +++ b/pageserver/src/tenant/timeline/handle.rs @@ -0,0 +1,967 @@ +//! An efficient way to keep the timeline gate open without preventing +//! timeline shutdown for longer than a single call to a timeline method. +//! +//! # Motivation +//! +//! On a single page service connection, we're typically serving a single TenantTimelineId. +//! +//! Without sharding, there is a single Timeline object to which we dispatch +//! all requests. For example, a getpage request gets dispatched to the +//! Timeline::get method of the Timeline object that represents the +//! (tenant,timeline) of that connection. +//! +//! With sharding, for each request that comes in on the connection, +//! we first have to perform shard routing based on the requested key (=~ page number). +//! The result of shard routing is a Timeline object. +//! We then dispatch the request to that Timeline object. +//! +//! Regardless of whether the tenant is sharded or not, we want to ensure that +//! we hold the Timeline gate open while we're invoking the method on the +//! Timeline object. +//! +//! However, we want to avoid the overhead of entering the gate for every +//! method invocation. +//! +//! Further, for shard routing, we want to avoid calling the tenant manager to +//! resolve the shard for every request. Instead, we want to cache the +//! routing result so we can bypass the tenant manager for all subsequent requests +//! that get routed to that shard. +//! +//! Regardless of how we accomplish the above, it should not +//! prevent the Timeline from shutting down promptly. +//! +//! # Design +//! +//! There are three user-facing data structures: +//! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime. +//! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime. +//! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`. +//! Lifetime: for a single request dispatch on the Timeline (i.e., one getpage request) +//! +//! The `Handle` is just a wrapper around an `Arc`. +//! +//! There is one long-lived `Arc`, which is stored in the `PerTimelineState`. +//! The `Cache` stores a `Weak` for each cached Timeline. +//! +//! To dispatch a request, the page service connection calls `Cache::get`. +//! +//! A cache miss means we consult the tenant manager for shard routing, +//! resulting in an `Arc`. We enter its gate _once_ and construct an +//! `Arc`. We store a `Weak` in the cache +//! and the `Arc` in the `PerTimelineState`. +//! +//! For subsequent requests, `Cache::get` will perform a "fast path" shard routing +//! and find the `Weak` in the cache. +//! We upgrade the `Weak` to an `Arc` and wrap it in the user-facing `Handle` type. +//! +//! The request handler dispatches the request to the right `>::$request_method`. +//! It then drops the `Handle`, which drops the `Arc`. +//! +//! # Memory Management / How The Reference Cycle Is Broken +//! +//! The attentive reader may have noticed the strong reference cycle +//! from `Arc` to `PerTimelineState` to `Arc`. +//! +//! This cycle is intentional: while it exists, the `Cache` can upgrade its +//! `Weak` to an `Arc` in a single atomic operation. +//! +//! The cycle is broken by either +//! - `PerTimelineState::shutdown` or +//! - dropping the `Cache`. +//! +//! Concurrently existing `Handle`s will extend the existence of the cycle. +//! However, since `Handle`s are short-lived and new `Handle`s are not +//! handed out after either `PerTimelineState::shutdown` or `Cache` drop, +//! that extension of the cycle is bounded. +//! +//! # Fast Path for Shard Routing +//! +//! The `Cache` has a fast path for shard routing to avoid calling into +//! the tenant manager for every request. +//! +//! The `Cache` maintains a hash map of `ShardTimelineId` to `Weak`. +//! +//! The current implementation uses the first entry in the hash map +//! to determine the `ShardParameters` and derive the correct +//! `ShardIndex` for the requested key. +//! +//! It then looks up the hash map for that `ShardTimelineId := {ShardIndex,TimelineId}`. +//! +//! If the lookup is successful and the `Weak` can be upgraded, +//! it's a hit. +//! +//! ## Cache invalidation +//! +//! The insight is that cache invalidation is sufficient and most efficiently done lazily. +//! The only reasons why an entry in the cache can become stale are: +//! 1. The `PerTimelineState` / Timeline is shutting down e.g. because the shard is +//! being detached, timeline or shard deleted, or pageserver is shutting down. +//! 2. We're doing a shard split and new traffic should be routed to the child shards. +//! +//! Regarding (1), we will eventually fail to upgrade the `Weak` once the +//! timeline has shut down, and when that happens, we remove the entry from the cache. +//! +//! Regarding (2), the insight is that it is toally fine to keep dispatching requests +//! to the parent shard during a shard split. Eventually, the shard split task will +//! shut down the parent => case (1). + +use std::collections::hash_map; +use std::collections::HashMap; +use std::sync::atomic::AtomicBool; +use std::sync::atomic::Ordering; +use std::sync::Arc; +use std::sync::Mutex; +use std::sync::Weak; + +use pageserver_api::shard::ShardIdentity; +use tracing::instrument; +use tracing::trace; +use utils::id::TimelineId; +use utils::shard::ShardIndex; +use utils::shard::ShardNumber; + +use crate::tenant::mgr::ShardSelector; + +/// The requirement for Debug is so that #[derive(Debug)] works in some places. +pub(crate) trait Types: Sized + std::fmt::Debug { + type TenantManagerError: Sized + std::fmt::Debug; + type TenantManager: TenantManager + Sized; + type Timeline: ArcTimeline + Sized; +} + +/// Uniquely identifies a [`Cache`] instance over the lifetime of the process. +/// Required so [`Cache::drop`] can take out the handles from the [`PerTimelineState`]. +/// Alternative to this would be to allocate [`Cache`] in a `Box` and identify it by the pointer. +#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] +struct CacheId(u64); + +impl CacheId { + fn next() -> Self { + static NEXT_ID: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1); + let id = NEXT_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + if id == 0 { + panic!("CacheId::new() returned 0, overflow"); + } + Self(id) + } +} + +/// See module-level comment. +pub(crate) struct Cache { + id: CacheId, + map: Map, +} + +type Map = HashMap>>; + +impl Default for Cache { + fn default() -> Self { + Self { + id: CacheId::next(), + map: Default::default(), + } + } +} + +#[derive(PartialEq, Eq, Debug, Hash, Clone, Copy)] +pub(crate) struct ShardTimelineId { + pub(crate) shard_index: ShardIndex, + pub(crate) timeline_id: TimelineId, +} + +/// See module-level comment. +pub(crate) struct Handle(Arc>); +struct HandleInner { + shut_down: AtomicBool, + timeline: T::Timeline, + // The timeline's gate held open. + _gate_guard: utils::sync::gate::GateGuard, +} + +/// Embedded in each [`Types::Timeline`] as the anchor for the only long-lived strong ref to `HandleInner`. +/// +/// See module-level comment for details. +pub struct PerTimelineState { + // None = shutting down + handles: Mutex>>>>, +} + +impl Default for PerTimelineState { + fn default() -> Self { + Self { + handles: Mutex::new(Some(Default::default())), + } + } +} + +/// Abstract view of [`crate::tenant::mgr`], for testability. +pub(crate) trait TenantManager { + /// Invoked by [`Cache::get`] to resolve a [`ShardTimelineId`] to a [`Types::Timeline`]. + /// Errors are returned as [`GetError::TenantManager`]. + async fn resolve( + &self, + timeline_id: TimelineId, + shard_selector: ShardSelector, + ) -> Result; +} + +/// Abstract view of an [`Arc`], for testability. +pub(crate) trait ArcTimeline: Clone { + fn gate(&self) -> &utils::sync::gate::Gate; + fn shard_timeline_id(&self) -> ShardTimelineId; + fn get_shard_identity(&self) -> &ShardIdentity; + fn per_timeline_state(&self) -> &PerTimelineState; +} + +/// Errors returned by [`Cache::get`]. +#[derive(Debug)] +pub(crate) enum GetError { + TenantManager(T::TenantManagerError), + TimelineGateClosed, + PerTimelineStateShutDown, +} + +/// Internal type used in [`Cache::get`]. +enum RoutingResult { + FastPath(Handle), + SlowPath(ShardTimelineId), + NeedConsultTenantManager, +} + +impl Cache { + /// See module-level comment for details. + /// + /// Does NOT check for the shutdown state of [`Types::Timeline`]. + /// Instead, the methods of [`Types::Timeline`] that are invoked through + /// the [`Handle`] are responsible for checking these conditions + /// and if so, return an error that causes the page service to + /// close the connection. + #[instrument(level = "trace", skip_all)] + pub(crate) async fn get( + &mut self, + timeline_id: TimelineId, + shard_selector: ShardSelector, + tenant_manager: &T::TenantManager, + ) -> Result, GetError> { + // terminates because each iteration removes an element from the map + loop { + let handle = self + .get_impl(timeline_id, shard_selector, tenant_manager) + .await?; + if handle.0.shut_down.load(Ordering::Relaxed) { + let removed = self + .map + .remove(&handle.0.timeline.shard_timeline_id()) + .expect("invariant of get_impl is that the returned handle is in the map"); + assert!( + Weak::ptr_eq(&removed, &Arc::downgrade(&handle.0)), + "shard_timeline_id() incorrect?" + ); + } else { + return Ok(handle); + } + } + } + + #[instrument(level = "trace", skip_all)] + async fn get_impl( + &mut self, + timeline_id: TimelineId, + shard_selector: ShardSelector, + tenant_manager: &T::TenantManager, + ) -> Result, GetError> { + let miss: ShardSelector = { + let routing_state = self.shard_routing(timeline_id, shard_selector); + match routing_state { + RoutingResult::FastPath(handle) => return Ok(handle), + RoutingResult::SlowPath(key) => match self.map.get(&key) { + Some(cached) => match cached.upgrade() { + Some(upgraded) => return Ok(Handle(upgraded)), + None => { + trace!("handle cache stale"); + self.map.remove(&key).unwrap(); + ShardSelector::Known(key.shard_index) + } + }, + None => ShardSelector::Known(key.shard_index), + }, + RoutingResult::NeedConsultTenantManager => shard_selector, + } + }; + self.get_miss(timeline_id, miss, tenant_manager).await + } + + #[inline(always)] + fn shard_routing( + &mut self, + timeline_id: TimelineId, + shard_selector: ShardSelector, + ) -> RoutingResult { + loop { + // terminates because when every iteration we remove an element from the map + let Some((first_key, first_handle)) = self.map.iter().next() else { + return RoutingResult::NeedConsultTenantManager; + }; + let Some(first_handle) = first_handle.upgrade() else { + // TODO: dedup with get() + trace!("handle cache stale"); + let first_key_owned = *first_key; + self.map.remove(&first_key_owned).unwrap(); + continue; + }; + + let first_handle_shard_identity = first_handle.timeline.get_shard_identity(); + let make_shard_index = |shard_num: ShardNumber| ShardIndex { + shard_number: shard_num, + shard_count: first_handle_shard_identity.count, + }; + + let need_idx = match shard_selector { + ShardSelector::Page(key) => { + make_shard_index(first_handle_shard_identity.get_shard_number(&key)) + } + ShardSelector::Zero => make_shard_index(ShardNumber(0)), + ShardSelector::Known(shard_idx) => shard_idx, + }; + let need_shard_timeline_id = ShardTimelineId { + shard_index: need_idx, + timeline_id, + }; + let first_handle_shard_timeline_id = ShardTimelineId { + shard_index: first_handle_shard_identity.shard_index(), + timeline_id: first_handle.timeline.shard_timeline_id().timeline_id, + }; + + if need_shard_timeline_id == first_handle_shard_timeline_id { + return RoutingResult::FastPath(Handle(first_handle)); + } else { + return RoutingResult::SlowPath(need_shard_timeline_id); + } + } + } + + #[instrument(level = "trace", skip_all)] + #[inline(always)] + async fn get_miss( + &mut self, + timeline_id: TimelineId, + shard_selector: ShardSelector, + tenant_manager: &T::TenantManager, + ) -> Result, GetError> { + match tenant_manager.resolve(timeline_id, shard_selector).await { + Ok(timeline) => { + let key = timeline.shard_timeline_id(); + match &shard_selector { + ShardSelector::Zero => assert_eq!(key.shard_index.shard_number, ShardNumber(0)), + ShardSelector::Page(_) => (), // gotta trust tenant_manager + ShardSelector::Known(idx) => assert_eq!(idx, &key.shard_index), + } + + let gate_guard = match timeline.gate().enter() { + Ok(guard) => guard, + Err(_) => { + return Err(GetError::TimelineGateClosed); + } + }; + trace!("creating new HandleInner"); + let handle = Arc::new( + // TODO: global metric that keeps track of the number of live HandlerTimeline instances + // so we can identify reference cycle bugs. + HandleInner { + shut_down: AtomicBool::new(false), + _gate_guard: gate_guard, + timeline: timeline.clone(), + }, + ); + let handle = { + let mut lock_guard = timeline + .per_timeline_state() + .handles + .lock() + .expect("mutex poisoned"); + match &mut *lock_guard { + Some(per_timeline_state) => { + let replaced = per_timeline_state.insert(self.id, Arc::clone(&handle)); + assert!(replaced.is_none(), "some earlier code left a stale handle"); + match self.map.entry(key) { + hash_map::Entry::Occupied(_o) => { + // This cannot not happen because + // 1. we're the _miss_ handle, i.e., `self.map` didn't contain an entry and + // 2. we were holding &mut self during .resolve().await above, so, no other thread can have inserted a handle + // while we were waiting for the tenant manager. + unreachable!() + } + hash_map::Entry::Vacant(v) => { + v.insert(Arc::downgrade(&handle)); + handle + } + } + } + None => { + return Err(GetError::PerTimelineStateShutDown); + } + } + }; + Ok(Handle(handle)) + } + Err(e) => Err(GetError::TenantManager(e)), + } + } +} + +impl PerTimelineState { + /// After this method returns, [`Cache::get`] will never again return a [`Handle`] + /// to the [`Types::Timeline`] that embeds this per-timeline state. + /// Even if [`TenantManager::resolve`] would still resolve to it. + /// + /// Already-alive [`Handle`]s for will remain open, usable, and keeping the [`ArcTimeline`] alive. + /// That's ok because they're short-lived. See module-level comment for details. + #[instrument(level = "trace", skip_all)] + pub(super) fn shutdown(&self) { + let handles = self + .handles + .lock() + .expect("mutex poisoned") + // NB: this .take() sets locked to None. + // That's what makes future `Cache::get` misses fail. + // Cache hits are taken care of below. + .take(); + let Some(handles) = handles else { + trace!("already shut down"); + return; + }; + for handle in handles.values() { + // Make hits fail. + handle.shut_down.store(true, Ordering::Relaxed); + } + drop(handles); + } +} + +impl std::ops::Deref for Handle { + type Target = T::Timeline; + fn deref(&self) -> &Self::Target { + &self.0.timeline + } +} + +#[cfg(test)] +impl Drop for HandleInner { + fn drop(&mut self) { + trace!("HandleInner dropped"); + } +} + +// When dropping a [`Cache`], prune its handles in the [`PerTimelineState`] to break the reference cycle. +impl Drop for Cache { + fn drop(&mut self) { + for (_, weak) in self.map.drain() { + if let Some(strong) = weak.upgrade() { + // handle is still being kept alive in PerTimelineState + let timeline = strong.timeline.per_timeline_state(); + let mut handles = timeline.handles.lock().expect("mutex poisoned"); + if let Some(handles) = &mut *handles { + let Some(removed) = handles.remove(&self.id) else { + // There could have been a shutdown inbetween us upgrading the weak and locking the mutex. + continue; + }; + assert!(Arc::ptr_eq(&removed, &strong)); + } + } + } + } +} + +#[cfg(test)] +mod tests { + use pageserver_api::{ + key::{rel_block_to_key, Key, DBDIR_KEY}, + models::ShardParameters, + reltag::RelTag, + shard::ShardStripeSize, + }; + use utils::shard::ShardCount; + + use super::*; + + const FOREVER: std::time::Duration = std::time::Duration::from_secs(u64::MAX); + + #[derive(Debug)] + struct TestTypes; + impl Types for TestTypes { + type TenantManagerError = anyhow::Error; + type TenantManager = StubManager; + type Timeline = Arc; + } + + struct StubManager { + shards: Vec>, + } + + struct StubTimeline { + gate: utils::sync::gate::Gate, + id: TimelineId, + shard: ShardIdentity, + per_timeline_state: PerTimelineState, + myself: Weak, + } + + impl StubTimeline { + fn getpage(&self) { + // do nothing + } + } + + impl ArcTimeline for Arc { + fn gate(&self) -> &utils::sync::gate::Gate { + &self.gate + } + + fn shard_timeline_id(&self) -> ShardTimelineId { + ShardTimelineId { + shard_index: self.shard.shard_index(), + timeline_id: self.id, + } + } + + fn get_shard_identity(&self) -> &ShardIdentity { + &self.shard + } + + fn per_timeline_state(&self) -> &PerTimelineState { + &self.per_timeline_state + } + } + + impl TenantManager for StubManager { + async fn resolve( + &self, + timeline_id: TimelineId, + shard_selector: ShardSelector, + ) -> anyhow::Result> { + for timeline in &self.shards { + if timeline.id == timeline_id { + match &shard_selector { + ShardSelector::Zero if timeline.shard.is_shard_zero() => { + return Ok(Arc::clone(timeline)); + } + ShardSelector::Zero => continue, + ShardSelector::Page(key) if timeline.shard.is_key_local(key) => { + return Ok(Arc::clone(timeline)); + } + ShardSelector::Page(_) => continue, + ShardSelector::Known(idx) if idx == &timeline.shard.shard_index() => { + return Ok(Arc::clone(timeline)); + } + ShardSelector::Known(_) => continue, + } + } + } + anyhow::bail!("not found") + } + } + + #[tokio::test(start_paused = true)] + async fn test_timeline_shutdown() { + crate::tenant::harness::setup_logging(); + + let timeline_id = TimelineId::generate(); + let shard0 = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_id, + shard: ShardIdentity::unsharded(), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let mgr = StubManager { + shards: vec![shard0.clone()], + }; + let key = DBDIR_KEY; + + let mut cache = Cache::::default(); + + // + // fill the cache + // + assert_eq!( + (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), + (2, 1), + "strong: shard0, mgr; weak: myself" + ); + + let handle: Handle<_> = cache + .get(timeline_id, ShardSelector::Page(key), &mgr) + .await + .expect("we have the timeline"); + let handle_inner_weak = Arc::downgrade(&handle.0); + assert!(Weak::ptr_eq(&handle.myself, &shard0.myself)); + assert_eq!( + ( + Weak::strong_count(&handle_inner_weak), + Weak::weak_count(&handle_inner_weak) + ), + (2, 2), + "strong: handle, per_timeline_state, weak: handle_inner_weak, cache" + ); + assert_eq!(cache.map.len(), 1); + + assert_eq!( + (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), + (3, 1), + "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself" + ); + drop(handle); + assert_eq!( + (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), + (3, 1), + "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself" + ); + + // + // demonstrate that Handle holds up gate closure + // but shutdown prevents new handles from being handed out + // + + tokio::select! { + _ = shard0.gate.close() => { + panic!("cache and per-timeline handler state keep cache open"); + } + _ = tokio::time::sleep(FOREVER) => { + // NB: first poll of close() makes it enter closing state + } + } + + let handle = cache + .get(timeline_id, ShardSelector::Page(key), &mgr) + .await + .expect("we have the timeline"); + assert!(Weak::ptr_eq(&handle.myself, &shard0.myself)); + + // SHUTDOWN + shard0.per_timeline_state.shutdown(); // keeping handle alive across shutdown + + assert_eq!( + 1, + Weak::strong_count(&handle_inner_weak), + "through local var handle" + ); + assert_eq!( + cache.map.len(), + 1, + "this is an implementation detail but worth pointing out: we can't clear the cache from shutdown(), it's cleared on first access after" + ); + assert_eq!( + (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), + (3, 1), + "strong: handleinner(via handle), shard0, mgr; weak: myself" + ); + + // this handle is perfectly usable + handle.getpage(); + + cache + .get(timeline_id, ShardSelector::Page(key), &mgr) + .await + .err() + .expect("documented behavior: can't get new handle after shutdown, even if there is an alive Handle"); + assert_eq!( + cache.map.len(), + 0, + "first access after shutdown cleans up the Weak's from the cache" + ); + + tokio::select! { + _ = shard0.gate.close() => { + panic!("handle is keeping gate open"); + } + _ = tokio::time::sleep(FOREVER) => { } + } + + drop(handle); + assert_eq!( + 0, + Weak::strong_count(&handle_inner_weak), + "the HandleInner destructor already ran" + ); + assert_eq!( + (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), + (2, 1), + "strong: shard0, mgr; weak: myself" + ); + + // closing gate succeeds after dropping handle + tokio::select! { + _ = shard0.gate.close() => { } + _ = tokio::time::sleep(FOREVER) => { + panic!("handle is dropped, no other gate holders exist") + } + } + + // map gets cleaned on next lookup + cache + .get(timeline_id, ShardSelector::Page(key), &mgr) + .await + .err() + .expect("documented behavior: can't get new handle after shutdown"); + assert_eq!(cache.map.len(), 0); + + // ensure all refs to shard0 are gone and we're not leaking anything + let myself = Weak::clone(&shard0.myself); + drop(shard0); + drop(mgr); + assert_eq!(Weak::strong_count(&myself), 0); + } + + #[tokio::test] + async fn test_multiple_timelines_and_deletion() { + crate::tenant::harness::setup_logging(); + + let timeline_a = TimelineId::generate(); + let timeline_b = TimelineId::generate(); + assert_ne!(timeline_a, timeline_b); + let timeline_a = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_a, + shard: ShardIdentity::unsharded(), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let timeline_b = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_b, + shard: ShardIdentity::unsharded(), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let mut mgr = StubManager { + shards: vec![timeline_a.clone(), timeline_b.clone()], + }; + let key = DBDIR_KEY; + + let mut cache = Cache::::default(); + + cache + .get(timeline_a.id, ShardSelector::Page(key), &mgr) + .await + .expect("we have it"); + cache + .get(timeline_b.id, ShardSelector::Page(key), &mgr) + .await + .expect("we have it"); + assert_eq!(cache.map.len(), 2); + + // delete timeline A + timeline_a.per_timeline_state.shutdown(); + mgr.shards.retain(|t| t.id != timeline_a.id); + assert!( + mgr.resolve(timeline_a.id, ShardSelector::Page(key)) + .await + .is_err(), + "broken StubManager implementation" + ); + + assert_eq!( + cache.map.len(), + 2, + "cache still has a Weak handle to Timeline A" + ); + cache + .get(timeline_a.id, ShardSelector::Page(key), &mgr) + .await + .err() + .expect("documented behavior: can't get new handle after shutdown"); + assert_eq!(cache.map.len(), 1, "next access cleans up the cache"); + + cache + .get(timeline_b.id, ShardSelector::Page(key), &mgr) + .await + .expect("we still have it"); + } + + fn make_relation_key_for_shard(shard: ShardNumber, params: &ShardParameters) -> Key { + rel_block_to_key( + RelTag { + spcnode: 1663, + dbnode: 208101, + relnode: 2620, + forknum: 0, + }, + shard.0 as u32 * params.stripe_size.0, + ) + } + + #[tokio::test(start_paused = true)] + async fn test_shard_split() { + crate::tenant::harness::setup_logging(); + let timeline_id = TimelineId::generate(); + let parent = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_id, + shard: ShardIdentity::unsharded(), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let child_params = ShardParameters { + count: ShardCount(2), + stripe_size: ShardStripeSize::default(), + }; + let child0 = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_id, + shard: ShardIdentity::from_params(ShardNumber(0), &child_params), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let child1 = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_id, + shard: ShardIdentity::from_params(ShardNumber(1), &child_params), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let child_shards_by_shard_number = [child0.clone(), child1.clone()]; + + let mut cache = Cache::::default(); + + // fill the cache with the parent + for i in 0..2 { + let handle = cache + .get( + timeline_id, + ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)), + &StubManager { + shards: vec![parent.clone()], + }, + ) + .await + .expect("we have it"); + assert!( + Weak::ptr_eq(&handle.myself, &parent.myself), + "mgr returns parent first" + ); + drop(handle); + } + + // + // SHARD SPLIT: tenant manager changes, but the cache isn't informed + // + + // while we haven't shut down the parent, the cache will return the cached parent, even + // if the tenant manager returns the child + for i in 0..2 { + let handle = cache + .get( + timeline_id, + ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)), + &StubManager { + shards: vec![], // doesn't matter what's in here, the cache is fully loaded + }, + ) + .await + .expect("we have it"); + assert!( + Weak::ptr_eq(&handle.myself, &parent.myself), + "mgr returns parent" + ); + drop(handle); + } + + let parent_handle = cache + .get( + timeline_id, + ShardSelector::Page(make_relation_key_for_shard(ShardNumber(0), &child_params)), + &StubManager { + shards: vec![parent.clone()], + }, + ) + .await + .expect("we have it"); + assert!(Weak::ptr_eq(&parent_handle.myself, &parent.myself)); + + // invalidate the cache + parent.per_timeline_state.shutdown(); + + // the cache will now return the child, even though the parent handle still exists + for i in 0..2 { + let handle = cache + .get( + timeline_id, + ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)), + &StubManager { + shards: vec![child0.clone(), child1.clone()], // <====== this changed compared to previous loop + }, + ) + .await + .expect("we have it"); + assert!( + Weak::ptr_eq( + &handle.myself, + &child_shards_by_shard_number[i as usize].myself + ), + "mgr returns child" + ); + drop(handle); + } + + // all the while the parent handle kept the parent gate open + tokio::select! { + _ = parent_handle.gate.close() => { + panic!("parent handle is keeping gate open"); + } + _ = tokio::time::sleep(FOREVER) => { } + } + drop(parent_handle); + tokio::select! { + _ = parent.gate.close() => { } + _ = tokio::time::sleep(FOREVER) => { + panic!("parent handle is dropped, no other gate holders exist") + } + } + } + + #[tokio::test(start_paused = true)] + async fn test_connection_handler_exit() { + crate::tenant::harness::setup_logging(); + let timeline_id = TimelineId::generate(); + let shard0 = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_id, + shard: ShardIdentity::unsharded(), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let mgr = StubManager { + shards: vec![shard0.clone()], + }; + let key = DBDIR_KEY; + + // Simulate 10 connections that's opened, used, and closed + let mut used_handles = vec![]; + for _ in 0..10 { + let mut cache = Cache::::default(); + let handle = { + let handle = cache + .get(timeline_id, ShardSelector::Page(key), &mgr) + .await + .expect("we have the timeline"); + assert!(Weak::ptr_eq(&handle.myself, &shard0.myself)); + handle + }; + handle.getpage(); + used_handles.push(Arc::downgrade(&handle.0)); + } + + // No handles exist, thus gates are closed and don't require shutdown + assert!(used_handles + .iter() + .all(|weak| Weak::strong_count(weak) == 0)); + + // ... thus the gate should close immediately, even without shutdown + tokio::select! { + _ = shard0.gate.close() => { } + _ = tokio::time::sleep(FOREVER) => { + panic!("handle is dropped, no other gate holders exist") + } + } + } +} diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 1e4edd34ad..8f20d84401 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -1,4 +1,4 @@ -use anyhow::{bail, ensure, Context, Result}; +use anyhow::{bail, ensure, Context}; use itertools::Itertools; use pageserver_api::shard::TenantShardId; use std::{collections::HashMap, sync::Arc}; @@ -24,35 +24,142 @@ use crate::{ use super::TimelineWriterState; /// Provides semantic APIs to manipulate the layer map. -#[derive(Default)] -pub(crate) struct LayerManager { - layer_map: LayerMap, - layer_fmgr: LayerFileManager, +pub(crate) enum LayerManager { + /// Open as in not shutdown layer manager; we still have in-memory layers and we can manipulate + /// the layers. + Open(OpenLayerManager), + /// Shutdown layer manager where there are no more in-memory layers and persistent layers are + /// read-only. + Closed { + layers: HashMap, + }, +} + +impl Default for LayerManager { + fn default() -> Self { + LayerManager::Open(OpenLayerManager::default()) + } } impl LayerManager { + pub(crate) fn get_from_key(&self, key: &PersistentLayerKey) -> Layer { + // The assumption for the `expect()` is that all code maintains the following invariant: + // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor. + self.layers() + .get(key) + .with_context(|| format!("get layer from key: {key}")) + .expect("not found") + .clone() + } + pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer { - self.layer_fmgr.get_from_desc(desc) + self.get_from_key(&desc.key()) } /// Get an immutable reference to the layer map. /// /// We expect users only to be able to get an immutable layer map. If users want to make modifications, /// they should use the below semantic APIs. This design makes us step closer to immutable storage state. - pub(crate) fn layer_map(&self) -> &LayerMap { - &self.layer_map + pub(crate) fn layer_map(&self) -> Result<&LayerMap, Shutdown> { + use LayerManager::*; + match self { + Open(OpenLayerManager { layer_map, .. }) => Ok(layer_map), + Closed { .. } => Err(Shutdown), + } } + pub(crate) fn open_mut(&mut self) -> Result<&mut OpenLayerManager, Shutdown> { + use LayerManager::*; + + match self { + Open(open) => Ok(open), + Closed { .. } => Err(Shutdown), + } + } + + /// LayerManager shutdown. The in-memory layers do cleanup on drop, so we must drop them in + /// order to allow shutdown to complete. + /// + /// If there was a want to flush in-memory layers, it must have happened earlier. + pub(crate) fn shutdown(&mut self, writer_state: &mut Option) { + use LayerManager::*; + match self { + Open(OpenLayerManager { + layer_map, + layer_fmgr: LayerFileManager(hashmap), + }) => { + let open = layer_map.open_layer.take(); + let frozen = layer_map.frozen_layers.len(); + let taken_writer_state = writer_state.take(); + tracing::info!(open = open.is_some(), frozen, "dropped inmemory layers"); + let layers = std::mem::take(hashmap); + *self = Closed { layers }; + assert_eq!(open.is_some(), taken_writer_state.is_some()); + } + Closed { .. } => { + tracing::debug!("ignoring multiple shutdowns on layer manager") + } + } + } + + /// Sum up the historic layer sizes + pub(crate) fn layer_size_sum(&self) -> u64 { + self.layers() + .values() + .map(|l| l.layer_desc().file_size) + .sum() + } + + pub(crate) fn likely_resident_layers(&self) -> impl Iterator + '_ { + self.layers().values().filter(|l| l.is_likely_resident()) + } + + pub(crate) fn contains(&self, layer: &Layer) -> bool { + self.contains_key(&layer.layer_desc().key()) + } + + pub(crate) fn contains_key(&self, key: &PersistentLayerKey) -> bool { + self.layers().contains_key(key) + } + + pub(crate) fn all_persistent_layers(&self) -> Vec { + self.layers().keys().cloned().collect_vec() + } + + fn layers(&self) -> &HashMap { + use LayerManager::*; + match self { + Open(OpenLayerManager { layer_fmgr, .. }) => &layer_fmgr.0, + Closed { layers } => layers, + } + } +} + +#[derive(Default)] +pub(crate) struct OpenLayerManager { + layer_map: LayerMap, + layer_fmgr: LayerFileManager, +} + +impl std::fmt::Debug for OpenLayerManager { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("OpenLayerManager") + .field("layer_count", &self.layer_fmgr.0.len()) + .finish() + } +} + +#[derive(Debug, thiserror::Error)] +#[error("layer manager has been shutdown")] +pub(crate) struct Shutdown; + +impl OpenLayerManager { /// Called from `load_layer_map`. Initialize the layer manager with: /// 1. all on-disk layers /// 2. next open layer (with disk disk_consistent_lsn LSN) - pub(crate) fn initialize_local_layers( - &mut self, - on_disk_layers: Vec, - next_open_layer_at: Lsn, - ) { + pub(crate) fn initialize_local_layers(&mut self, layers: Vec, next_open_layer_at: Lsn) { let mut updates = self.layer_map.batch_update(); - for layer in on_disk_layers { + for layer in layers { Self::insert_historic_layer(layer, &mut updates, &mut self.layer_fmgr); } updates.flush(); @@ -64,26 +171,19 @@ impl LayerManager { self.layer_map.next_open_layer_at = Some(next_open_layer_at); } - /// Open a new writable layer to append data if there is no open layer, otherwise return the current open layer, - /// called within `get_layer_for_write`. + /// Open a new writable layer to append data if there is no open layer, otherwise return the + /// current open layer, called within `get_layer_for_write`. pub(crate) async fn get_layer_for_write( &mut self, lsn: Lsn, - last_record_lsn: Lsn, conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, + gate_guard: utils::sync::gate::GateGuard, ctx: &RequestContext, - ) -> Result> { + ) -> anyhow::Result> { ensure!(lsn.is_aligned()); - ensure!( - lsn > last_record_lsn, - "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})", - lsn, - last_record_lsn, - ); - // Do we have a layer open for writing already? let layer = if let Some(open_layer) = &self.layer_map.open_layer { if open_layer.get_lsn_range().start > lsn { @@ -109,8 +209,15 @@ impl LayerManager { lsn ); - let new_layer = - InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn, ctx).await?; + let new_layer = InMemoryLayer::create( + conf, + timeline_id, + tenant_shard_id, + start_lsn, + gate_guard, + ctx, + ) + .await?; let layer = Arc::new(new_layer); self.layer_map.open_layer = Some(layer.clone()); @@ -164,7 +271,7 @@ impl LayerManager { froze } - /// Add image layers to the layer map, called from `create_image_layers`. + /// Add image layers to the layer map, called from [`super::Timeline::create_image_layers`]. pub(crate) fn track_new_image_layers( &mut self, image_layers: &[ResidentLayer], @@ -237,7 +344,7 @@ impl LayerManager { self.finish_compact_l0(compact_from, compact_to, metrics) } - /// Called when compaction is completed. + /// Called post-compaction when some previous generation image layers were trimmed. pub(crate) fn rewrite_layers( &mut self, rewrite_layers: &[(Layer, ResidentLayer)], @@ -255,13 +362,10 @@ impl LayerManager { new_layer.layer_desc().lsn_range ); - // Transfer visibilty hint from old to new layer, since the new layer covers the same key space. This is not guaranteed to + // Transfer visibility hint from old to new layer, since the new layer covers the same key space. This is not guaranteed to // be accurate (as the new layer may cover a different subset of the key range), but is a sensible default, and prevents // always marking rewritten layers as visible. - new_layer - .as_ref() - .access_stats() - .set_visibility(old_layer.access_stats().visibility()); + new_layer.as_ref().set_visibility(old_layer.visibility()); // Safety: we may never rewrite the same file in-place. Callers are responsible // for ensuring that they only rewrite layers after something changes the path, @@ -329,31 +433,6 @@ impl LayerManager { mapping.remove(layer); layer.delete_on_drop(); } - - pub(crate) fn likely_resident_layers(&self) -> impl Iterator + '_ { - // for small layer maps, we most likely have all resident, but for larger more are likely - // to be evicted assuming lots of layers correlated with longer lifespan. - - self.layer_map().iter_historic_layers().filter_map(|desc| { - self.layer_fmgr - .0 - .get(&desc.key()) - .filter(|l| l.is_likely_resident()) - .cloned() - }) - } - - pub(crate) fn contains(&self, layer: &Layer) -> bool { - self.layer_fmgr.contains(layer) - } - - pub(crate) fn contains_key(&self, key: &PersistentLayerKey) -> bool { - self.layer_fmgr.contains_key(key) - } - - pub(crate) fn all_persistent_layers(&self) -> Vec { - self.layer_fmgr.0.keys().cloned().collect_vec() - } } pub(crate) struct LayerFileManager(HashMap); @@ -365,20 +444,6 @@ impl Default for LayerFileManager { } impl LayerFileManager { - fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T { - // The assumption for the `expect()` is that all code maintains the following invariant: - // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor. - self.0 - .get(&desc.key()) - .with_context(|| format!("get layer from desc: {}", desc.layer_name())) - .expect("not found") - .clone() - } - - fn contains_key(&self, key: &PersistentLayerKey) -> bool { - self.0.contains_key(key) - } - pub(crate) fn insert(&mut self, layer: T) { let present = self.0.insert(layer.layer_desc().key(), layer.clone()); if present.is_some() && cfg!(debug_assertions) { @@ -386,10 +451,6 @@ impl LayerFileManager { } } - pub(crate) fn contains(&self, layer: &T) -> bool { - self.0.contains_key(&layer.layer_desc().key()) - } - pub(crate) fn remove(&mut self, layer: &T) { let present = self.0.remove(&layer.layer_desc().key()); if present.is_none() && cfg!(debug_assertions) { diff --git a/pageserver/src/tenant/timeline/logical_size.rs b/pageserver/src/tenant/timeline/logical_size.rs index b0d6c4a27a..f4a4eea54a 100644 --- a/pageserver/src/tenant/timeline/logical_size.rs +++ b/pageserver/src/tenant/timeline/logical_size.rs @@ -122,6 +122,10 @@ impl CurrentLogicalSize { Self::Exact(_) => Accuracy::Exact, } } + + pub(crate) fn is_exact(&self) -> bool { + matches!(self, Self::Exact(_)) + } } impl LogicalSize { diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index f7440ecdae..592f41cb21 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -130,7 +130,7 @@ pub(super) enum UploadQueueStopped { } #[derive(thiserror::Error, Debug)] -pub(crate) enum NotInitialized { +pub enum NotInitialized { #[error("queue is in state Uninitialized")] Uninitialized, #[error("queue is in state Stopped")] diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 51b0c420c3..27f6fe90a4 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -30,10 +30,12 @@ use tokio::time::Instant; pub use pageserver_api::models::virtual_file as api; pub(crate) mod io_engine; pub use io_engine::feature_test as io_engine_feature_test; +pub use io_engine::io_engine_for_bench; pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult; mod metadata; mod open_options; use self::owned_buffers_io::write::OwnedAsyncWriter; +pub(crate) use api::DirectIoMode; pub(crate) use io_engine::IoEngineKind; pub(crate) use metadata::Metadata; pub(crate) use open_options::*; diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index 2820cea097..0ffcd9fa05 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -328,3 +328,29 @@ pub fn feature_test() -> anyhow::Result { .join() .unwrap() } + +/// For use in benchmark binaries only. +/// +/// Benchmarks which initialize `virtual_file` need to know what engine to use, but we also +/// don't want to silently fall back to slower I/O engines in a benchmark: this could waste +/// developer time trying to figure out why it's slow. +/// +/// In practice, this method will either return IoEngineKind::TokioEpollUring, or panic. +pub fn io_engine_for_bench() -> IoEngineKind { + #[cfg(not(target_os = "linux"))] + { + panic!("This benchmark does I/O and can only give a representative result on Linux"); + } + #[cfg(target_os = "linux")] + { + match feature_test().unwrap() { + FeatureTestResult::PlatformPreferred(engine) => engine, + FeatureTestResult::Worse { + engine: _engine, + remark, + } => { + panic!("This benchmark does I/O can requires the preferred I/O engine: {remark}"); + } + } + } +} diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index dff3a8f52d..804c7fca97 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -618,7 +618,7 @@ impl WalIngest { // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set 0 } else { - std::mem::size_of::() * xlrec.ntuples as usize + size_of::() * xlrec.ntuples as usize }; assert_eq!(offset_array_len, buf.remaining()); @@ -685,7 +685,7 @@ impl WalIngest { // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set 0 } else { - std::mem::size_of::() * xlrec.ntuples as usize + size_of::() * xlrec.ntuples as usize }; assert_eq!(offset_array_len, buf.remaining()); @@ -752,7 +752,7 @@ impl WalIngest { // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set 0 } else { - std::mem::size_of::() * xlrec.ntuples as usize + size_of::() * xlrec.ntuples as usize }; assert_eq!(offset_array_len, buf.remaining()); @@ -920,7 +920,7 @@ impl WalIngest { // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set 0 } else { - std::mem::size_of::() * xlrec.ntuples as usize + size_of::() * xlrec.ntuples as usize }; assert_eq!(offset_array_len, buf.remaining()); diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 5095beefd7..770081b3b4 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -241,6 +241,9 @@ impl PostgresRedoManager { /// Shut down the WAL redo manager. /// + /// Returns `true` if this call was the one that initiated shutdown. + /// `true` may be observed by no caller if the first caller stops polling. + /// /// After this future completes /// - no redo process is running /// - no new redo process will be spawned @@ -250,22 +253,32 @@ impl PostgresRedoManager { /// # Cancel-Safety /// /// This method is cancellation-safe. - pub async fn shutdown(&self) { + pub async fn shutdown(&self) -> bool { // prevent new processes from being spawned - let permit = match self.redo_process.get_or_init_detached().await { + let maybe_permit = match self.redo_process.get_or_init_detached().await { Ok(guard) => { - let (proc, permit) = guard.take_and_deinit(); - drop(proc); // this just drops the Arc, its refcount may not be zero yet - permit + if matches!(&*guard, ProcessOnceCell::ManagerShutDown) { + None + } else { + let (proc, permit) = guard.take_and_deinit(); + drop(proc); // this just drops the Arc, its refcount may not be zero yet + Some(permit) + } } - Err(permit) => permit, + Err(permit) => Some(permit), + }; + let it_was_us = if let Some(permit) = maybe_permit { + self.redo_process + .set(ProcessOnceCell::ManagerShutDown, permit); + true + } else { + false }; - self.redo_process - .set(ProcessOnceCell::ManagerShutDown, permit); // wait for ongoing requests to drain and the refcounts of all Arc that // we ever launched to drop to zero, which when it happens synchronously kill()s & wait()s // for the underlying process. self.launched_processes.close().await; + it_was_us } /// This type doesn't have its own background task to check for idleness: we diff --git a/pageserver/test_data/indices/mixed_workload/README.md b/pageserver/test_data/indices/mixed_workload/README.md new file mode 100644 index 0000000000..724274fcd9 --- /dev/null +++ b/pageserver/test_data/indices/mixed_workload/README.md @@ -0,0 +1,7 @@ + +# This was captured from one shard of a large tenant in staging. + +# It has a mixture of deltas and image layers, >1000 layers in total. + +# This is suitable for general smoke tests that want an index which is not +# trivially small, but doesn't contain weird/pathological cases. diff --git a/pageserver/test_data/indices/mixed_workload/index_part.json b/pageserver/test_data/indices/mixed_workload/index_part.json new file mode 100644 index 0000000000..cb4bfc4726 --- /dev/null +++ b/pageserver/test_data/indices/mixed_workload/index_part.json @@ -0,0 +1 @@ +{"version":7,"layer_metadata":{"000000067F00004005000060F300069883DB-000000067F00004005000060F300069D13FA__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300039A4000-000000067F00004005000060F300039C0000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300039FC000-000000067F00004005000060F30003A0F066__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB43000082C0F1-000000067F000040050081DB43000086E169__000000A583FBFB91-000000A9EB8C4489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000478000-000000067F00004005000060F3000047C000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000012C000-000000067F00004005000060F300001F0000__0000018624969468":{"file_size":134422528,"generation":7,"shard":"0008"},"000000067F00004005000060F700019E8000-000000067F00004005000060F700019EC000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300018E0FE6-000000067F00004005000060F3000193A10B__00000075CC373F31-00000079F2A2F311":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016E85370000004000-030000000000000000000000000000000002__0000018613F0A050":{"file_size":14172160,"generation":3,"shard":"0008"},"000000067F00004005000060F300034847BD-000000067F00004005000060F300034BD86C__000000EBC9213D59-000000EFA7EAA9E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000C80000-000000067F000040050081DB430000C84000__000000BDAFECFC00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100000CCBA0-000000067F00004005000060F20100000000__0000000D80565628":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000CA4000-000000067F00004005016EA00C0000CE0000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060FB00013BC000-000000067F00004005000060FB0001400000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001240000-000000067F00004005016EA00C0001244000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30004EC52E9-000000067F00004005000060F30004F1638A__000001440D3D0C69-0000014784964B91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000E10000-000000067F000040050081DB430000E14000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000007F0F-000000067F0000400500EB4A480000037E20__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30004FE8000-000000067F00004005000060F3000502905D__0000014784964B91-0000014B000D1821":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB43000072C000-000000067F000040050081DB430000768000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005E3B48F-000000067F00004005000060F30005EF454F__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500E3A2A100000B7E04-030000000000000000000000000000000002__000000E7C2F1B249-000000EBC9213D59":{"file_size":30146560,"generation":2,"shard":"0008"},"000000067F0000400501025D90000009029B-000000067F0000400501025D950100000000__0000011B688FEDC8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A10000-000000067F000040050081DB430000A14000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002F5105E-000000067F00004005000060F30002F9A0EB__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000187FE22-000000067F000040050081D80C0100000000__00000075E5D2A930":{"file_size":59138048,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001E8000-000000067F000040050081DB4300001EC000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000184C000-000000067F00004005000060FB000187FE22__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005A16504-000000067F00004005000060F30005A57691__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100005C0000-000000067F00004005000060F100005C821A__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-000000067F00004005000060F00300000000__000001BCB572A4E0":{"file_size":2310144,"generation":17,"shard":"0008"},"000000067F00004005000060F30002214000-000000067F00004005000060F30002264247__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500E3A2A10000110000-000000067F0000400500E3A2A10000114000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006864000-000000067F00004005000060F30006868000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000D0000-000000067F0000400500DBCED500000D4000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000274C000-000000067F00004005000060F30002790000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00009274AB-030000000000000000000000000000000002__000001935283F9B9-00000196C9018F59":{"file_size":60104704,"generation":11,"shard":"0008"},"000000067F0000400500C782E4000023D359-000000067F0000400500C782E400002A5E4B__000000D31E48D7C9-000000D74E29AAD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001780DB7-000000067F00004005000060F700017E1391__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F000040050081DB4300004E4000-000000067F000040050081DB4300004F8000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00018C0000-000000067F00004005016EA00C00018C4000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F300056DC000-000000067F00004005000060F300056E0000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001F14230-000000067F000040050081D80C0100000000__0000018613F0A050":{"file_size":59138048,"generation":3,"shard":"0008"},"000000067F00004005010F9F120000004000-030000000000000000000000000000000002__0000012E77D3BF00":{"file_size":105775104,"generation":2,"shard":"0008"},"000000067F00004005000060F30002D80000-000000067F00004005000060F30002D84000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000122BBF-000000067F00004005000060F7000013B18E__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002B10000-000000067F00004005000060F30002B88FF2__000000C462B3C2A9-000000C824C09619":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006320C60-000000067F00004005000060F30006349DA2__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000079E393-000000067F00004005016EA00C00009BF728__00000196C9018F59-0000019A2EAFE7A9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500F67839000005C000-000000067F0000400500F67839000006AEF4__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001D7F71A-030000000000000000000000000000000002__000001BA93C39481-000001BCB572A4E1":{"file_size":50880512,"generation":17,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB572C481-000001BCB572C5D9":{"file_size":24576,"generation":20,"shard":"0008"},"000000067F00004005000060F70001570000-000000067F00004005000060F70001574000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000042C000-000000067F00004005000060F30000478000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB572C5D9-000001BCB572DFF9":{"file_size":24576,"generation":22,"shard":"0008"},"000000067F00004005000060FB00015FCD31-030000000000000000000000000000000002__000000698F2C3A38":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30005C841ED-000000067F00004005000060F30005C95225__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30001B4A119-000000067F00004005000060F30100000000__0000008196C976A1-0000008625CF2891":{"file_size":200990720,"generation":2,"shard":"0008"},"000000067F00004005000060F300019790A2-000000067F00004005000060F300019C2056__00000079F2A2F311-0000007E3A9BFD29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001838000-000000067F00004005000060FB000183C000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001C00FE1-000000067F00004005000060F30001C0A0A3__0000008625CF2891-00000089F4693119":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300056E0000-000000067F00004005000060F300056E4000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000BBD532-000000067F00004005000060F80100000000__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":96477184,"generation":2,"shard":"0008"},"000000067F00004005000060F30000F9B026-000000067F00004005000060F30100000000__00000047E31D98D1-0000004C49155071":{"file_size":173834240,"generation":2,"shard":"0008"},"000000067F000040050081DB430000500000-000000067F000040050081DB430000504000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004971675-000000067F00004005000060F300049B26A8__000001398B56A519-0000013C9C0E3339":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003102107-000000067F00004005000060F300031130BC__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300048A4000-000000067F00004005000060F30004900000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00004B8000-000000067F00004005016EA00C00004BC000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001A71688-000000067F00004005000060FB0001A8A1CD__0000007E3A9BFD29-0000008196C976A1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E60000-000000067F00004005000060F30000E64000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300023B0FF7-000000067F00004005000060F300024020ED__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00003F8000-000000067F00004005016EA00C00003FC000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30004B2B250-000000067F00004005000060F30004B5431C__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000050000-000000067F00004005000060F700000885C5__000000044854EBD1-00000008B6B51879":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB000097168A-030000000000000000000000000000000002__00000028C365FBE1-0000002D2A8E0B81":{"file_size":120299520,"generation":2,"shard":"0008"},"000000067F00004005000060F3000625C000-000000067F00004005000060F30006270000__0000017171761D90":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001BA8000-000000067F00004005000060FB0001BC0B44__0000008625CF2891-00000089F4693119":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003344134-000000067F00004005000060F3000336D193__000000E4C63CFA21-000000E7C2F1B249":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006B10FFF-000000067F00004005000060F30006B22072__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006E34000-000000067F00004005000060F30006E70000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000008238C-000000067F00004005000060F60100000000__00000139CF156B58":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000A30000-000000067F00004005000060F70100000000__0000009DF02C1241-000000A173C00489":{"file_size":269688832,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001CE16ED-000000067F000040050081D80C0100000000__0000008DDCD70B68":{"file_size":59138048,"generation":2,"shard":"0008"},"000000067F000040050081DB4300011B0000-000000067F000040050081DB4300011B4000__000000DBD29DC248":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C000010C0D1-000000067F0000400500F3A25C000011E137__000001048B25A8E9-0000010779A7F551":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000004000-000000067F00004005000060F70000029ED0__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F00004005000060F60000058F73-000000067F00004005000060F60100000000__000000E4D847F4E0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001C3F636-000000067F00004005016EA00C0001CC74D7__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500EB4A480000101089-000000067F0000400500EB4A48000012798C__000000F6661C9241-000000F901689359":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300007A8000-000000067F000040050081DB4300007AC000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F1000010043F-000000067F00004005000060F20100000000__0000000D55A212C9-000000114A805939":{"file_size":182878208,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001EAC000-000000067F00004005000060FB0001F14230__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000616F6B2-000000067F00004005000060F300061B8705__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005C9E3C4-000000067F00004005000060F30005CCF3C5__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001AA0000-000000067F00004005000060F70001AB05CB__0000015304A396B9-0000015670D6AFD9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F3000073C000-000000067F00004005000060F30000775A02__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300003AE21D-000000067F000040050081DB43000045029C__0000008DBE2855F9-000000923719A971":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001B04000-000000067F00004005000060F70001B18000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E74000-000000067F00004005000060F30000E78000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000182C000-000000067F00004005000060F700018871D6__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000DE8B45-000000067F00004005000060FB0000DF968A__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E78000-000000067F00004005000060F30000E7C000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000140C000-030000000000000000000000000000000002__000000603CA8F2F0":{"file_size":89522176,"generation":2,"shard":"0008"},"000000067F00004005000060FB00011CA1CD-000000067F00004005000060FB00011F2D11__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000144FB4E-000000067F00004005016EA00C00014B79E7__000001A931C135B1-000001AC25760149":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F700015A195C-000000067F00004005000060F80100000000__0000012E77D3BF00":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000FC0000-000000067F00004005000060F70000FC4000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000012798C-000000067F0000400500EB4A48000013F89B__000000F6661C9241-000000F901689359":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001CE4000-000000067F00004005016EA00C0001D18000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F30005FC519A-000000067F00004005000060F30005FE621A__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000370000-000000067F00004005016EA00C0000374000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0001760000-000000067F00004005016EA00C0001764000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F100003A0000-000000067F00004005000060F100003B8214__0000003D03FCCDB9-000000417D21ACF9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300006B0000-000000067F00004005000060F300006B4000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00004E1FF6-030000000000000000000000000000000002__000000174479FC18":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F3000502905D-000000067F00004005000060F300050321C0__0000014784964B91-0000014B000D1821":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001AB05CB-000000067F00004005000060F70001AB8B97__0000015304A396B9-0000015670D6AFD9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000151F7C5-000000067F00004005016EA00C000158F667__000001AC25760149-000001AFC313C819":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F70000B9C000-000000067F00004005000060F80100000000__000000AFE87558B0":{"file_size":83533824,"generation":2,"shard":"0008"},"000000067F00004005000060F7000141882A-000000067F00004005000060F80100000000__00000122E1129DA0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000018F5CD-000000067F0000400500EB4A48000019F4DD__000000F6661C9241-000000F901689359":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000196C000-000000067F00004005000060F70001990000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300029C623C-000000067F00004005000060F30100000000__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":81313792,"generation":2,"shard":"0008"},"000000067F00004005000060F300027C0000-000000067F00004005000060F300027C4000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FB3D300000001487-000000067F0000400500FB3D300100000000__0000010FB1BE19B9-00000113456156F1":{"file_size":24428544,"generation":2,"shard":"0008"},"000000067F00004005000060F300056D8000-000000067F00004005000060F300056DC000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700003C0000-000000067F00004005000060F700003C4000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000664E3CA-000000067F00004005000060F30100000000__000001715E483C79-000001751A7D7589":{"file_size":288645120,"generation":2,"shard":"0008"},"000000067F000040050100D04D000004B5AD-000000067F000040050100D04D00000634BB__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500DBCED5000002C000-000000067F0000400500DBCED50000078000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000C20000-000000067F00004005016EA00C0000C24000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70001B30000-000000067F00004005000060F70001B34000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700009C035C-000000067F00004005000060F80100000000__0000009A1ABDE921-0000009DF02C1241":{"file_size":264159232,"generation":2,"shard":"0008"},"000000067F00004005000060F30003B33945-000000067F00004005000060F30100000000__0000010FB1BE19B9-00000113456156F1":{"file_size":155344896,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000079FCFA-000000067F00004005016EA00C00007C7B9C__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500EB4A480000218000-000000067F0000400500EB4A48000021C000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005D1D0DC-000000067F00004005000060F30005D76250__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000149B774-000000067F00004005000060FB00014A42B8__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003D0B155-000000067F00004005000060F30003D14206__00000117EDA82C11-0000011B632CC319":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300020FC052-000000067F00004005000060F300021050B0__0000009DF02C1241-000000A173C00489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002268000-000000067F00004005000060F300022B9050__000000A583FBFB91-000000A9EB8C4489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300004FC000-000000067F000040050081DB430000500000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300060A93B5-000000067F00004005000060F300060C2210__0000016834A3FC91-0000016B49A934C1":{"file_size":263479296,"generation":2,"shard":"0008"},"000000067F00004005000060F3000674C000-000000067F00004005000060F30006798000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300007F913A-030000000000000000000000000000000002__000000A5A3F27398":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000F4000-030000000000000000000000000000000002__000000E4D847F4E0":{"file_size":103907328,"generation":2,"shard":"0008"},"000000067F00004005000060F70001348000-000000067F00004005000060F70100000000__0000011B632CC319-0000011F1A40FA69":{"file_size":270753792,"generation":2,"shard":"0008"},"000000067F00004005000060F10000030000-000000067F00004005000060F20100000000__000000021DC73119-000000044854EBD1":{"file_size":267771904,"generation":2,"shard":"0008"},"000000067F000040050107B54701FFFFFFFF-000000067F000040050107B5470300000000__0000011F1A40FA69-00000122A7BB7B29":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30006674000-000000067F00004005000060F30006690000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050107B54701FFFFFFFF-000000067F000040050107B5470300000000__0000011B632CC319-0000011F1A40FA69":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30000298000-000000067F00004005000060F3000029C000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000F185D4-000000067F00004005000060F80100000000__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":249135104,"generation":2,"shard":"0008"},"000000067F00004005000060F300049CB712-000000067F00004005000060F30004A048A8__000001398B56A519-0000013C9C0E3339":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700004B1E77-000000067F00004005000060F80100000000__00000047F1F2B800":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30004B00000-000000067F00004005000060F30004B1111A__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006D14000-000000067F00004005000060F30006D30000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00002D77AE-030000000000000000000000000000000002__000001880F984A29-0000018C496B6DB1":{"file_size":81018880,"generation":11,"shard":"0008"},"000000067F00004005000060F300002D0000-000000067F00004005000060F30000370FD1__0000000D55A212C9-000000114A805939":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D69D790000028000-000000067F0000400500D69D79000002C000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002170000-000000067F00004005000060F30002174000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000F59017-000000067F00004005000060F30000F91FFF__00000047E31D98D1-0000004C49155071":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F6000006A37A-000000067F00004005000060F60100000000__000001180B3FF408":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F6000002F012-000000067F00004005000060F60100000000__00000081AA3C40F0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30005614000-000000067F00004005000060F30005688000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300036C8000-000000067F00004005000060F300036F91FE__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001ADF63C-030000000000000000000000000000000002__000001B3E1B95181-000001B6FFE46BC9":{"file_size":64421888,"generation":11,"shard":"0008"},"000000067F0000400500EB4A480000057D31-000000067F0000400500EB4A48000008FC41__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000F58000-000000067F00004005016EA00C0000F5C000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB430000908000-000000067F000040050081DB43000094A076__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000471200E-000000067F00004005000060F3000474302B__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300000403DA-030000000000000000000000000000000002__00000075E5D2A930":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F60000079C4E-000000067F00004005000060F60100000000__0000012E77D3BF00":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500F67839000003C000-000000067F0000400500F678390000058000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001C80000-000000067F00004005000060FB0001C84000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300047F5138-000000067F00004005000060F3000480620C__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006B5C09E-000000067F00004005000060F30006BAD108__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001410F57-000000067F00004005000060F70001429534__00000122A7BB7B29-0000012694E36301":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00006B4000-000000067F00004005016EA00C00006E0000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F700009605D8-000000067F00004005000060F80100000000__000000923719A971-00000096262826C9":{"file_size":251338752,"generation":2,"shard":"0008"},"000000067F00004005000060F70000C8CD0C-000000067F00004005000060F80100000000__000000BAC0041E18":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F700012B8000-000000067F00004005000060F80100000000__00000113456156F1-00000117EDA82C11":{"file_size":265781248,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000049C000-000000067F00004005016EA00C00004A8000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70000C78000-000000067F00004005000060F70000C7C000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006B4B0BB-000000067F00004005000060F30006B5C09E__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001844000-000000067F00004005000060FB0001848000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300067F0000-000000067F00004005000060F300067F4000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004C80000-000000067F00004005000060F30004C84000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A4C000-000000067F00004005000060F30002A98000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002480000-000000067F00004005000060F30002484000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000306A02D-000000067F00004005000060F30100000000__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":191299584,"generation":2,"shard":"0008"},"000000067F00004005000060F70001510000-000000067F00004005000060F70001514000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005BDB15B-000000067F00004005000060F30005C841ED__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001E98000-000000067F00004005000060FB0001E9C000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300057942F4-000000067F00004005000060F300057DD292__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005698000-000000067F00004005000060F3000569C000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002983166-000000067F00004005000060F3000299C28F__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000C24000-000000067F00004005016EA00C0000CA0000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F300033D7D7C-000000067F00004005000060F30003458D42__000000E7C2F1B249-000000EBC9213D59":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A1C000-000000067F000040050081DB430000A30379__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002D93639-000000067F00004005000060F50100000000__000000D037B2DBD0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000029C195-000000067F00004005016EA00C000029C196__000001BA93C39481-000001BCB572A4E1":{"file_size":32768,"generation":17,"shard":"0008"},"000000067F00004005000060F30000A5F9BB-000000067F00004005000060F60100000000__000000321AA80270":{"file_size":81657856,"generation":2,"shard":"0008"},"000000067F00004005000060F30002D84000-000000067F00004005000060F30002D93639__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005D1C000-000000067F00004005000060F30005D70000__000001684518AF20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010C8000-000000067F000040050081DB4300010E2072__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB43000058AF5E-000000067F000040050081DB4300005BCFD7__0000009A1ABDE921-0000009DF02C1241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000034611E-000000067F00004005000060F80100000000__000000321AA80270":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300000C1095-000000067F00004005000060F60100000000__000000021DC73119-000000044854EBD1":{"file_size":220635136,"generation":2,"shard":"0008"},"000000067F00004005000060FB000183C000-000000067F00004005000060FB0001840000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006C8729E-000000067F00004005000060F30006C98340__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005138000-000000067F00004005000060F3000513C000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300053E30C3-000000067F00004005000060F300053F40CC__0000014EC58A4A79-0000015304A396B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB43000002C000-000000067F000040050081DB4300000403DA__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004970000-000000067F00004005000060F30004974000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003C08000-000000067F00004005000060F30003C0C000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000103AD12-000000067F00004005000060FB000104B856__0000004C49155071-0000004F31878919":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00004AC000-000000067F00004005016EA00C00004B8000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000DB7D33-000000067F00004005016EA00C0000E47BD2__0000019E2C5DCEE1-000001A1DD8B4481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30001F30000-000000067F00004005000060F30001F34000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050109FFA2000000C000-030000000000000000000000000000000002__000001180B3FF408":{"file_size":70516736,"generation":2,"shard":"0008"},"000000067F00004005000060F700017405D4-000000067F00004005000060F70001758B92__000001398B56A519-0000013C9C0E3339":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300030B0000-000000067F00004005000060F300030C0FE5__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005010660F501FFFFFFFF-000000067F00004005010660F50300000000__00000122A7BB7B29-0000012694E36301":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30002168000-000000067F00004005000060F3000216C000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F60000046A83-000000067F00004005000060F60100000000__000000BAC0041E18":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001368000-000000067F00004005000060FB000136C000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000184000-000000067F00004005000060F80100000000__000000174479FC18":{"file_size":93143040,"generation":2,"shard":"0008"},"000000067F00004005000060FB00012A8000-000000067F00004005000060FB0100000000__00000057593D8169-0000005C01565329":{"file_size":273711104,"generation":2,"shard":"0008"},"000000067F00004005000060F700007B0000-000000067F00004005000060F700007D05C8__00000075CC373F31-00000079F2A2F311":{"file_size":268468224,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001680B45-000000067F00004005000060FB000169968A__000000698AF6E809-0000006DDB29D589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300050CC000-000000067F00004005000060F300050E8000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-000000067F00004005000060F00300000000__0000018613F0A050":{"file_size":2310144,"generation":3,"shard":"0008"},"000000067F00004005000060F70001B1C000-000000067F00004005000060F70001B30000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000F50000-000000067F00004005000060F70000F705D6__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F000040050109CD330100000000-000000067F000040050109FFA2000000C000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800001FC000-000000067F0000400500EB4A480000200000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000240B12A-000000067F00004005000060F300024440AE__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000008228D-000000067F00004005000060F60100000000__000000027AF9D7D0":{"file_size":24576,"generation":1,"shard":"0008"},"000000067F00004005016EA00C000042C000-000000067F00004005016EA00C0000478000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060FB0000FF8000-000000067F00004005000060FB0001000B44__0000004C49155071-0000004F31878919":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000169968A-000000067F00004005000060FB00016D21CF__000000698AF6E809-0000006DDB29D589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100005F821C-000000067F00004005000060F20100000000__000000636DE92159-000000663565F8C9":{"file_size":149954560,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001D7C000-000000067F00004005016EA00C0001E03DD8__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F0000400500F678390000058000-000000067F0000400500F67839000005C000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800003A7E20-000000067F0000400500EB4A4800003BFD31__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001228000-000000067F00004005016EA00C000122C000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB430000F0C0E9-000000067F000040050081DB430000F4E15B__000000C462B3C2A9-000000C824C09619":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000758000-000000067F00004005000060F80100000000__0000006DDB29D589-000000722F474369":{"file_size":264781824,"generation":2,"shard":"0008"},"000000067F00004005000060F300068640AF-000000067F00004005000060F3000686D0DE__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000047C000-000000067F00004005016EA00C0000498000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30006166575-000000067F00004005000060F3000616F6B2__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001B18000-000000067F00004005000060F70001B1C000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700016EC000-000000067F00004005000060F70001708000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005CCF3C5-000000067F00004005000060F30005D184F6__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002848000-000000067F00004005000060F3000285901B__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300039C0000-000000067F00004005000060F300039C4000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002464000-000000067F00004005000060F30002480000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00011D0000-000000067F00004005016EA00C00011D4000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30003D44283-000000067F00004005000060F30003D952B0__0000011B632CC319-0000011F1A40FA69":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480100000000-000000067F0000400500EE16BC0000044000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000533205E-000000067F00004005000060F300053E30C3__0000014EC58A4A79-0000015304A396B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F6000009A255-000000067F00004005000060F60300000000__0000017CC2FD7288":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70001B00000-000000067F00004005000060F70001B04000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004958000-000000067F00004005000060F3000495C000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000518000-000000067F00004005000060F80100000000__0000004C49155071-0000004F31878919":{"file_size":262373376,"generation":2,"shard":"0008"},"000000067F00004005000060F300064D8000-000000067F00004005000060F3000658113F__000001715E483C79-000001751A7D7589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FDA1F80000014000-000000067F0000400500FDA1F80000020D42__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000284000-000000067F00004005000060FB00002D4B6A__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000CDBB9C-000000067F00004005000060F80100000000__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":148865024,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001298000-000000067F00004005016EA00C000129C000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001DD8000-000000067F00004005000060FB0001DF0B43__000000923719A971-00000096262826C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001220000-000000067F00004005000060F70001224000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002908000-000000067F00004005000060F30002920FA0__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000F5C000-000000067F00004005016EA00C0000F90000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0001E03DD8-030000000000000000000000000000000002__000001BCB572A4E0":{"file_size":139264,"generation":17,"shard":"0008"},"000000067F00004005000060F30003998000-000000067F00004005000060F3000399C000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00014E75C6-030000000000000000000000000000000002__000001A931C135B1-000001AC25760149":{"file_size":51486720,"generation":11,"shard":"0008"},"000000067F00004005010660F500000F44CB-000000067F00004005010660F70100000000__000001180B3FF408":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00003FC000-000000067F00004005016EA00C0000400000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30003810000-000000067F00004005000060F30003849093__000001048B25A8E9-0000010779A7F551":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006B00000-000000067F00004005000060F30006B10FFF__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001541688-000000067F00004005000060FB000154A1CD__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001098000-000000067F00004005000060FB000109C000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700011912D4-000000067F00004005000060F80100000000__00000104BD37F348":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A40000-000000067F00004005000060F30002A44000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001448000-000000067F00004005000060F300014B0F7B__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001009688-000000067F00004005000060FB000102A1CE__0000004C49155071-0000004F31878919":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC00001A4000-000000067F0000400500EE16BC00001E0000__00000104BD37F348":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000B58B45-000000067F00004005000060FB0000B6168A__0000003579F03331-0000003959DA2DE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D69D7900000AC000-000000067F0000400500D69D7900000BDAF5__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000193A10B-000000067F00004005000060F30100000000__00000075CC373F31-00000079F2A2F311":{"file_size":198148096,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00005A0000-000000067F00004005016EA00C00005A4000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F700000E0000-000000067F00004005000060F80100000000__0000000D80565628":{"file_size":112009216,"generation":2,"shard":"0008"},"000000067F00004005000060F3000690F2FD-000000067F00004005000060F300069883DB__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300004C6B83-000000067F00004005000060F60100000000__000000174479FC18":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30001E18000-000000067F00004005000060F30001E50FF3__000000923719A971-00000096262826C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300043B4000-000000067F00004005000060F300043B8000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100006C0000-000000067F00004005000060F20100000000__000000722F474369-00000075CC373F31":{"file_size":267665408,"generation":2,"shard":"0008"},"000000067F00004005000060F70000A78000-000000067F00004005000060F70000A7C000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00011C1688-000000067F00004005000060FB00011CA1CD__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00004E8000-000000067F00004005016EA00C00004EC000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000257A6F-000000067F00004005016EA00C000029F90B__000001880F984A29-0000018C496B6DB1":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001590000-000000067F00004005000060FB0001594000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000193189A-030000000000000000000000000000000002__000001B3F17FE4E0":{"file_size":139264,"generation":11,"shard":"0008"},"000000067F00004005000060F300027C4000-000000067F00004005000060F30002828000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000B40000-000000067F00004005016EA00C0000B44000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30006694000-000000067F00004005000060F300066F0000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00015C8000-000000067F00004005000060FB00015CC000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003B84000-000000067F00004005000060F30003B90000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006704000-000000067F00004005000060F30006748000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000793506-030000000000000000000000000000000002__0000002427BD8BD0":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30004F1638A-000000067F00004005000060F30100000000__000001440D3D0C69-0000014784964B91":{"file_size":93708288,"generation":2,"shard":"0008"},"000000067F00004005000060F80100000000-000000067F00004005000060FB0000014000__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F00004005000060F70000180000-000000067F00004005000060F70000184000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004A2693B-000000067F00004005000060F30004A7F98F__000001398B56A519-0000013C9C0E3339":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002C71F27-000000067F00004005000060F30002C9AFB8__000000C824C09619-000000CC13D2E549":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300038075AF-000000067F00004005000060F30100000000__000000FF8B261599-000001048B25A8E9":{"file_size":49823744,"generation":2,"shard":"0008"},"000000067F0000400500DBCED50000028000-000000067F0000400500DBCED5000002C000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004188000-000000067F00004005000060F300041D9101__0000012694E36301-0000012A3F140591":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30006868000-000000067F00004005000060F50100000000__00000178C5D5D3A8":{"file_size":116645888,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A789A0-000000067F00004005000060F30003AB9907__0000010FB1BE19B9-00000113456156F1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000368000-000000067F0000400500EB4A48000036FF11__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300047EC0CA-000000067F00004005000060F300047F5138__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001AB8B97-000000067F00004005000060F70001AC115C__0000015304A396B9-0000015670D6AFD9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D61283-000000067F00004005000060F70000D8985C__000000C462B3C2A9-000000C824C09619":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300011D1111-000000067F00004005000060F3000122A1D5__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001967D34-000000067F00004005016EA00C000197FBD0__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500FA2AD3000004D85C-000000067F0000400500FB3D300100000000__0000010D77B487A0":{"file_size":31309824,"generation":2,"shard":"0008"},"000000067F000040050081DB4300005BCFD7-000000067F000040050081DB4300005D704F__0000009A1ABDE921-0000009DF02C1241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F10000004000-000000067F00004005000060F100000260F2__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F0000400500EE16BC00000F8000-000000067F0000400500EE16BC000014158C__000000F901689359-000000FCCD5238B1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000921E8A-000000067F00004005000060F60100000000__00000028C365FBE1-0000002D2A8E0B81":{"file_size":228564992,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001190000-000000067F00004005000060FB0001198B44__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300067A0000-000000067F00004005000060F300067A4000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10000200000-000000067F00004005000060F10000204000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FF0FBB-000000067F00004005000060F3000407201D__00000122A7BB7B29-0000012694E36301":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000001C000-000000067F00004005000060F3000008228D__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F00004005016EA00C0001CD7376-030000000000000000000000000000000002__000001B6FFE46BC9-000001BA93C39481":{"file_size":70238208,"generation":11,"shard":"0008"},"000000067F00004005000060FB0000EBC000-000000067F00004005000060FB0000EC8000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000293210E-000000067F00004005000060F30002983166__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000151F271-000000067F00004005000060F30100000000__000000636DE92159-000000663565F8C9":{"file_size":41271296,"generation":2,"shard":"0008"},"000000067F00004005000060F30004880000-000000067F00004005000060F30004884000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10000518222-000000067F00004005000060F20100000000__0000005413AB3641-00000057593D8169":{"file_size":169492480,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00003E0000-000000067F00004005016EA00C00003E4000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30000775A02-000000067F00004005000060F60100000000__0000002427BD8BD0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000197FBD0-000000067F00004005016EA00C00019C7A6A__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F3000067114B-000000067F00004005000060F60100000000__0000001B59EEB909-0000001FFBC01501":{"file_size":232669184,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001408000-000000067F00004005000060FB000140C000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800001F8000-000000067F0000400500EB4A4800001FC000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000290000-000000067F0000400500EB4A480000294000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003061089-000000067F00004005000060F3000306A02D__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30001CE4000-000000067F00004005000060F30001CF0197__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000E20000-000000067F00004005000060F70000E24000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001D0000-000000067F000040050081DB4300001D4000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005D184F6-000000067F00004005000060F30100000000__0000016143292911-00000164DEE06671":{"file_size":200163328,"generation":2,"shard":"0008"},"000000067F00004005000060F300066F4000-000000067F00004005000060F30006700000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A38000-000000067F000040050081DB430000A4A074__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000F38000-000000067F00004005000060F30000F59017__00000047E31D98D1-0000004C49155071":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C0C000-000000067F00004005000060FB0000C18000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006D34000-000000067F00004005000060F30006D60000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005010660F501FFFFFFFF-000000067F00004005010660F50300000000__0000011F1A40FA69-00000122A7BB7B29":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F700013E85D1-000000067F00004005000060F70001410BBC__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000538B44-000000067F00004005000060FB0000551689__0000001737D88379-0000001B59EEB909":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001410000-000000067F00004005000060F70001414000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300032F1113-000000067F00004005000060F3000330A1C8__000000E4C63CFA21-000000E7C2F1B249":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30004974000-000000067F00004005000060F3000498DC49__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000625EB45-000000067F00004005000060F30006277C61__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700019E8E81-000000067F00004005000060F80100000000__0000014EC58A4A79-0000015304A396B9":{"file_size":246792192,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB5730259-000001BCB5732691":{"file_size":24576,"generation":187,"shard":"0008"},"000000067F000040050081DB4300001CC000-000000067F000040050081DB4300001D0000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002C00000-000000067F00004005000060F30002C18FAE__000000C824C09619-000000CC13D2E549":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000FC4000-000000067F00004005000060F70000FCD85E__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000107C39B-030000000000000000000000000000000002__0000004C49155071-0000004F31878919":{"file_size":133349376,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000F90000-000000067F00004005016EA00C0000F94000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000F98000-000000067F00004005016EA00C0000F9C000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F700019EC000-000000067F00004005000060F80100000000__0000014EDD256548":{"file_size":7421952,"generation":2,"shard":"0008"},"000000067F00004005000060F300069FA3F6-000000067F00004005000060F30006A0B44C__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300003AC000-000000067F000040050081DB4300003B27DA__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005A57691-000000067F00004005000060F30005B00697__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300060CB2C8-000000067F00004005000060F300060D4415__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000495C000-000000067F00004005000060F30004970000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500D69D7900000D1C5F-000000067F0000400500D69D7900000F1B5B__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001358000-030000000000000000000000000000000002__000001A95031E5B8":{"file_size":21110784,"generation":11,"shard":"0008"},"000000067F00004005000060F3000430C000-000000067F00004005000060F30004370000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004904000-000000067F00004005000060F30004958000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000008000-000000067F00004005000060F30000378000__00000186146441F1-0000018624969469":{"file_size":33357824,"generation":6,"shard":"0008"},"000000067F00004005000060F700005C0000-000000067F00004005000060F700005C85CE__00000057593D8169-0000005C01565329":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000B04000-000000067F00004005016EA00C0000B40000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30002920FA0-000000067F00004005000060F3000293210E__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002058000-000000067F00004005000060F30002070F71__0000009DF02C1241-000000A173C00489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000686D0DE-000000067F00004005000060F3000689E295__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FA2AD30000004000-000000067F0000400500FA2AD30000030000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00009BF728-000000067F00004005016EA00C0000A575C7__00000196C9018F59-0000019A2EAFE7A9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30004374000-000000067F00004005000060F300043B0000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300051F0000-000000067F00004005000060F300051F4000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006B22072-000000067F00004005000060F30006B4B0BB__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000328FA4E-000000067F00004005000060F50100000000__000000E4D847F4E0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000000FEA0-000000067F00004005016EA00C000001FD3E__0000018624969469-000001880F984A29":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500EB4A48000019F4DD-030000000000000000000000000000000002__000000F6661C9241-000000F901689359":{"file_size":59498496,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00003EC000-000000067F00004005016EA00C00003F8000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C000073C000-000000067F00004005016EA00C000074F43B__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30003542BFF-000000067F00004005000060F50100000000__000000EFDE07FFD8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70001771169-000000067F00004005000060F80100000000__000001398B56A519-0000013C9C0E3339":{"file_size":263454720,"generation":2,"shard":"0008"},"000000067F000040050081DB4300003B27DA-030000000000000000000000000000000002__0000008DDCD70B68":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F3000542AFB0-000000067F00004005000060F30005474062__0000015304A396B9-0000015670D6AFD9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000057C94F-000000067F00004005000060F80100000000__00000054161C34B8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300055861F2-000000067F00004005000060F30100000000__0000015304A396B9-0000015670D6AFD9":{"file_size":127393792,"generation":2,"shard":"0008"},"000000067F00004005000060F30001D79136-000000067F00004005000060F30100000000__0000008DBE2855F9-000000923719A971":{"file_size":227958784,"generation":2,"shard":"0008"},"000000067F00004005000060F10000218000-000000067F00004005000060F1000021C000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001CD4000-000000067F00004005016EA00C0001CE0000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F300017EC000-000000067F00004005000060F30001886B2A__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001188000-000000067F00004005000060F300011D1111__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000ECC000-000000067F00004005000060FB0000F050F2__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300018C0000-000000067F00004005000060F300018E0FE6__00000075CC373F31-00000079F2A2F311":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00006E4000-000000067F00004005016EA00C0000738000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30002790000-000000067F00004005000060F30002794000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C00001B850B-000000067F0000400500F56D510100000000__0000011B688FEDC8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F100001F8000-000000067F00004005000060F100001FC000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000810000-000000067F00004005000060F80100000000__00000079F2A2F311-0000007E3A9BFD29":{"file_size":263454720,"generation":2,"shard":"0008"},"000000067F00004005000060F100006CBF87-000000067F00004005000060F20100000000__000000A5A3F27398":{"file_size":15851520,"generation":2,"shard":"0008"},"000000067F0000400500F7D2DD0100000000-000000067F0000400500F8E3A50000014000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700010AABC7-000000067F00004005000060F80100000000__000000EFDE07FFD8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30003B80000-000000067F00004005000060F30003B84000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000078000-000000067F000040050081DB4300000AA080__00000075CC373F31-00000079F2A2F311":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002618000-000000067F00004005000060F30002680F9D__000000B2B5C4E8F9-000000B768469051":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A48000-000000067F00004005000060F30002A4C000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001994000-000000067F00004005000060F700019E8000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000B6168A-000000067F00004005000060FB0000B6A1D0__0000003579F03331-0000003959DA2DE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000147A0EC-000000067F00004005000060FB000148AC30__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC0000060000-000000067F0000400500EE16BC0000064000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003458D42-000000067F00004005000060F30003481DDB__000000E7C2F1B249-000000EBC9213D59":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006E30000-000000067F00004005000060F30006E34000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700017F8000-000000067F00004005000060F700017FC000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004C50000-000000067F00004005000060F30004C54000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001720000-000000067F00004005000060F80100000000__00000139CF156B58":{"file_size":63463424,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A8E15E-000000067F000040050081DB430000A98000__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":265404416,"generation":2,"shard":"0008"},"000000067F00004005000060F30004BAE526-000000067F00004005000060F30004BE7584__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001ADF97B-000000067F00004005016EA00C0001B0FD2A__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F60000014000-000000067F00004005000060F60100000000__0000003D2AB09B68":{"file_size":83329024,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C1C000-000000067F00004005000060FB0000C70000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005240000-000000067F00004005000060F30005244000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB43000077C000-000000067F000040050081DB430000790000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006D60000-000000067F00004005000060F30006D64000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004C54000-000000067F00004005000060F30004C60000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000000000000000001-000000067F0000400500000A690000000002__00000186146441F1-0000018624969469":{"file_size":57344,"generation":6,"shard":"0008"},"000000067F00004005000060F30005688000-000000067F00004005000060F3000568C000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004370000-000000067F00004005000060F30004374000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300051F4000-000000067F00004005000060F30005210000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004DD8000-000000067F00004005000060F30004DDC000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500C782E400001AFD31-000000067F0000400500C782E400001B7C41__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000BB103B-000000067F00004005000060F60000014C3A__0000003579F03331-0000003959DA2DE9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F0000400500D19D030100000000-000000067F0000400500D69D790000024000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB43000028B253-030000000000000000000000000000000002__0000008196C976A1-0000008625CF2891":{"file_size":151224320,"generation":2,"shard":"0008"},"000000067F00004005000060F30004DD8000-000000067F00004005000060F30004E40FFC__000001440D3D0C69-0000014784964B91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005010F44EB0100000000-000000067F00004005010F57CB000000C000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003BCC000-000000067F00004005000060F30003C08000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005B80000-000000067F00004005000060F30005B89170__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000135FCAD-000000067F00004005016EA00C000144FB4E__000001A931C135B1-000001AC25760149":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005010660F500000B0000-000000067F00004005010660F500000B4000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000D31030-000000067F00004005000060F30100000000__0000003D03FCCDB9-000000417D21ACF9":{"file_size":233791488,"generation":2,"shard":"0008"},"000000067F00004005000060F30002C18FAE-000000067F00004005000060F30002C71F27__000000C824C09619-000000CC13D2E549":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000041FB53-000000067F0000400500EB4A480000447A64__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC0000048000-000000067F0000400500EE16BC000004C000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00009D0000-000000067F00004005000060FB00009D4000__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100004365FE-000000067F00004005000060F20100000000__00000047F1F2B800":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30006BAD108-000000067F00004005000060F30006C0E146__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300006B4000-000000067F00004005000060F300006E0000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000327C000-000000067F00004005000060F3000328FA4E__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003B94000-000000067F00004005000060F30003BC8000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003CB8FCF-000000067F00004005000060F30003CCA0B9__00000117EDA82C11-0000011B632CC319":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003EA902F-000000067F00004005000060F30003F72201__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30004C64000-000000067F00004005000060F30004C80000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000194000-000000067F000040050081DB4300001C8000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB01FFFFFFFF-000000067F00004005000060FB0300000000__0000018613A0DEA9-00000186146441F1":{"file_size":73728,"generation":5,"shard":"0008"},"000000067F00004005000060F300038B5F5B-000000067F00004005000060F300038FF04F__0000010779A7F551-0000010A5E65DF39":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001C8000-000000067F000040050081DB4300001CC000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500C782E40000137F10-000000067F0000400500C782E40000177E20__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000139C000-000000067F00004005000060FB00013B8000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000447A64-000000067F0000400500EB4A480100000000__000000FCCD5238B1-000000FF8B261599":{"file_size":40550400,"generation":2,"shard":"0008"},"000000067F00004005000060F70000418000-000000067F00004005000060F700004405CF__0000003D03FCCDB9-000000417D21ACF9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F000040050081DB430000728000-000000067F000040050081DB43000072C000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300014B0F7B-000000067F00004005000060F30100000000__000000601F43CF09-000000636DE92159":{"file_size":83951616,"generation":2,"shard":"0008"},"000000067F00004005000060F30005F3303F-000000067F00004005000060F30005FA40AD__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300012442A9-000000067F00004005000060F3000129D29A__00000057593D8169-0000005C01565329":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010B14AB-000000067F000040050081DB430100000000__000000D037B2DBD0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00014CF88D-000000067F00004005016EA00C00014D7727__000001A931C135B1-000001AC25760149":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30006A0B44C-000000067F00004005000060F30006A7C566__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F1000062EE46-000000067F00004005000060F20100000000__000000698F2C3A38":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001CE0000-000000067F00004005016EA00C0001CE4000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F30000250000-000000067F00004005000060F30000254000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300050E8000-000000067F00004005000060F300050EC000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000259F4A3-000000067F00004005000060F30100000000__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":44433408,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A640EA-000000067F000040050081DB430000A8E15E__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003050000-000000067F00004005000060F30003061089__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C0000158000-000000067F0000400500F3A25C000016A065__0000010779A7F551-0000010A5E65DF39":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010A4000-000000067F000040050081DB4300010B14AB__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC00001E0000-000000067F0000400500EE16BC00001E4000__00000104BD37F348":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300055B8000-000000067F00004005000060F300055BC000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000CE4000-000000067F00004005016EA00C0000D30000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30003640000-000000067F00004005000060F30003644000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000014F7AC-000000067F0000400500EB4A4800001876BD__000000F6661C9241-000000F901689359":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001CD338E-000000067F00004005016EA00C0001CE79E0__000001BA93C39481-000001BCB572A4E1":{"file_size":268451840,"generation":17,"shard":"0008"},"000000067F00004005000060FB0001530B44-000000067F00004005000060FB0001541688__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300031D516C-000000067F00004005000060F30100000000__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":137863168,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00019C7A6A-000000067F00004005016EA00C00019F7907__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000E7F7A7-000000067F00004005016EA00C0000F3F647__0000019E2C5DCEE1-000001A1DD8B4481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F300032C0000-000000067F00004005000060F300032F1113__000000E4C63CFA21-000000E7C2F1B249":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00006E0000-000000067F00004005016EA00C00006E4000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F7000019EA78-000000067F00004005000060F80100000000__0000001737D88379-0000001B59EEB909":{"file_size":50946048,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001B4FBC9-000000067F00004005016EA00C0001BBFA66__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001660000-000000067F00004005000060FB0001680B45__000000698AF6E809-0000006DDB29D589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002BAA1DD-000000067F00004005000060F30100000000__000000C462B3C2A9-000000C824C09619":{"file_size":203554816,"generation":2,"shard":"0008"},"000000067F00004005000060F300049B26A8-000000067F00004005000060F300049CB712__000001398B56A519-0000013C9C0E3339":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000CCB5CD-000000067F00004005000060F70000CDBB9C__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000EEA075-000000067F000040050081DB430000F0C0E9__000000C462B3C2A9-000000C824C09619":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300003E0000-000000067F00004005000060F300003E8FBC__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006C9C000-000000067F00004005000060F30006CA0000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000C7C000-000000067F00004005000060F70000C8CD0C__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001148000-000000067F00004005000060FB000114C000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001232ACF-000000067F00004005000060F80100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000FE8000-000000067F00004005000060F700010105DB__000000E4C63CFA21-000000E7C2F1B249":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000355928-000000067F0000400500EB4A480100000000__000000FCD84FE628":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F700003FE341-000000067F00004005000060F80100000000__0000003D2AB09B68":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000244D189-000000067F00004005000060F30100000000__000000A9EB8C4489-000000ACA44C8E99":{"file_size":212566016,"generation":2,"shard":"0008"},"000000067F00004005000060F700003B85C7-000000067F00004005000060F80100000000__0000003579F03331-0000003959DA2DE9":{"file_size":208945152,"generation":2,"shard":"0008"},"000000067F00004005000060F100005A2B80-000000067F00004005000060F20100000000__000000603CA8F2F0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB000070C000-000000067F00004005000060FB0000718000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB01FFFFFFFF-000000067F00004005000060FB0300000000__00000186146441F1-0000018624969469":{"file_size":24576,"generation":6,"shard":"0008"},"000000067F00004005000060FB000180C000-000000067F00004005000060FB0001838000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC0000044000-000000067F0000400500EE16BC0000048000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10100000000-000000067F00004005000060F10300000000__000000A583FBFB91-000000A9EB8C4489":{"file_size":483328,"generation":2,"shard":"0008"},"000000067F00004005000060F30004EA41A5-000000067F00004005000060F30004EC52E9__000001440D3D0C69-0000014784964B91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003AB9907-000000067F00004005000060F30003AF28CB__0000010FB1BE19B9-00000113456156F1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000974000-000000067F00004005000060FB00009D0000__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300038720A2-000000067F00004005000060F300038A3082__000001048B25A8E9-0000010779A7F551":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000452BA1-000000067F000040050081DB4300004C4C1E__000000923719A971-00000096262826C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300017AA0CE-000000067F00004005000060F30100000000__0000006DDB29D589-000000722F474369":{"file_size":202719232,"generation":2,"shard":"0008"},"000000067F000040050081DB430000504000-000000067F000040050081DB430000560000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004B5431C-000000067F00004005000060F30004B654F6__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000C20000-000000067F00004005000060F30000C24000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300028920E4-000000067F00004005000060F30100000000__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":200351744,"generation":2,"shard":"0008"},"000000067F000040050081DB4300004C4C1E-030000000000000000000000000000000002__000000923719A971-00000096262826C9":{"file_size":192356352,"generation":2,"shard":"0008"},"000000067F000040050081DB430000190000-000000067F000040050081DB430000194000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000E88000-000000067F000040050081DB430000E8C000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000738000-000000067F00004005016EA00C000073C000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB430000578EE6-000000067F000040050081DB43000058AF5E__0000009A1ABDE921-0000009DF02C1241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30001C38000-000000067F00004005000060F30001C3C000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000B7C0EA-030000000000000000000000000000000002__000000B2B5C4E8F9-000000B768469051":{"file_size":133464064,"generation":2,"shard":"0008"},"000000067F00004005000060F3000625B8F0-000000067F00004005000060F30100000000__0000016B49A934C1-0000016E1FBB7B99":{"file_size":139640832,"generation":2,"shard":"0008"},"000000067F00004005000060FB000109C000-000000067F00004005000060FB0001110000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB572DFF9-000001BCB5730259":{"file_size":24576,"generation":41,"shard":"0008"},"000000067F00004005000060FB0000AA8000-000000067F00004005000060FB0000AD0B45__0000003203FB5749-0000003579F03331":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300043F8000-000000067F00004005000060F300043FC000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800003C7C42-000000067F0000400500EB4A48000041FB53__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005BA213F-000000067F00004005000060F30005BDB15B__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300063FE10E-000000067F00004005000060F30100000000__0000016E1FBB7B99-000001715E483C79":{"file_size":111067136,"generation":2,"shard":"0008"},"000000067F00004005000060F30000F91FFF-000000067F00004005000060F30000F9B026__00000047E31D98D1-0000004C49155071":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003650000-000000067F00004005000060F30003654000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300050A412B-000000067F00004005000060F300050B5199__0000014784964B91-0000014B000D1821":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001D78000-000000067F00004005016EA00C0001D7C000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005016EA00C0001244000-000000067F00004005016EA00C0001298000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F100001FC000-000000067F00004005000060F10000200000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000CA0000-000000067F00004005016EA00C0000CA4000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F3000498DC49-000000067F00004005000060F50100000000__00000139CF156B58":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F60000036EA0-000000067F00004005000060F60100000000__0000009A24DF6768":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000928B45-000000067F00004005000060FB000097168A__00000028C365FBE1-0000002D2A8E0B81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006854000-000000067F00004005000060F30006858000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050109FFA2000000C3F5-030000000000000000000000000000000002__00000117EDA82C11-0000011B632CC319":{"file_size":226066432,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A6D1B3-000000067F00004005000060F30100000000__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":117620736,"generation":2,"shard":"0008"},"000000067F00004005000060F30002D2C000-000000067F00004005000060F30002D80000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A31FB6-000000067F00004005000060F30003A3B020__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000160723E-000000067F00004005016EA00C00016570D9__000001AC25760149-000001AFC313C819":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500FB3D310000018000-000000067F0000400500FB3D31000001C000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001708000-000000067F00004005000060F7000170C000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000283C3E7-000000067F00004005000060F50100000000__000000BAC0041E18":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB00018F0000-000000067F00004005000060FB0100000000__00000075CC373F31-00000079F2A2F311":{"file_size":268959744,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000EC8000-000000067F00004005000060FB0000ECC000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000F9C000-000000067F00004005016EA00C0000FF0000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30002680F9D-000000067F00004005000060F3000274A080__000000B2B5C4E8F9-000000B768469051":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000679C000-000000067F00004005000060F300067A0000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000428313F-000000067F00004005000060F300042CC1BD__0000012694E36301-0000012A3F140591":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00000FFFFFFFF-030000000000000000000000000000000002__00000186146441F1-0000018624969469":{"file_size":24576,"generation":6,"shard":"0008"},"000000067F00004005000060FB00017D8000-000000067F00004005000060FB00017DC000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700017FC000-000000067F00004005000060F70001828000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002FD317C-000000067F00004005000060F30002FF427D__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001701588-000000067F00004005000060FB00017120CE__0000006DDB29D589-000000722F474369":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500000A3000FFFFFFFF-000000067F0000400500000A690000000002__000001BA93C39481-000001BCB572A4E1":{"file_size":40960,"generation":17,"shard":"0008"},"000000067F00004005000060FB0000638B45-030000000000000000000000000000000002__0000001B59EEB909-0000001FFBC01501":{"file_size":252010496,"generation":2,"shard":"0008"},"000000067F000040050081DB430000394000-000000067F000040050081DB4300003A8000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001CF0197-000000067F00004005000060F50100000000__0000008DDCD70B68":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800000DFB51-000000067F0000400500EB4A4800000E7A62__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000014C000-000000067F00004005000060F70000180000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005948000-000000067F00004005000060F300059790CD__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000853115-000000067F00004005000060F60100000000__00000023FEF9F321-00000028C365FBE1":{"file_size":176136192,"generation":2,"shard":"0008"},"000000067F00004005000060F30004884000-000000067F00004005000060F30004888000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000513C000-000000067F00004005000060F30005160000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C000017C000-000000067F0000400500F3A25C00001B850B__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006382F14-000000067F00004005000060F3000638C06D__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500E3A2A10000017F02-000000067F0000400500E3A2A100000B7E04__000000E7C2F1B249-000000EBC9213D59":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001000B44-000000067F00004005000060FB0001009688__0000004C49155071-0000004F31878919":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D69D790100000000-000000067F0000400500DBCED50000024000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010A0000-000000067F000040050081DB4300010A4000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000310000-000000067F00004005000060FB0000348B45__0000000D55A212C9-000000114A805939":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F60000060038-000000067F00004005000060F60100000000__000000F91FE84F08":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30001CE0000-000000067F00004005000060F30001CE4000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300000AA080-000000067F000040050081DB4300000D40FF__00000075CC373F31-00000079F2A2F311":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000551689-030000000000000000000000000000000002__0000001737D88379-0000001B59EEB909":{"file_size":227418112,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000D90000-000000067F00004005000060FB0100000000__0000003D03FCCDB9-000000417D21ACF9":{"file_size":272769024,"generation":2,"shard":"0008"},"000000067F00004005000060F300059CC403-000000067F00004005000060F300059F53C6__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30001F2C000-000000067F00004005000060F30001F30000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000014000-000000067F00004005000060FB0000084772__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F00004005000060F30004B654F6-000000067F00004005000060F30004BAE526__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002450000-000000067F00004005000060F30002454000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A0F066-000000067F00004005000060F50100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F60000032EBE-000000067F00004005000060F60100000000__0000008DDCD70B68":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB00001D8000-000000067F00004005000060FB00001DC000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000670000-000000067F00004005016EA00C0000674000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0001344000-000000067F00004005016EA00C0001358000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000D30000-000000067F00004005016EA00C0000D34000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C000012FE9A-000000067F00004005016EA00C00001F7D38__000001880F984A29-0000018C496B6DB1":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F70000BF0000-000000067F00004005000060F70100000000__000000B2B5C4E8F9-000000B768469051":{"file_size":273809408,"generation":2,"shard":"0008"},"000000067F00004005000060F300005A0000-000000067F00004005000060F3000067114B__0000001B59EEB909-0000001FFBC01501":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000021C000-000000067F0000400500EB4A480000290000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000F3C000-000000067F00004005016EA00C0000F58000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C000074F43B-030000000000000000000000000000000002__000001936E73D028":{"file_size":139264,"generation":11,"shard":"0008"},"000000067F00004005010F57CB000000C000-000000067F00004005010F99A50100000000__00000126C3C69FC0":{"file_size":22978560,"generation":2,"shard":"0008"},"000000067F00004005000060F700017E1391-000000067F00004005000060F80100000000__0000013C9C0E3339-0000013FEFA7D709":{"file_size":232677376,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001CC74D7-000000067F00004005016EA00C0001CD7376__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F700005C85CE-000000067F00004005000060F700005E8B9D__00000057593D8169-0000005C01565329":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FCD352-000000067F00004005000060F30100000000__0000011F1A40FA69-00000122A7BB7B29":{"file_size":124788736,"generation":2,"shard":"0008"},"000000067F0000400500C782E400002A5E4B-000000067F0000400500C782E400002CDD5C__000000D31E48D7C9-000000D74E29AAD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700018871D6-000000067F00004005000060F80100000000__000001444EB7FC10":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30003D252C8-000000067F00004005000060F30100000000__00000117EDA82C11-0000011B632CC319":{"file_size":205963264,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001408A62-000000067F00004005000060FB00014195A7__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500C782E400001B7C41-000000067F0000400500C782E400001C7B51__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000110000-000000067F00004005000060FB0100000000__000000044854EBD1-00000008B6B51879":{"file_size":272613376,"generation":2,"shard":"0008"},"000000067F00004005000060F300004E8000-000000067F00004005000060F60100000000__0000001737D88379-0000001B59EEB909":{"file_size":260579328,"generation":2,"shard":"0008"},"000000067F00004005000060F30006DF4000-000000067F00004005000060F30006E30000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000C84000-030000000000000000000000000000000002__000000BAC0041E18":{"file_size":59998208,"generation":2,"shard":"0008"},"000000067F00004005000060F30002B88FF2-000000067F00004005000060F30002BAA1DD__000000C462B3C2A9-000000C824C09619":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000434000-000000067F00004005000060FB00004A0000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004DA8000-000000067F00004005000060F30004DAC000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300004E0000-000000067F000040050081DB4300004E4000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC00001E4000-000000067F0000400500EE16BC0000201716__00000104BD37F348":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000C440EA-000000067F000040050081DB430000C5E15B__000000B768469051-000000BAB1E56C91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D69D7900000BDAF5-000000067F0000400500D69D790100000000__000000EFDE07FFD8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A9C000-000000067F00004005000060F30002AEED02__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004DAC000-000000067F00004005000060F30004DD8000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B94000-000000067F00004005000060F70000B98000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002454000-000000067F00004005000060F30002460000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100001059CB-000000067F00004005000060F10000125BF2__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000D362CA-000000067F00004005016EA00C0000DB7D33__0000019E2C5DCEE1-000001A1DD8B4481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30001C0A0A3-000000067F00004005000060F30100000000__0000008625CF2891-00000089F4693119":{"file_size":203063296,"generation":2,"shard":"0008"},"000000067F00004005000060F300066F0000-000000067F00004005000060F300066F4000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001414000-000000067F00004005000060F70001428000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300014CC16D-000000067F00004005000060F300014D5280__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000172AC12-030000000000000000000000000000000002__0000006DDB29D589-000000722F474369":{"file_size":186875904,"generation":2,"shard":"0008"},"000000067F000040050081DB430000E4C000-000000067F000040050081DB430000E88000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300063A50CD-000000067F00004005000060F300063FE10E__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005419E9C-000000067F00004005000060F3000542AFB0__0000015304A396B9-0000015670D6AFD9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC000014158C-030000000000000000000000000000000002__000000F901689359-000000FCCD5238B1":{"file_size":67854336,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00015FF3A0-000000067F00004005016EA00C000160723E__000001AC25760149-000001AFC313C819":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005016EA00C00008E760F-000000067F00004005016EA00C00009274AB__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F70000B98000-000000067F00004005000060F70000B9C000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00004A4000-000000067F00004005000060FB00004E1FF6__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006670000-000000067F00004005000060F30006674000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000185EE9-000000067F00004005000060F7000018E4B6__0000001737D88379-0000001B59EEB909":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D19D030000067CA9-030000000000000000000000000000000002__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":29319168,"generation":2,"shard":"0008"},"000000067F0000400500FF2A51000000BFFB-030000000000000000000000000000000002__0000010D77B487A0":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F30004A048A8-000000067F00004005000060F30004A1D870__000001398B56A519-0000013C9C0E3339":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300004BC000-000000067F00004005000060F300004C6B83__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005290FC9-000000067F00004005000060F3000533205E__0000014EC58A4A79-0000015304A396B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300031130BC-000000067F00004005000060F300031C40D1__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D19D030000047EE2-000000067F0000400500D19D03000004FDC6__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A44000-000000067F00004005000060F30002A48000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003DAE2DC-000000067F00004005000060F30003DD734C__0000011B632CC319-0000011F1A40FA69":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F8E3A50000014000-000000067F0000400500F8E3A5000004A25C__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100002F03E9-000000067F00004005000060F20100000000__000000321AA80270":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70001138000-000000067F00004005000060F80100000000__000000FCCD5238B1-000000FF8B261599":{"file_size":72695808,"generation":2,"shard":"0008"},"000000067F00004005000060F300056E4000-000000067F00004005000060F50100000000__00000159B010F6C0":{"file_size":13393920,"generation":2,"shard":"0008"},"000000067F00004005000060F70000A7C000-000000067F00004005000060F70000ABD9C4__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000CC6E51-030000000000000000000000000000000002__0000003D2AB09B68":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F60000091EFF-000000067F00004005000060F60100000000__0000014EDD256548":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000008FC41-000000067F0000400500EB4A4800000DFB51__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30001F363B4-000000067F00004005000060F30001F574A6__0000009A1ABDE921-0000009DF02C1241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001CD0000-000000067F00004005016EA00C0001CD4000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F300059B324D-000000067F00004005000060F300059CC403__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002530000-000000067F00004005000060F30002534000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000004B633-000000067F00004005000060F60100000000__000000C483D0D6B8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F700011E0000-000000067F00004005000060F80100000000__0000010779A7F551-0000010A5E65DF39":{"file_size":262922240,"generation":2,"shard":"0008"},"000000067F00004005000060F30006690000-000000067F00004005000060F30006694000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000100E18-000000067F00004005000060F700001213F2__0000000D55A212C9-000000114A805939":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F0000400500FF2A510000004000-000000067F0000400500FF2A51000000BFFB__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000EB8000-000000067F00004005000060FB0000EBC000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000674000-000000067F00004005016EA00C00006B0000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70000EF85D6-000000067F00004005000060F80100000000__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":262897664,"generation":2,"shard":"0008"},"000000067F00004005000060F700005E8B9D-000000067F00004005000060F700005F9158__00000057593D8169-0000005C01565329":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30004E40FFC-000000067F00004005000060F30004E7A062__000001440D3D0C69-0000014784964B91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000037E20-000000067F0000400500EB4A480000057D31__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400501101C0901FFFFFFFF-030000000000000000000000000000000002__0000012E71CF31F9-000001334140FC21":{"file_size":65060864,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B10000-000000067F00004005000060F70100000000__000000A583FBFB91-000000A9EB8C4489":{"file_size":272646144,"generation":2,"shard":"0008"},"000000067F00004005000060F300056E104B-000000067F00004005000060F3000570A19E__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300059790CD-000000067F00004005000060F300059AA115__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B54000-000000067F00004005000060F70000B90000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300041D9101-000000067F00004005000060F3000424A099__0000012694E36301-0000012A3F140591":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700000E085E-000000067F00004005000060F70000100E18__0000000D55A212C9-000000114A805939":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300051B0000-000000067F00004005000060F300051B4000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB572A4E1-000001BCB572C329":{"file_size":24576,"generation":17,"shard":"0008"},"000000067F00004005000060F30006D30000-000000067F00004005000060F30006D34000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FDA1F80000020D42-000000067F0000400500FDA1F80100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F000040050081D80C0100000000-000000067F000040050081DB430000024000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F600000235B4-000000067F00004005000060F60100000000__000000603CA8F2F0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500C782E400000A0000-000000067F0000400500C782E400000A4000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002264247-000000067F00004005000060F50100000000__000000A5A3F27398":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000302C2D6-000000067F00004005000060F50100000000__000000DBD29DC248":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000129C000-000000067F00004005016EA00C0001340000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F700016E8000-000000067F00004005000060F700016EC000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300023A0000-000000067F00004005000060F300023B0FF7__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F20100000000-000000067F00004005000060F3000000C000__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F00004005016EA00C0000374000-000000067F00004005016EA00C00003E0000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70000368000-000000067F00004005000060F80100000000__0000003203FB5749-0000003579F03331":{"file_size":263249920,"generation":2,"shard":"0008"},"000000067F000040050081DB4300006310C9-030000000000000000000000000000000002__0000009A1ABDE921-0000009DF02C1241":{"file_size":208953344,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000DC8000-000000067F00004005000060FB0000DE8B45__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000530000-000000067F00004005000060FB0000538B44__0000001737D88379-0000001B59EEB909":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000024000-000000067F000040050081DB430000028000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000488C000-000000067F00004005000060F30004898000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300044D3639-000000067F00004005000060F50100000000__0000012E77D3BF00":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005010450640000000570-000000067F0000400501046F39000000BDD2__0000010FB1BE19B9-00000113456156F1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300021050B0-000000067F00004005000060F3000212E160__0000009DF02C1241-000000A173C00489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700010DD440-000000067F00004005000060F80100000000__000000F309FCDD19-000000F6661C9241":{"file_size":91758592,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000AD0B45-000000067F00004005000060FB0000AE168A__0000003203FB5749-0000003579F03331":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000013B18E-000000067F00004005000060F7000014B73D__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001938000-000000067F00004005016EA00C000193FE9D__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500C782E400000A4000-000000067F0000400500C782E4000012A71E__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001A40000-000000067F00004005000060F30001A44000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00008578D4-000000067F00004005016EA00C00008CF772__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30001CC0000-000000067F00004005000060F30001CC4000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004D20000-000000067F00004005000060F30004D24000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00003E8000-000000067F00004005016EA00C00003EC000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F300039C4000-000000067F00004005000060F300039F8000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005164000-000000067F00004005000060F300051B0000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300039F8000-000000067F00004005000060F300039FC000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010F46BD-000000067F000040050081DB430100000000__000000D31E48D7C9-000000D74E29AAD1":{"file_size":113999872,"generation":2,"shard":"0008"},"000000067F00004005000060F30002E630CF-000000067F00004005000060F30100000000__000000D31E48D7C9-000000D74E29AAD1":{"file_size":171999232,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000ACF305-000000067F00004005016EA00C0000ADF1AB__00000196C9018F59-0000019A2EAFE7A9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30006748000-000000067F00004005000060F3000674C000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003810000-000000067F00004005000060F50100000000__00000104BD37F348":{"file_size":11739136,"generation":2,"shard":"0008"},"000000067F00004005000060F1000021C000-000000067F00004005000060F20100000000__0000002427BD8BD0":{"file_size":132448256,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00017EC000-000000067F00004005016EA00C00018C0000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F7000025DA3C-000000067F00004005000060F80100000000__0000002427BD8BD0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB00007F0000-000000067F00004005000060FB0000860B45__00000023FEF9F321-00000028C365FBE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FF0000-000000067F00004005000060F30003FF4000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000E0AD15-000000067F00004005000060FB0000E1B859__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005010ADFA80000004000-000000067F00004005010F2BD40100000000__00000126C3C69FC0":{"file_size":13369344,"generation":2,"shard":"0008"},"000000067F00004005000060F30004898000-000000067F00004005000060F3000489C000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003D2B1B0-000000067F00004005000060F30003D44283__0000011B632CC319-0000011F1A40FA69":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000FF4000-000000067F00004005016EA00C0001188000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005010F99A50100000000-000000067F00004005010F9F120000004000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001F34000-000000067F00004005000060F30001F38F48__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700018A0000-000000067F00004005000060F700018D85CA__000001440D3D0C69-0000014784964B91":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300029A526C-000000067F00004005000060F300029C623C__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00017DC000-000000067F00004005000060FB0001808000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED50000024000-000000067F0000400500DBCED50000028000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC0000201716-000000067F0000400500EE16C40100000000__0000012A77C1B0B0":{"file_size":32768,"generation":2,"shard":"0008"},"000000067F00004005000060F30006D10000-000000067F00004005000060F30006D14000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430001064000-000000067F000040050081DB4300010A0000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C01FFFFFFFF-000000067F0000400500F3A25C0300000000__0000011B632CC319-0000011F1A40FA69":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30001340000-000000067F00004005000060F30001344000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003E98000-000000067F00004005000060F30003EA902F__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006C0E146-000000067F00004005000060F30006C8729E__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F600000166C4-000000067F00004005000060F60100000000__0000003D03FCCDB9-000000417D21ACF9":{"file_size":54165504,"generation":2,"shard":"0008"},"000000067F00004005000060F10000180000-000000067F00004005000060F1000018821A__0000001737D88379-0000001B59EEB909":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000193FE9D-000000067F00004005016EA00C0001967D34__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F000040050081DB43000076C000-000000067F000040050081DB430000778000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300050321C0-000000067F00004005000060F30005063187__0000014784964B91-0000014B000D1821":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000D4000-000000067F0000400500DBCED500000F0000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300004B8000-000000067F00004005000060F300004BC000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000022C000-000000067F00004005000060FB0000280000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000DF968A-000000067F00004005000060FB0000E021D0__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000228000-000000067F00004005000060FB000022C000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00015D8000-000000067F00004005000060FB00015DC000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005B89170-000000067F00004005000060F30005BA213F__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300043B0000-000000067F00004005000060F300043B4000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300004F8000-000000067F000040050081DB4300004FC000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006860000-000000067F00004005000060F30006864000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000ADA0D0-000000067F00004005000060F30000B0300C__0000003203FB5749-0000003579F03331":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FF2A510000000000-000000067F000040050100D04D000004369C__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C00000BB439-030000000000000000000000000000000002__00000104BD37F348":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001C078FA-000000067F00004005016EA00C0001C0F79A__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F000040050081DB430000B4A075-000000067F000040050081DB430000B7C0EA__000000B2B5C4E8F9-000000B768469051":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000117C10C-000000067F00004005000060F50100000000__00000054161C34B8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000E47BD2-000000067F00004005016EA00C0000E67A6E__0000019E2C5DCEE1-000001A1DD8B4481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30005D23BB5-000000067F00004005000060F50100000000__00000164EA9EC9A8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000336D193-000000067F00004005000060F3000337DCF3__000000E4C63CFA21-000000E7C2F1B249":{"file_size":259473408,"generation":2,"shard":"0008"},"000000067F00004005000060F300001F0000-000000067F00004005000060F300001F4000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000084772-030000000000000000000000000000000002__000000027AF9D7D0":{"file_size":147456,"generation":1,"shard":"0008"},"000000067F00004005016EA00C0001CE79E0-000000067F00004005016EA00C0001D1F87B__000001BA93C39481-000001BCB572A4E1":{"file_size":268451840,"generation":17,"shard":"0008"},"000000067F0000400500EB4A4800FFFFFFFF-000000067F0000400500EB4A480100000000__000000FF8B261599-000001048B25A8E9":{"file_size":1318912,"generation":2,"shard":"0008"},"000000067F00004005000060F70000488000-000000067F00004005000060F7000048C000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000ADF1AB-000000067F00004005016EA00C0100000000__00000196C9018F59-0000019A2EAFE7A9":{"file_size":282132480,"generation":11,"shard":"0008"},"000000067F00004005000060FB000071C000-000000067F00004005000060FB0000793506__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006850000-000000067F00004005000060F30006854000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000390000-000000067F000040050081DB430000394000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000020C000-000000067F00004005000060F30000250000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001398000-000000067F00004005000060FB000139C000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003648000-000000067F00004005000060F3000364C000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500C782E400001C7B51-000000067F0000400500C782E4000023FA62__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001788000-000000067F00004005016EA00C000178C000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB430000C3A075-000000067F000040050081DB430000C440EA__000000B768469051-000000BAB1E56C91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300036FE561-000000067F00004005000060F300038075AF__000000FF8B261599-000001048B25A8E9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D19D03000004FDC6-000000067F0000400500D19D030000067CA9__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C00000-000000067F00004005000060FB0000C04000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000282C000-000000067F00004005000060F3000283C3E7__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00006B0000-000000067F00004005016EA00C00006B4000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30001789027-000000067F00004005000060F300017AA0CE__0000006DDB29D589-000000722F474369":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30004558000-000000067F00004005000060F300045C1062__0000012E71CF31F9-000001334140FC21":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C08000-000000067F00004005000060FB0000C0C000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006DCC000-000000067F00004005000060F30006DF0000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004B221FE-000000067F00004005000060F30004B2B250__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00018C4000-000000067F00004005016EA00C00018E0000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB430000564000-000000067F000040050081DB430000578000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000274A080-000000067F00004005000060F30100000000__000000B2B5C4E8F9-000000B768469051":{"file_size":199057408,"generation":2,"shard":"0008"},"000000067F00004005000060F300046D0EA8-000000067F00004005000060F3000471200E__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001114000-000000067F00004005000060FB0001120000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FEC000-000000067F00004005000060F30003FF0000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10000368000-000000067F00004005000060F10100000000__0000003959DA2DE9-0000003D03FCCDB9":{"file_size":269967360,"generation":2,"shard":"0008"},"000000067F0000400500C782E4000012A71E-030000000000000000000000000000000002__000000D037B2DBD0":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F30006C98000-000000067F00004005000060F30006C9C000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300055BC000-000000067F00004005000060F30005610000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000F050F2-030000000000000000000000000000000002__00000047F1F2B800":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30002484000-000000067F00004005000060F300024D8000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FE8000-000000067F00004005000060F30003FEC000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000A8000-000000067F0000400500DBCED500000AC000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700006C3D76-000000067F00004005000060F80100000000__000000663565F8C9-000000698AF6E809":{"file_size":139821056,"generation":2,"shard":"0008"},"000000067F00004005000060F30002534000-000000067F00004005000060F3000253B7A3__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000412D27C-000000067F00004005000060F30004156457__00000122A7BB7B29-0000012694E36301":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000910000-000000067F00004005000060F700009385D4__0000008DBE2855F9-000000923719A971":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30002510000-000000067F00004005000060F30002514000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002210000-000000067F00004005000060F30002214000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FF4000-000000067F00004005000060F30004070000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001BBFA66-000000067F00004005016EA00C0001C078FA__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F3000424A099-000000067F00004005000060F3000428313F__0000012694E36301-0000012A3F140591":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300036F91FE-000000067F00004005000060F30100000000__000000FCCD5238B1-000000FF8B261599":{"file_size":164118528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000718000-000000067F00004005000060FB000071C000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005010F44EB000000C000-000000067F00004005010F44EB0100000000__00000126C3C69FC0":{"file_size":70696960,"generation":2,"shard":"0008"},"000000067F00004005000060F30005214000-000000067F00004005000060F30005240000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000A7AF6E-030000000000000000000000000000000002__000000321AA80270":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30005063187-000000067F00004005000060F300050A412B__0000014784964B91-0000014B000D1821":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100005E8000-000000067F00004005000060F100005F821C__000000636DE92159-000000663565F8C9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300020830BE-000000067F00004005000060F300020FC052__0000009DF02C1241-000000A173C00489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300065BB235-000000067F00004005000060F300065F42B4__000001715E483C79-000001751A7D7589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FA2AD30000034000-000000067F0000400500FA2AD3000004D85C__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00017A8000-000000067F00004005016EA00C00017AC000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060FB00008D8000-000000067F00004005000060FB0000928B45__00000028C365FBE1-0000002D2A8E0B81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000798000-000000067F00004005000060F300007C1007__00000023FEF9F321-00000028C365FBE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D19D030000040000-000000067F0000400500D19D030000047EE2__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30001AB1583-000000067F00004005000060F50100000000__00000081AA3C40F0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30001AD8000-000000067F00004005000060F30001B09104__0000008196C976A1-0000008625CF2891":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000E1B859-030000000000000000000000000000000002__000000417D21ACF9-00000044B4679349":{"file_size":156844032,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001E9C000-000000067F00004005000060FB0001EA8000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001374000-000000067F00004005000060FB0001398000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000155C000-000000067F00004005000060FB0001590000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C00000EA069-000000067F0000400500F3A25C000010C0D1__000001048B25A8E9-0000010779A7F551":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000568C000-000000067F00004005000060F30005698000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C74000-000000067F00004005000060FB0000C98000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700004F0000-000000067F00004005000060F80100000000__00000047E31D98D1-0000004C49155071":{"file_size":264921088,"generation":2,"shard":"0008"},"000000067F00004005000060F30005598000-000000067F00004005000060F3000559C000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001429534-000000067F00004005000060F80100000000__00000122A7BB7B29-0000012694E36301":{"file_size":231964672,"generation":2,"shard":"0008"},"000000067F00004005000060F70000780000-000000067F00004005000060F80100000000__000000722F474369-00000075CC373F31":{"file_size":263340032,"generation":2,"shard":"0008"},"000000067F00004005000060F300019F31AA-000000067F00004005000060F30100000000__00000079F2A2F311-0000007E3A9BFD29":{"file_size":168484864,"generation":2,"shard":"0008"},"000000067F000040050081DB430000822079-000000067F000040050081DB43000082C0F1__000000A583FBFB91-000000A9EB8C4489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300007AC000-000000067F000040050081DB4300007F913A__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005847319-000000067F00004005000060F300058C8000__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":261505024,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001E21687-000000067F00004005000060FB0100000000__000000923719A971-00000096262826C9":{"file_size":224403456,"generation":2,"shard":"0008"},"000000067F00004005000060F30003C98000-000000067F00004005000060F30003CB8FCF__00000117EDA82C11-0000011B632CC319":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB43000045029C-030000000000000000000000000000000002__0000008DBE2855F9-000000923719A971":{"file_size":89505792,"generation":2,"shard":"0008"},"000000067F00004005000060F3000559C000-000000067F00004005000060F300055B8000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000285901B-000000067F00004005000060F300028920E4__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E64000-000000067F00004005000060F30000E70000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300015FB022-000000067F00004005000060F3000160410C__000000698AF6E809-0000006DDB29D589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006FDA081-000000067F00004005000060F30100000000__00000184624E5741-000001860C80A151":{"file_size":202276864,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000107973-000000067F0000400500EE16BC0100000000__000000F309FCDD19-000000F6661C9241":{"file_size":275456000,"generation":2,"shard":"0008"},"000000067F00004005000060F300031C40D1-000000067F00004005000060F300031D516C__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00001F7D38-000000067F00004005016EA00C000020FBCF__000001880F984A29-0000018C496B6DB1":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500FDA1F80100000000-000000067F0000400500FF2A510000004000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001182EC9-000000067F00004005000060F80100000000__000000FF8B261599-000001048B25A8E9":{"file_size":174284800,"generation":2,"shard":"0008"},"000000067F00004005000060F700011528FB-000000067F00004005000060F70001182EC9__000000FF8B261599-000001048B25A8E9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300024DC000-000000067F00004005000060F30002510000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00000B0000-030000000000000000000000000000000002__000000021DC73119-000000044854EBD1":{"file_size":259375104,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001DF0B43-000000067F00004005000060FB0001E21687__000000923719A971-00000096262826C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F10000088000-000000067F00004005000060F10000090000__00000008B6B51879-0000000D55A212C9":{"file_size":264142848,"generation":2,"shard":"0008"},"000000067F00004005000060F30003968000-000000067F00004005000060F3000396C000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00017AC000-000000067F00004005016EA00C00017E8000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F1000019C73D-000000067F00004005000060F20100000000__0000001B59EEB909-0000001FFBC01501":{"file_size":124698624,"generation":2,"shard":"0008"},"000000067F00004005000060F700001F8000-000000067F00004005000060F700002005D2__0000001B59EEB909-0000001FFBC01501":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001110000-000000067F00004005000060FB0001114000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F1000019842A-000000067F00004005000060F20100000000__0000001737D88379-0000001B59EEB909":{"file_size":145137664,"generation":2,"shard":"0008"},"000000067F00004005000060F700003BC000-000000067F00004005000060F700003C0000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000280000-000000067F00004005000060FB0000284000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED5000007C000-000000067F0000400500DBCED500000A8000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB5732691-000001BCB5734CD9":{"file_size":24576,"generation":239,"shard":"0008"},"000000067F00004005010660F70100000000-000000067F000040050107B547000006C000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000C24000-000000067F00004005000060F30000CA0000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000569C000-000000067F00004005000060F300056D8000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00000C7A73-030000000000000000000000000000000002__0000018624969469-000001880F984A29":{"file_size":40566784,"generation":11,"shard":"0008"},"000000067F00004005000060F30001344000-000000067F00004005000060F30001358000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001F38F48-000000067F00004005000060F50100000000__0000009A24DF6768":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30001760000-000000067F00004005000060F30001789027__0000006DDB29D589-000000722F474369":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F1000018821A-000000067F00004005000060F1000019842A__0000001737D88379-0000001B59EEB909":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300059AA115-000000067F00004005000060F300059B324D__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001400000-000000067F00004005000060FB0001404000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800000E7A62-000000067F0000400500EB4A480000107973__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000498000-000000067F00004005000060F3000049C000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D24000-000000067F00004005000060F70000D38000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB43000120E409-000000067F000040050081DB430300000000__0000018613F0A050":{"file_size":24576,"generation":3,"shard":"0008"},"000000067F00004005000060FB0001A8A1CD-000000067F00004005000060FB0100000000__0000007E3A9BFD29-0000008196C976A1":{"file_size":199622656,"generation":2,"shard":"0008"},"000000067F00004005000060F30006270000-000000067F00004005000060F50100000000__0000016E41E03CA0":{"file_size":71114752,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000BAAD15-030000000000000000000000000000000002__0000003579F03331-0000003959DA2DE9":{"file_size":182321152,"generation":2,"shard":"0008"},"000000067F00004005000060F700016205B5-000000067F00004005000060F80100000000__0000012E71CF31F9-000001334140FC21":{"file_size":266862592,"generation":2,"shard":"0008"},"000000067F00004005000060F300030C0FE5-000000067F00004005000060F30003102107__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00004BC000-000000067F00004005016EA00C00004E8000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F10000440000-000000067F00004005000060F1000046821B__00000047E31D98D1-0000004C49155071":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F000040050081DB4300009C8000-000000067F000040050081DB4300009CC000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000106C000-000000067F00004005000060F700010AABC7__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000367733F-000000067F00004005000060F50100000000__000000F91FE84F08":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000478000-000000067F00004005016EA00C000047C000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30002E4104A-000000067F00004005000060F30002E4A157__000000D31E48D7C9-000000D74E29AAD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001370000-000000067F00004005000060FB0001374000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004B1111A-000000067F00004005000060F30004B221FE__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000029C000-000000067F00004005016EA00C00002D0000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F30001C3C000-000000067F00004005000060F30001CC0000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000136C000-000000067F00004005000060FB0001370000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10000488000-000000067F00004005000060F10100000000__0000004C49155071-0000004F31878919":{"file_size":268754944,"generation":2,"shard":"0008"},"000000067F00004005000060F30000B0300C-000000067F00004005000060F60100000000__0000003203FB5749-0000003579F03331":{"file_size":212885504,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001C0F79A-000000067F00004005016EA00C0001C3F636__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F3000399C000-000000067F00004005000060F300039A0000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001574000-000000067F00004005000060F700015A195C__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005B00697-000000067F00004005000060F30100000000__0000015DD1D3C809-0000016143292911":{"file_size":282025984,"generation":2,"shard":"0008"},"000000067F00004005000060F300050C8000-000000067F00004005000060F300050CC000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700000885C5-000000067F00004005000060F80100000000__000000044854EBD1-00000008B6B51879":{"file_size":253878272,"generation":2,"shard":"0008"},"000000067F00004005000060F30001407F7A-000000067F00004005000060F50100000000__000000603CA8F2F0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B90000-000000067F00004005000060F70000B94000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000560000-000000067F000040050081DB430000564000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001720000-000000067F00004005000060F700017405D4__000001398B56A519-0000013C9C0E3339":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300043CC000-000000067F00004005000060F300043F8000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000129D29A-000000067F00004005000060F30100000000__00000057593D8169-0000005C01565329":{"file_size":110788608,"generation":2,"shard":"0008"},"000000067F00004005000060F300003F9F83-000000067F00004005000060F30000402F4A__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001940000-000000067F00004005000060F700019685CE__0000014784964B91-0000014B000D1821":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300043B8000-000000067F00004005000060F300043BC000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000370FD1-000000067F00004005000060F60100000000__0000000D55A212C9-000000114A805939":{"file_size":232144896,"generation":2,"shard":"0008"},"000000067F00004005000060F30003849093-000000067F00004005000060F300038720A2__000001048B25A8E9-0000010779A7F551":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100003C0432-000000067F00004005000060F20100000000__0000003D03FCCDB9-000000417D21ACF9":{"file_size":262701056,"generation":2,"shard":"0008"},"000000067F00004005000060F700014F85DF-000000067F00004005000060F70001510BBE__0000012694E36301-0000012A3F140591":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F3000253B7A3-000000067F00004005000060F50100000000__000000AFE87558B0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001404000-000000067F00004005000060FB0001408000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003F942CF-000000067F00004005000060F30003FCD352__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000B38000-000000067F00004005000060FB0000B58B45__0000003579F03331-0000003959DA2DE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B505C8-000000067F00004005000060F80100000000__000000A9EB8C4489-000000ACA44C8E99":{"file_size":226459648,"generation":2,"shard":"0008"},"000000067F00004005000060F3000612D506-000000067F00004005000060F30006166575__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700000DC000-000000067F00004005000060F700000E0000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FB3D31000000C000-000000067F0000400500FB3D310000018000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB572C329-000001BCB572C481":{"file_size":24576,"generation":19,"shard":"0008"},"000000067F00004005000060F30002828000-000000067F00004005000060F3000282C000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300015B0000-000000067F00004005000060F300015B4000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED50000078000-000000067F0000400500DBCED5000007C000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB43000086E169-030000000000000000000000000000000002__000000A583FBFB91-000000A9EB8C4489":{"file_size":77471744,"generation":2,"shard":"0008"},"000000067F0000400501046F39000000BDD2-000000067F00004005010660F500000161F7__0000010FB1BE19B9-00000113456156F1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FB3D3101FFFFFFFF-000000067F0000400500FB3D310300000000__00000122A7BB7B29-0000012694E36301":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC00000F28ED-030000000000000000000000000000000002__000000F91FE84F08":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30004E9307A-000000067F00004005000060F30004EA41A5__000001440D3D0C69-0000014784964B91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00016D21CF-030000000000000000000000000000000002__000000698AF6E809-0000006DDB29D589":{"file_size":226353152,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800001876BD-000000067F0000400500EB4A48000018F5CD__000000F6661C9241-000000F901689359":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500C782E400002E5B84-030000000000000000000000000000000002__000000DBD29DC248":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D8985C-000000067F00004005000060F70000DA1E38__000000C462B3C2A9-000000C824C09619":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000C28000-000000067F000040050081DB430000C3A075__000000B768469051-000000BAB1E56C91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000407201D-000000067F00004005000060F300040E319D__00000122A7BB7B29-0000012694E36301":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F6000002B3CE-000000067F00004005000060F60100000000__00000075E5D2A930":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D60000-000000067F00004005000060F80100000000__000000C483D0D6B8":{"file_size":133947392,"generation":2,"shard":"0008"},"000000067F00004005000060F70000F705D6-000000067F00004005000060F80100000000__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":259842048,"generation":2,"shard":"0008"},"000000067F00004005000060F30004E7A062-000000067F00004005000060F30004E9307A__000001440D3D0C69-0000014784964B91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006810000-000000067F00004005000060F30006814000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700007D05C8-000000067F00004005000060F80100000000__00000075CC373F31-00000079F2A2F311":{"file_size":251740160,"generation":2,"shard":"0008"},"000000067F00004005000000000000000001-000000067F0000400500000A690000000002__0000018624969469-000001880F984A29":{"file_size":40960,"generation":11,"shard":"0008"},"000000067F00004005000060FB00014D8000-000000067F00004005000060FB0001530B44__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001EA8000-000000067F00004005000060FB0001EAC000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000230A0C7-000000067F00004005000060F30100000000__000000A583FBFB91-000000A9EB8C4489":{"file_size":213680128,"generation":2,"shard":"0008"},"000000067F00004005000060F30000A98000-000000067F00004005000060F30000AC9024__0000003203FB5749-0000003579F03331":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003F72201-000000067F00004005000060F30003F7B254__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000498000-000000067F00004005016EA00C000049C000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30004CB8000-000000067F00004005000060F30004CBC000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300042CC1BD-000000067F00004005000060F300042D51D6__0000012694E36301-0000012A3F140591":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FB3D310000028681-000000067F0000400500FB3D320100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000474302B-000000067F00004005000060F300047EC0CA__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003204000-000000067F00004005000060F30003278000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300024020ED-000000067F00004005000060F3000240B12A__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000216C000-000000067F00004005000060F30002170000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000005DD43-000000067F00004005000060F60100000000__000000EFDE07FFD8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000348B45-000000067F00004005000060FB000037968A__0000000D55A212C9-000000114A805939":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000778000-000000067F000040050081DB43000077C000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300011B4000-000000067F000040050081DB43000120E409__000000DBD29DC248":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003CCA0B9-000000067F00004005000060F30003D0B155__00000117EDA82C11-0000011B632CC319":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00009D4000-000000067F00004005000060FB0000A7AF6E__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700008F0000-000000067F00004005000060F80100000000__00000089F4693119-0000008DBE2855F9":{"file_size":262905856,"generation":2,"shard":"0008"},"000000067F00004005000060F30006CA0000-000000067F00004005000060F30006CA4000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000E021D0-000000067F00004005000060FB0000E0AD15__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003654000-000000067F00004005000060F3000367733F__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000DC0000-000000067F00004005000060F70000DE05C8__000000C824C09619-000000CC13D2E549":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F700018D85CA-000000067F00004005000060F80100000000__000001440D3D0C69-0000014784964B91":{"file_size":260775936,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000EAC000-000000067F00004005000060FB0000EB8000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E70000-000000067F00004005000060F30000E74000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005FE621A-000000067F00004005000060F30005FFF23F__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D20000-000000067F00004005000060F70000D24000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005244000-000000067F00004005000060F3000525C065__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400501025D9001FFFFFFFF-000000067F0000400501025D900300000000__0000011B632CC319-0000011F1A40FA69":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30001CD4000-000000067F00004005000060F30001CE0000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000E77906-000000067F00004005016EA00C0000E7F7A7__0000019E2C5DCEE1-000001A1DD8B4481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F300046B41AA-000000067F00004005000060F30100000000__0000012E71CF31F9-000001334140FC21":{"file_size":199688192,"generation":2,"shard":"0008"},"000000067F000040050100D04D00000634BB-030000000000000000000000000000000002__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":173744128,"generation":2,"shard":"0008"},"000000067F00004005000060F30000CA4000-000000067F00004005000060F30000CB16B6__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004DDC000-000000067F00004005000060F30004DF086C__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005D7F2DE-000000067F00004005000060F30005DA03A8__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300048A0000-000000067F00004005000060F300048A4000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100003954D3-000000067F00004005000060F20100000000__0000003D2AB09B68":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300043BC000-000000067F00004005000060F300043C8000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001D1C000-000000067F00004005016EA00C0001D78000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F100000D8000-000000067F00004005000060F100000E021B__0000000D55A212C9-000000114A805939":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300060A0282-000000067F00004005000060F300060A93B5__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F1000021D8F8-000000067F00004005000060F20100000000__00000023FEF9F321-00000028C365FBE1":{"file_size":88227840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000018000-000000067F00004005000060F3000001C000__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F000040050081DB430000E48000-000000067F000040050081DB430000E4C000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300003E8FBC-000000067F00004005000060F300003F9F83__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30004868000-000000067F00004005000060F3000486C000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700013D0000-000000067F00004005000060F700013E85D1__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001203856-030000000000000000000000000000000002__0000005413AB3641-00000057593D8169":{"file_size":157130752,"generation":2,"shard":"0008"},"000000067F00004005000060F3000029C000-000000067F00004005000060F300002C4887__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005160000-000000067F00004005000060F30005164000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FB3D31000001C000-000000067F0000400500FB3D310000028681__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000029F90B-000000067F00004005016EA00C00002D77AE__000001880F984A29-0000018C496B6DB1":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30003620000-000000067F00004005000060F30100000000__000000F309FCDD19-000000F6661C9241":{"file_size":249372672,"generation":2,"shard":"0008"},"000000067F00004005000060F30003B90000-000000067F00004005000060F30003B94000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300001F4000-000000067F00004005000060F30000208000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001BB8000-000000067F00004005000060F30001C00FE1__0000008625CF2891-00000089F4693119":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005210000-000000067F00004005000060F30005214000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002070F71-000000067F00004005000060F30002079FDE__0000009DF02C1241-000000A173C00489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000B40000-000000067F00004005000060F30000BB103B__0000003579F03331-0000003959DA2DE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F10000290000-000000067F00004005000060F10000298000__00000028C365FBE1-0000002D2A8E0B81":{"file_size":264134656,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00007C7B9C-000000067F00004005016EA00C0000807A34__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001548000-000000067F00004005000060FB000154C000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100005FC000-000000067F00004005000060F1000062EE46__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC00001A0000-000000067F0000400500EE16BC00001A4000__00000104BD37F348":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000F94000-000000067F00004005016EA00C0000F98000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70000290000-000000067F00004005000060F80100000000__00000023FEF9F321-00000028C365FBE1":{"file_size":265764864,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001BC0B44-000000067F00004005000060FB0001BD1689__0000008625CF2891-00000089F4693119":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000337DCF2-000000067F00004005000060F30003386D10__000000E7C2F1B249-000000EBC9213D59":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300045C1062-000000067F00004005000060F3000460202F__0000012E71CF31F9-000001334140FC21":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006814000-000000067F00004005000060F30006850000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000073DFA8-000000067F00004005016EA00C000079FCFA__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005016EA00C000178C000-000000067F00004005016EA00C00017A8000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F1000051D1AE-000000067F00004005000060F20100000000__00000057593D8169-0000005C01565329":{"file_size":103145472,"generation":2,"shard":"0008"},"000000067F00004005000060F300034BD86C-000000067F00004005000060F30100000000__000000EBC9213D59-000000EFA7EAA9E1":{"file_size":95617024,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000008000-000000067F00004005016EA00C000000FEA0__0000018624969469-000001880F984A29":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F1000014C000-000000067F00004005000060F1000015F545__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FB3D300000000EAB-000000067F0000400500FB3D300100000000__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":12976128,"generation":2,"shard":"0008"},"000000067F000040050081DB430000028000-000000067F000040050081DB43000002C000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001BD1689-000000067F00004005000060FB0100000000__0000008625CF2891-00000089F4693119":{"file_size":223690752,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000000000-000000067F0000400500EB4A480000000001__000000FF8B261599-000001048B25A8E9":{"file_size":32768,"generation":2,"shard":"0008"},"000000067F00004005000060F30003D952B0-000000067F00004005000060F30003DAE2DC__0000011B632CC319-0000011F1A40FA69":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B30000-000000067F00004005000060F70000B505C8__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F3000549D0A6-000000067F00004005000060F300055861F2__0000015304A396B9-0000015670D6AFD9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F1000046821B-000000067F00004005000060F20100000000__00000047E31D98D1-0000004C49155071":{"file_size":266969088,"generation":2,"shard":"0008"},"000000067F00004005000060F300043C8000-000000067F00004005000060F300043CC000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001E720A2-000000067F00004005000060F30100000000__000000923719A971-00000096262826C9":{"file_size":141344768,"generation":2,"shard":"0008"},"000000067F000040050081DB4300003A8000-000000067F000040050081DB4300003AC000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700006AB7A6-000000067F00004005000060F700006C3D76__000000663565F8C9-000000698AF6E809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000570A19E-000000067F00004005000060F3000573B206__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003AF28CB-000000067F00004005000060F30003B33945__0000010FB1BE19B9-00000113456156F1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00015CC000-000000067F00004005000060FB00015D8000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500D69D7900000A9CFB-000000067F0000400500D69D7900000D1C5F__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A30000-000000067F00004005000060F30002A34000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000047C000-000000067F00004005000060F30000498000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005FFF23F-000000067F00004005000060F300060A0282__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000029C194-000000067F00004005016EA00C00004EF809__0000018EC67807C9-000001935283F9B9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30006D64000-000000067F00004005000060F30006DC8000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001340000-000000067F00004005016EA00C0001344000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000BB0000-000000067F00004005016EA00C0000BB4000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F0000400500EB4A480000000000-000000067F0000400500EB4A480000007F0F__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500E3A2A10000114000-000000067F0000400500E3A2A1000016321A__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000578000-030000000000000000000000000000000002__0000009A24DF6768":{"file_size":107642880,"generation":2,"shard":"0008"},"000000067F00004005000060F30006798000-000000067F00004005000060F3000679C000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100000E021B-000000067F00004005000060F1000010043F__0000000D55A212C9-000000114A805939":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F000040050081DB430000DA8000-030000000000000000000000000000000002__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":233201664,"generation":2,"shard":"0008"},"000000067F00004005000060F100004EC079-000000067F00004005000060F20100000000__00000054161C34B8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F7000170C000-000000067F00004005000060F70001720000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000FCD85E-000000067F00004005000060F80100000000__000000E4D847F4E0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00015B74FF-000000067F00004005016EA00C00015FF3A0__000001AC25760149-000001AFC313C819":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30000AC9024-000000067F00004005000060F30000ADA0D0__0000003203FB5749-0000003579F03331":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16C40100000000-000000067F0000400500F3A25C000006C000__00000104BD37F348":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500D69D7900000F1B5B-000000067F0000400500D69D790100000000__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":233275392,"generation":2,"shard":"0008"},"000000067F00004005000060F30003C0C000-000000067F00004005000060F30003C257AD__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E44000-000000067F00004005000060F30000E60000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000018E4B6-000000067F00004005000060F7000019EA78__0000001737D88379-0000001B59EEB909":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00017E8000-000000067F00004005016EA00C00017EC000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30003A4C09C-000000067F00004005000060F30003A6D1B3__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100000260F2-000000067F00004005000060F20100000000__000000027AF9D7D0":{"file_size":24576,"generation":1,"shard":"0008"},"000000067F00004005016EA00C0000097BDA-000000067F00004005016EA00C00000C7A73__0000018624969469-000001880F984A29":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500C782E400002CDD5C-030000000000000000000000000000000002__000000D31E48D7C9-000000D74E29AAD1":{"file_size":90923008,"generation":2,"shard":"0008"},"000000067F00004005000060F3000685C000-000000067F00004005000060F30006860000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001C84000-000000067F00004005000060FB0001CE16ED__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000CC4BC2-000000067F000040050081DB430000CD6C36__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006349DA2-000000067F00004005000060F30006382F14__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000212E160-000000067F00004005000060F30100000000__0000009DF02C1241-000000A173C00489":{"file_size":224731136,"generation":2,"shard":"0008"},"000000067F00004005000060F30001FF8691-000000067F00004005000060F30100000000__0000009A1ABDE921-0000009DF02C1241":{"file_size":256114688,"generation":2,"shard":"0008"},"000000067F00004005000060F300067F4000-000000067F00004005000060F30006810000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700015A8000-000000067F00004005000060F700016205B5__0000012E71CF31F9-000001334140FC21":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F0000400500D69D790000024000-000000067F0000400500D69D790000028000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700007AE010-000000067F00004005000060F80100000000__00000075E5D2A930":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000428000-000000067F00004005016EA00C000042C000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30001E74000-000000067F00004005000060F30001F28000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300038FF04F-000000067F00004005000060F30100000000__0000010779A7F551-0000010A5E65DF39":{"file_size":45359104,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001B0FD2A-000000067F00004005016EA00C0001B4FBC9__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30006858000-000000067F00004005000060F3000685C000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002F9A0EB-000000067F00004005000060F30002FD317C__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000808000-000000067F000040050081DB430000822079__000000A583FBFB91-000000A9EB8C4489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00015DC000-000000067F00004005000060FB00015F0000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000021C000-000000067F00004005000060F7000025DA3C__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500D69D79000007C000-000000067F0000400500D69D7900000A8000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000001EE3D-000000067F00004005000060F60100000000__00000054161C34B8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F000040050081DB430000F4E15B-030000000000000000000000000000000002__000000C462B3C2A9-000000C824C09619":{"file_size":73662464,"generation":2,"shard":"0008"},"000000067F00004005000060F30001F28000-000000067F00004005000060F30001F2C000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001F1DA6-030000000000000000000000000000000002__00000081AA3C40F0":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F70001758B92-000000067F00004005000060F70001771169__000001398B56A519-0000013C9C0E3339":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F0000400500E3A2A10000010000-000000067F0000400500E3A2A10000017F02__000000E7C2F1B249-000000EBC9213D59":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A98000-000000067F00004005000060F30002A9C000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000573B206-000000067F00004005000060F300057942F4__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000860B45-030000000000000000000000000000000002__00000023FEF9F321-00000028C365FBE1":{"file_size":252788736,"generation":2,"shard":"0008"},"000000067F00004005000060F7000090B929-000000067F00004005000060F80100000000__0000008DDCD70B68":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F7000014B73D-000000067F00004005000060F80100000000__000000114A805939-00000013FB921C81":{"file_size":146432000,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D3C000-000000067F00004005000060F70000D60000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001514000-000000067F00004005000060F70001528000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001764000-000000067F00004005016EA00C0001788000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30001358000-000000067F00004005000060F3000135C000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001594000-000000067F00004005000060FB00015C8000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300004AC000-000000067F00004005000060F300004B8000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005610000-000000067F00004005000060F30005614000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002794000-000000067F00004005000060F300027C0000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004C60000-000000067F00004005000060F30004C64000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700003A0000-000000067F00004005000060F700003B85C7__0000003579F03331-0000003959DA2DE9":{"file_size":268468224,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000F1034-030000000000000000000000000000000002__000000E4C63CFA21-000000E7C2F1B249":{"file_size":247480320,"generation":2,"shard":"0008"},"000000067F00004005000060F300051B4000-000000067F00004005000060F300051F0000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000003C77D-000000067F00004005000060F60100000000__000000A5A3F27398":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005010660F500000161F7-030000000000000000000000000000000002__0000010FB1BE19B9-00000113456156F1":{"file_size":64757760,"generation":2,"shard":"0008"},"000000067F00004005000060F30003F7B254-000000067F00004005000060F30003F942CF__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30004900000-000000067F00004005000060F30004904000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006F18000-000000067F00004005000060F30006F1C000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A21037-000000067F00004005000060F30003A31FB6__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000DB0000-000000067F00004005000060F30000E40F86__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001A60B43-000000067F00004005000060FB0001A71688__0000007E3A9BFD29-0000008196C976A1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006DC8000-000000067F00004005000060F30006DCC000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700006E38F6-000000067F00004005000060F80100000000__000000698F2C3A38":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000122B1C9-000000067F00004005000060F300012442A9__00000057593D8169-0000005C01565329":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000EA8000-000000067F00004005000060FB0000EAC000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001B5A072-000000067F00004005000060F80100000000__00000159B010F6C0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000144DCA3-000000067F00004005016EA00C000151F7C5__000001AC25760149-000001AFC313C819":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F600000711FF-000000067F00004005000060F60100000000__00000122E1129DA0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300050EC000-000000067F00004005000060F30005138000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005260000-000000067F00004005000060F30005290FC9__0000014EC58A4A79-0000015304A396B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700012DE407-000000067F00004005000060F80100000000__000001180B3FF408":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000F10000-000000067F00004005000060F70000F185D4__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D38000-000000067F00004005000060F70000D3C000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000006671F-000000067F00004005000060F60100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300059F53C6-000000067F00004005000060F30005A16504__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000B08000-000000067F000040050081DB430000B4A075__000000B2B5C4E8F9-000000B768469051":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000152C000-000000067F00004005000060F70001570000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000128000-000000067F00004005000060F3000012C000__0000018624969468":{"file_size":134422528,"generation":7,"shard":"0008"},"000000067F00004005000060F70000E24000-000000067F00004005000060F70000E387D6__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300002791D8-000000067F000040050081DB43000028B253__0000008196C976A1-0000008625CF2891":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F600000500F7-000000067F00004005000060F60100000000__000000D037B2DBD0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000ABD9C4-000000067F00004005000060F80100000000__000000A5A3F27398":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F000040050081DB4300009CC000-000000067F000040050081DB430000A10000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700002005D2-000000067F00004005000060F80100000000__0000001B59EEB909-0000001FFBC01501":{"file_size":261169152,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001AA656E-000000067F000040050081D80C0100000000__00000081AA3C40F0":{"file_size":59138048,"generation":2,"shard":"0008"},"000000067F000040050081DB430000E14000-000000067F000040050081DB430000E48000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003DD734C-000000067F00004005000060F30003E40000__0000011B632CC319-0000011F1A40FA69":{"file_size":261046272,"generation":2,"shard":"0008"},"000000067F0000400500D19D0300FFFFFFFF-030000000000000000000000000000000002__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":5373952,"generation":2,"shard":"0008"},"000000067F00004005000060F30001588000-000000067F00004005000060F3000158C000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000AC000-000000067F0000400500DBCED500000D0000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000013F89B-000000067F0000400500EB4A48000014F7AC__000000F6661C9241-000000F901689359":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300005D704F-000000067F000040050081DB4300006310C9__0000009A1ABDE921-0000009DF02C1241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A14000-000000067F000040050081DB430000A18000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001F574A6-000000067F00004005000060F30001FF8691__0000009A1ABDE921-0000009DF02C1241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FB3D320100000000-000000067F0000400500FDA1F80000014000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001B09104-000000067F00004005000060F30001B4A119__0000008196C976A1-0000008625CF2891":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005011035750100000000-030000000000000000000000000000000002__00000159B010F6C0":{"file_size":78626816,"generation":2,"shard":"0008"},"000000067F00004005000060F1000015F545-000000067F00004005000060F20100000000__000000174479FC18":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000638C06D-000000067F00004005000060F300063A50CD__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000299C28F-000000067F00004005000060F300029A526C__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000364C000-000000067F00004005000060F30003650000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000CE0000-000000067F00004005016EA00C0000CE4000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB430000794000-000000067F000040050081DB4300007A8000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A18000-000000067F000040050081DB430000A1C000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000000C000-000000067F00004005000060F30000018000__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F000040050081DB4300000D40FF-030000000000000000000000000000000002__00000075CC373F31-00000079F2A2F311":{"file_size":78061568,"generation":2,"shard":"0008"},"000000067F00004005000060F60000099FD8-000000067F00004005000060F60100000000__00000159B010F6C0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000330A1C8-000000067F00004005000060F3000332B1B6__000000E4C63CFA21-000000E7C2F1B249":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006FA900D-000000067F00004005000060F30006FDA081__00000184624E5741-000001860C80A151":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000148AC30-000000067F00004005000060FB000149B774__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C01FFFFFFFF-000000067F0000400500F3A25C0300000000__0000011F1A40FA69-00000122A7BB7B29":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30000EF1FC3-000000067F00004005000060F50100000000__00000047F1F2B800":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30006A7C566-000000067F00004005000060F30100000000__00000178B8B10551-0000017C9F5597E1":{"file_size":173072384,"generation":2,"shard":"0008"},"000000067F00004005000060FB000104B856-000000067F00004005000060FB000107C39B__0000004C49155071-0000004F31878919":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000030000-000000067F00004005000060F80100000000__000000021DC73119-000000044854EBD1":{"file_size":261341184,"generation":2,"shard":"0008"},"000000067F00004005000060F30003580FD3-000000067F00004005000060F30100000000__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":228188160,"generation":2,"shard":"0008"},"000000067F00004005000060F70001224000-000000067F00004005000060F70001232ACF__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300022B9050-000000067F00004005000060F3000230A0C7__000000A583FBFB91-000000A9EB8C4489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006654000-000000067F00004005000060F30006670000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700010D0000-000000067F00004005000060F700010D85CF__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000FD8000-030000000000000000000000000000000002__000000C824C09619-000000CC13D2E549":{"file_size":237559808,"generation":2,"shard":"0008"},"000000067F00004005000060FB00015F0000-000000067F00004005000060FB00015F4000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F60100000000-000000067F00004005000060F70000004000__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F00004005000060F70000DA1E38-000000067F00004005000060F80100000000__000000C462B3C2A9-000000C824C09619":{"file_size":209821696,"generation":2,"shard":"0008"},"000000067F00004005000060F30005D76250-000000067F00004005000060F30005D7F2DE__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F10000418000-000000067F00004005000060F10100000000__00000044B4679349-00000047E31D98D1":{"file_size":269148160,"generation":2,"shard":"0008"},"000000067F00004005000060F70001B61000-000000067F00004005000060F80100000000__0000018613F0A050":{"file_size":65150976,"generation":3,"shard":"0008"},"000000067F00004005000060F300008C8000-000000067F00004005000060F300008E0F49__00000028C365FBE1-0000002D2A8E0B81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300002D8000-030000000000000000000000000000000002__0000008625CF2891-00000089F4693119":{"file_size":231907328,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C04000-000000067F00004005000060FB0000C08000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001808000-000000067F00004005000060FB000180C000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A30379-030000000000000000000000000000000002__000000AFE87558B0":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F700010D85CF-000000067F00004005000060F80100000000__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":164970496,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C70000-000000067F00004005000060FB0000C74000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001188000-000000067F00004005016EA00C000118C000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70000CB85B3-000000067F00004005000060F70000CC8B74__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30004A1D870-000000067F00004005000060F30004A2693B__000001398B56A519-0000013C9C0E3339":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00008CF772-000000067F00004005016EA00C00008E760F__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000D34000-000000067F00004005016EA00C0000D5D1E9__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C00014B79E7-000000067F00004005016EA00C00014CF88D__000001A931C135B1-000001AC25760149":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F300040E319D-000000067F00004005000060F300040F41F4__00000122A7BB7B29-0000012694E36301":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002FF427D-000000067F00004005000060F30100000000__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":156073984,"generation":2,"shard":"0008"},"000000067F00004005000060F30005E0A466-000000067F00004005000060F30005E3B48F__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700005F9158-000000067F00004005000060F80100000000__00000057593D8169-0000005C01565329":{"file_size":230768640,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00018E4000-000000067F00004005016EA00C000193189A__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30005F0202C-000000067F00004005000060F30005F3303F__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F10000148000-000000067F00004005000060F1000014C000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300060C0000-000000067F00004005000060F300060C4000__0000016E41E03CA0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C9C000-000000067F00004005000060FB0000CC6E51__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050107B54700000A0EB1-000000067F000040050109CD330100000000__000001180B3FF408":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00004EC000-000000067F00004005016EA00C00005A0000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000A9F465-000000067F00004005016EA00C0000ACF305__00000196C9018F59-0000019A2EAFE7A9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30000208000-000000067F00004005000060F3000020C000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C000011E137-000000067F0000400500F67839000003E09B__000001048B25A8E9-0000010779A7F551":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30000402F4A-000000067F00004005000060F60100000000__000000114A805939-00000013FB921C81":{"file_size":166469632,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00004A8000-000000067F00004005016EA00C00004AC000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70001968000-000000067F00004005000060F7000196C000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006EF8000-000000067F00004005000060F30006EFC000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000BB4000-000000067F00004005016EA00C0000C20000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F700009C0000-000000067F00004005000060F80100000000__0000009A24DF6768":{"file_size":37371904,"generation":2,"shard":"0008"},"000000067F00004005000060F30004C84000-000000067F00004005000060F30004CB8000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002514000-000000067F00004005000060F30002530000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000DE05C8-000000067F00004005000060F80100000000__000000C824C09619-000000CC13D2E549":{"file_size":259473408,"generation":2,"shard":"0008"},"000000067F00004005000060F301FFFFFFFF-000000067F00004005000060F30300000000__00000186146441F1-0000018624969469":{"file_size":57344,"generation":6,"shard":"0008"},"000000067F00004005000060F30001886B2A-000000067F00004005000060F50100000000__00000075E5D2A930":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F700006A8000-000000067F00004005000060F80100000000__000000636DE92159-000000663565F8C9":{"file_size":117022720,"generation":2,"shard":"0008"},"000000067F00004005000060FB000154C000-000000067F00004005000060FB0001558000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300053F40CC-000000067F00004005000060F30100000000__0000014EC58A4A79-0000015304A396B9":{"file_size":223453184,"generation":2,"shard":"0008"},"000000067F00004005000060F30005C95225-000000067F00004005000060F30005C9E3C4__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000558C000-000000067F00004005000060F30005598000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FFA699-000000067F00004005000060F50100000000__00000122E1129DA0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30006F1C000-000000067F00004005000060F50100000000__000001848D082B20":{"file_size":24117248,"generation":2,"shard":"0008"},"000000067F00004005000060F3000486C000-000000067F00004005000060F30004878000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300019C2056-000000067F00004005000060F300019F31AA__00000079F2A2F311-0000007E3A9BFD29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC000004C000-000000067F0000400500EE16BC0000060000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000046EAB9-000000067F00004005000060F80100000000__000000417D21ACF9-00000044B4679349":{"file_size":48717824,"generation":2,"shard":"0008"},"000000067F000040050081DB430000790000-000000067F000040050081DB430000794000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500D69D79000002C000-000000067F0000400500D69D790000078000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F60000026C90-000000067F00004005000060F60100000000__000000698F2C3A38":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30000738000-000000067F00004005000060F3000073C000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10000204000-000000067F00004005000060F10000218000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500C782E40000177E20-000000067F0000400500C782E400001AFD31__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000048C000-000000067F00004005000060F700004B1E77__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300015F8000-000000067F00004005000060F50100000000__000000698F2C3A38":{"file_size":131276800,"generation":2,"shard":"0008"},"000000067F00004005000060F30000428000-000000067F00004005000060F3000042C000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB43000038C000-000000067F000040050081DB430000390000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000102A1CE-000000067F00004005000060FB000103AD12__0000004C49155071-0000004F31878919":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001848000-000000067F00004005000060FB000184C000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00001DC000-000000067F00004005000060FB0000228000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00011D4000-000000067F00004005016EA00C0001228000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C000011775B-030000000000000000000000000000000002__0000018820A34650":{"file_size":139264,"generation":11,"shard":"0008"},"000000067F00004005000060F700011B8000-000000067F00004005000060F80100000000__000001048B25A8E9-0000010779A7F551":{"file_size":263897088,"generation":2,"shard":"0008"},"000000067F00004005000060F3000660D31F-000000067F00004005000060F3000664E3CA__000001715E483C79-000001751A7D7589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC0000064000-000000067F0000400500EE16BC00000F28ED__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000525C065-000000067F00004005000060F50100000000__0000014EDD256548":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30004A7F98F-000000067F00004005000060F30100000000__000001398B56A519-0000013C9C0E3339":{"file_size":47595520,"generation":2,"shard":"0008"},"000000067F000040050100D04D000004369C-000000067F000040050100D04D000004B5AD__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F6000001A6E2-000000067F00004005000060F60100000000__00000047F1F2B800":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F700004405CF-000000067F00004005000060F80100000000__0000003D03FCCDB9-000000417D21ACF9":{"file_size":198836224,"generation":2,"shard":"0008"},"000000067F00004005000060F30002D28000-000000067F00004005000060F30002D2C000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F56D510100000000-000000067F0000400500F67839000003C000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000E387D6-000000067F00004005000060F80100000000__000000D037B2DBD0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000213C000-000000067F00004005000060F30002168000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300060D4415-000000067F00004005000060F3000612D506__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FB3D3100000546CB-000000067F0000400500FB3D320100000000__00000122E1129DA0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F000040050081DB430000D18CA9-030000000000000000000000000000000002__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":210288640,"generation":2,"shard":"0008"},"000000067F00004005000060F60000062E4F-000000067F00004005000060F60100000000__00000104BD37F348":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C000016A065-000000067F0000400500F3A25C000017C0CB__0000010779A7F551-0000010A5E65DF39":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001AD0000-000000067F00004005000060FB0001B28B44__0000008196C976A1-0000008625CF2891":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000254000-000000067F00004005000060F30000298000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000E8C000-000000067F000040050081DB430000EA0000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300040F41F4-000000067F00004005000060F3000412D27C__00000122A7BB7B29-0000012694E36301":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00013B8000-000000067F00004005000060FB00013BC000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700000D8000-000000067F00004005000060F700000DC000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000958000-000000067F00004005000060F700009605D8__000000923719A971-00000096262826C9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB00004A0000-000000067F00004005000060FB00004A4000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700001213F2-000000067F00004005000060F80100000000__0000000D55A212C9-000000114A805939":{"file_size":55320576,"generation":2,"shard":"0008"},"000000067F00004005000060F30004156457-000000067F00004005000060F30100000000__00000122A7BB7B29-0000012694E36301":{"file_size":96927744,"generation":2,"shard":"0008"},"000000067F00004005000060F30003278000-000000067F00004005000060F3000327C000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000158F667-000000067F00004005016EA00C00015B74FF__000001AC25760149-000001AFC313C819":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001D50000-000000067F00004005000060FB0001D88B43__0000008DBE2855F9-000000923719A971":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F60000054AE8-000000067F00004005000060F60100000000__000000DBD29DC248":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300002C4887-000000067F00004005000060F60100000000__0000000D80565628":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70001B34000-000000067F00004005000060F70001B5A072__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F600000416A8-000000067F00004005000060F60100000000__000000AFE87558B0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F10000050000-000000067F00004005000060F10000058000__000000044854EBD1-00000008B6B51879":{"file_size":264011776,"generation":2,"shard":"0008"},"000000067F00004005000060F300043FC000-000000067F00004005000060F300044D3639__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004878000-000000067F00004005000060F3000487C000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000396C000-000000067F00004005000060F30003998000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00019F7907-000000067F00004005016EA00C0001A477A4__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268443648,"generation":11,"shard":"0008"},"000000067F00004005016EA00C00014D7727-000000067F00004005016EA00C00014E75C6__000001A931C135B1-000001AC25760149":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005016EA00C00016570D9-030000000000000000000000000000000002__000001AC25760149-000001AFC313C819":{"file_size":86335488,"generation":11,"shard":"0008"},"000000067F00004005000060F70001270000-000000067F00004005000060F80100000000__0000010FB1BE19B9-00000113456156F1":{"file_size":265363456,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800003BFD31-000000067F0000400500EB4A4800003C7C42__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300014B31F8-000000067F00004005000060F300014CC16D__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000D5D1E9-030000000000000000000000000000000002__0000019E7001E460":{"file_size":139264,"generation":11,"shard":"0008"},"000000067F00004005000060F100003B8214-000000067F00004005000060F100003C0432__0000003D03FCCDB9-000000417D21ACF9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001346854-000000067F00004005016EA00C000135FCAD__000001A931C135B1-000001AC25760149":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F3000160410C-000000067F00004005000060F3000165515A__000000698AF6E809-0000006DDB29D589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000118B12B-030000000000000000000000000000000002__00000054161C34B8":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30006DF0000-000000067F00004005000060F30006DF4000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700003C4000-000000067F00004005000060F700003FE341__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000FF0000-000000067F00004005000060F30100000000__0000004C49155071-0000004F31878919":{"file_size":256286720,"generation":2,"shard":"0008"},"000000067F00004005000060FB00015F4000-000000067F00004005000060FB00015FCD31__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005816253-000000067F00004005000060F30005847319__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002460000-000000067F00004005000060F30002464000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000113A337-000000067F00004005000060F700011528FB__000000FF8B261599-000001048B25A8E9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000037968A-030000000000000000000000000000000002__0000000D55A212C9-000000114A805939":{"file_size":226426880,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000128000-000000067F00004005016EA00C000012FE9A__000001880F984A29-0000018C496B6DB1":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500EB4A48000036FF11-000000067F0000400500EB4A4800003A7E20__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000658113F-000000067F00004005000060F3000659A203__000001715E483C79-000001751A7D7589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001D18000-000000067F00004005016EA00C0001D1C000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F30001A44000-000000067F00004005000060F30001AB1583__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10000138000-000000067F00004005000060F1000013C000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300009BC000-000000067F00004005000060F30000A50000__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000110E30C-000000067F00004005000060F80100000000__000000F91FE84F08":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F50100000000-000000067F00004005000060F60000014000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006F18000-000000067F00004005000060F30006FA900D__00000184624E5741-000001860C80A151":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001D88B43-000000067F00004005000060FB0100000000__0000008DBE2855F9-000000923719A971":{"file_size":249028608,"generation":2,"shard":"0008"},"000000067F00004005000060F3000122A1D5-000000067F00004005000060F30100000000__0000005413AB3641-00000057593D8169":{"file_size":48783360,"generation":2,"shard":"0008"},"000000067F00004005000060F30006277C61-000000067F00004005000060F30006320C60__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000388000-000000067F000040050081DB43000038C000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000E67A6E-000000067F00004005016EA00C0000E77906__0000019E2C5DCEE1-000001A1DD8B4481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F300009B8000-000000067F00004005000060F300009BC000__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400501025D900000068000-000000067F00004005010450640000000570__0000010FB1BE19B9-00000113456156F1":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB00002D4B6A-030000000000000000000000000000000002__0000000D80565628":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30001E50FF3-000000067F00004005000060F30001E720A2__000000923719A971-00000096262826C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00005A4000-000000067F00004005016EA00C0000670000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060FB0000C18000-000000067F00004005000060FB0000C1C000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000BA4F5B-000000067F00004005000060F70000BBD532__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001AC115C-000000067F00004005000060F80100000000__0000015304A396B9-0000015670D6AFD9":{"file_size":237248512,"generation":2,"shard":"0008"},"000000067F00004005000060F30004D24000-000000067F00004005000060F30004DA8000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006CA4000-000000067F00004005000060F30006D10000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC00001433D0-030000000000000000000000000000000002__000000FCCD5238B1-000000FF8B261599":{"file_size":146407424,"generation":2,"shard":"0008"},"000000067F00004005000060F3000165515A-000000067F00004005000060F30100000000__000000698AF6E809-0000006DDB29D589":{"file_size":112680960,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000118C000-000000067F00004005016EA00C00011D0000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB43000094A076-030000000000000000000000000000000002__000000A9EB8C4489-000000ACA44C8E99":{"file_size":176054272,"generation":2,"shard":"0008"},"000000067F00004005000060F70001528000-000000067F00004005000060F7000152C000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000C82B50-000000067F000040050081DB430000CC4BC2__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001EF15A-000000067F000040050081DB4300002791D8__0000008196C976A1-0000008625CF2891":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F10000125BF2-000000067F00004005000060F20100000000__000000114A805939-00000013FB921C81":{"file_size":78782464,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E40F86-000000067F00004005000060F30100000000__000000417D21ACF9-00000044B4679349":{"file_size":111108096,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000FF0000-000000067F00004005016EA00C0000FF4000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30000CB16B6-000000067F00004005000060F50100000000__0000003D2AB09B68":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70001990000-000000067F00004005000060F70001994000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000A54000-000000067F00004005000060F30000A5F9BB__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300061B8705-000000067F00004005000060F300061D9774__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000084C000-000000067F00004005000060F70000858000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000848000-000000067F00004005000060F7000084C000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001D18000-000000067F00004005000060F30001D79136__0000008DBE2855F9-000000923719A971":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001558000-000000067F00004005000060FB000155C000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300024440AE-000000067F00004005000060F3000244D189__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002CFC020-000000067F00004005000060F30100000000__000000C824C09619-000000CC13D2E549":{"file_size":150708224,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A4A074-000000067F000040050081DB430000A640EA__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C98000-000000067F00004005000060FB0000C9C000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001840000-000000067F00004005000060FB0001844000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000802123-000000067F00004005000060F30000853115__00000023FEF9F321-00000028C365FBE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000029ED0-000000067F00004005000060F80100000000__000000027AF9D7D0":{"file_size":24576,"generation":1,"shard":"0008"},"000000067F00004005016EA00C00003E4000-000000067F00004005016EA00C00003E8000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30004CBC000-000000067F00004005000060F30004D20000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000122C000-000000067F00004005016EA00C0001240000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30004DF086C-000000067F00004005000060F50100000000__000001444EB7FC10":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300050B5199-000000067F00004005000060F30100000000__0000014784964B91-0000014B000D1821":{"file_size":126124032,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001A477A4-000000067F00004005016EA00C0001ADF63C__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F70001828000-000000067F00004005000060F7000182C000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100004F0000-000000067F00004005000060F10000518222__0000005413AB3641-00000057593D8169":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30005EFD576-000000067F00004005000060F30100000000__00000164DEE06671-0000016834A3FC91":{"file_size":193077248,"generation":2,"shard":"0008"},"000000067F0000400500F8E3A50100000000-000000067F0000400500FA2AD30000004000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000258E3A9-000000067F00004005000060F3000259F4A3__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000C90000-000000067F00004005000060F70000CB85B3__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB000114C000-000000067F00004005000060FB000118B12B__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003644000-000000067F00004005000060F30003648000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001A50000-000000067F00004005000060FB0001A60B43__0000007E3A9BFD29-0000008196C976A1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003C257AD-000000067F00004005000060F50100000000__000001180B3FF408":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30002DE8000-000000067F00004005000060F30002E4104A__000000D31E48D7C9-000000D74E29AAD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C00000C8000-000000067F0000400500F3A25C00000EA069__000001048B25A8E9-0000010779A7F551":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002174000-000000067F00004005000060F30002210000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300014D5280-000000067F00004005000060F300014E6333__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000332B1B6-000000067F00004005000060F30003344134__000000E4C63CFA21-000000E7C2F1B249":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300065F42B4-000000067F00004005000060F3000660D31F__000001715E483C79-000001751A7D7589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010E264A-000000067F000040050081DB4300010F46BD__000000D31E48D7C9-000000D74E29AAD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300069D13FA-000000067F00004005000060F300069FA3F6__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300061D9774-000000067F00004005000060F30006222843__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100005C821A-000000067F00004005000060F20100000000__000000601F43CF09-000000636DE92159":{"file_size":265183232,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000200000-000000067F0000400500EB4A480000204000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001690000-000000067F00004005000060F70100000000__000001334140FC21-00000137115BE4D9":{"file_size":273965056,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000A575C7-000000067F00004005016EA00C0000A9F465__00000196C9018F59-0000019A2EAFE7A9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001E6C000-000000067F00004005000060FB0001E98000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00014195A7-000000067F00004005000060FB000147A0EC__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000AE168A-030000000000000000000000000000000002__0000003203FB5749-0000003579F03331":{"file_size":223379456,"generation":2,"shard":"0008"},"000000067F00004005000060F30000CA0000-000000067F00004005000060F30000CA4000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300006E4000-000000067F00004005000060F30000738000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300006E0000-000000067F00004005000060F300006E4000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001124000-000000067F00004005000060FB0001148000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500D69D7900000A8000-000000067F0000400500D69D7900000AC000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500C782E40000130000-000000067F0000400500C782E40000137F10__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000020FBCF-000000067F00004005016EA00C0000257A6F__000001880F984A29-0000018C496B6DB1":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001B28B44-000000067F00004005000060FB0100000000__0000008196C976A1-0000008625CF2891":{"file_size":249454592,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001120000-000000067F00004005000060FB0001124000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005474062-000000067F00004005000060F3000549D0A6__0000015304A396B9-0000015670D6AFD9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500C782E4000023FA62-030000000000000000000000000000000002__000000D01F399709-000000D31E48D7C9":{"file_size":245366784,"generation":2,"shard":"0008"},"000000067F000040050081DB430000160484-030000000000000000000000000000000002__00000079F2A2F311-0000007E3A9BFD29":{"file_size":226582528,"generation":2,"shard":"0008"},"000000067F00004005000060F300038A4FB4-000000067F00004005000060F300038B5F5B__0000010779A7F551-0000010A5E65DF39":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300017E8000-000000067F00004005000060F300017EC000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FB3D300100000000-000000067F0000400500FB3D31000000C000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700010105DB-000000067F00004005000060F80100000000__000000E4C63CFA21-000000E7C2F1B249":{"file_size":254935040,"generation":2,"shard":"0008"},"000000067F00004005000060F70000858570-000000067F00004005000060F80100000000__0000008196C976A1-0000008625CF2891":{"file_size":252985344,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001D4000-000000067F000040050081DB4300001E8000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00005E0000-000000067F00004005000060FB0000638B45__0000001B59EEB909-0000001FFBC01501":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050107B547000006C000-000000067F000040050107B54700000A0EB1__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000430000-000000067F00004005000060FB0000434000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300014E6333-000000067F00004005000060F3000151F271__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FB3D300100000000-000000067F0000400500FB3D300300000000__00000117EDA82C11-0000011B632CC319":{"file_size":65536,"generation":2,"shard":"0008"},"000000067F00004005000060F30004BE7584-000000067F00004005000060F30100000000__0000013C9C0E3339-0000013FEFA7D709":{"file_size":58204160,"generation":2,"shard":"0008"},"000000067F00004005000060F70001068000-000000067F00004005000060F80100000000__000000E7C2F1B249-000000EBC9213D59":{"file_size":168730624,"generation":2,"shard":"0008"},"000000067F00004005000060F1000013C000-000000067F00004005000060F10000148000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000659A203-000000067F00004005000060F300065BB235__000001715E483C79-000001751A7D7589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000EC0000-000000067F00004005000060F70000EF85D6__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005010660F500000B4000-000000067F00004005010660F500000F44CB__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300067A4000-000000067F00004005000060F300067F0000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000F0000-000000067F0000400500DBCED500000F4000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000768000-000000067F000040050081DB43000076C000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00018E0000-000000067F00004005016EA00C00018E4000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30000A50000-000000067F00004005000060F30000A54000__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001E68000-000000067F00004005000060FB0001E6C000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001960000-000000067F00004005000060F300019790A2__00000079F2A2F311-0000007E3A9BFD29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000B6A1D0-000000067F00004005000060FB0000BAAD15__0000003579F03331-0000003959DA2DE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002E4A157-000000067F00004005000060F30002E630CF__000000D31E48D7C9-000000D74E29AAD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006E70000-000000067F00004005000060F30006E74000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700004464DD-000000067F00004005000060F7000046EAB9__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000204000-000000067F0000400500EB4A480000218000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300042D51D6-000000067F00004005000060F3000430E1E9__0000012694E36301-0000012A3F140591":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000F30000-000000067F00004005000060FB0100000000__00000047E31D98D1-0000004C49155071":{"file_size":272302080,"generation":2,"shard":"0008"},"000000067F000040050081DB4300006F8000-030000000000000000000000000000000002__0000009DF02C1241-000000A173C00489":{"file_size":235110400,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001EC000-000000067F000040050081DB4300001F1DA6__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300038A3082-000000067F00004005000060F30100000000__000001048B25A8E9-0000010779A7F551":{"file_size":76644352,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000400000-000000067F00004005016EA00C0000404000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30003481DDB-000000067F00004005000060F30100000000__000000E7C2F1B249-000000EBC9213D59":{"file_size":107814912,"generation":2,"shard":"0008"},"000000067F00004005000060F3000489C000-000000067F00004005000060F300048A0000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000CD6C36-000000067F000040050081DB430000D18CA9__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30004888000-000000067F00004005000060F3000488C000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300008E0F49-000000067F00004005000060F30000921E8A__00000028C365FBE1-0000002D2A8E0B81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500C782E40000074000-000000067F0000400500C782E400000A0000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00011F2D11-000000067F00004005000060FB0001203856__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300046330B1-000000067F00004005000060F300046B41AA__0000012E71CF31F9-000001334140FC21":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003548000-000000067F00004005000060F30003580FD3__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001198B44-000000067F00004005000060FB00011C1688__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000049C000-000000067F00004005000060F300004A8000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000B44000-000000067F00004005016EA00C0000BB0000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F700014F0000-000000067F00004005000060F700014F85DF__0000012694E36301-0000012A3F140591":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000C5E15B-000000067F000040050081DB430000C801D1__000000B768469051-000000BAB1E56C91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A10000-000000067F00004005000060F30003A21037__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006EFC000-000000067F00004005000060F30006F18000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001D1F87B-000000067F00004005016EA00C0001D7F71A__000001BA93C39481-000001BCB572A4E1":{"file_size":268451840,"generation":17,"shard":"0008"},"000000067F00004005000060F30002A34000-000000067F00004005000060F30002A40000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000F0AA88-000000067F00004005000060F80100000000__000000DBD29DC248":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30006700000-000000067F00004005000060F30006704000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001CC4000-000000067F00004005000060F30001CD0000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000858000-000000067F00004005000060F80100000000__00000081AA3C40F0":{"file_size":48439296,"generation":2,"shard":"0008"},"000000067F000040050081DB4300000D6407-000000067F000040050081DB430000160484__00000079F2A2F311-0000007E3A9BFD29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300057DD292-000000067F00004005000060F30005816253__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006222843-000000067F00004005000060F3000625B8F0__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000410000-000000067F00004005000060FB0000430B46__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100006A8000-000000067F00004005000060F100006B0000__0000006DDB29D589-000000722F474369":{"file_size":264110080,"generation":2,"shard":"0008"},"000000067F00004005000060F3000460202F-000000067F00004005000060F300046330B1__0000012E71CF31F9-000001334140FC21":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006E74000-000000067F00004005000060F30006EF8000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A3B020-000000067F00004005000060F30003A4C09C__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002535462-000000067F00004005000060F3000258E3A9__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000294000-000000067F0000400500EB4A480000355928__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016E85370000000000-030000000000000000000000000000000002__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":152190976,"generation":2,"shard":"0008"},"000000067F00004005000060F3000158C000-000000067F00004005000060F300015B0000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003386D10-000000067F00004005000060F300033D7D7C__000000E7C2F1B249-000000EBC9213D59":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E7C000-000000067F00004005000060F30000EF1FC3__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FA2AD30000030000-000000067F0000400500FA2AD30000034000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005588000-000000067F00004005000060F3000558C000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300039A0000-000000067F00004005000060F300039A4000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000008A13D-000000067F00004005000060F60100000000__000001444EB7FC10":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB00017120CE-000000067F00004005000060FB000172AC12__0000006DDB29D589-000000722F474369":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003200000-000000067F00004005000060F30003204000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300007C1007-000000067F00004005000060F30000802123__00000023FEF9F321-00000028C365FBE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C000006C000-000000067F0000400500F3A25C00000BB439__00000104BD37F348":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300015B4000-000000067F00004005000060F300015F8000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300060C220F-000000067F00004005000060F300060CB2C8__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F8E3A5000004A25C-000000067F0000400500F8E3A50100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30002C9AFB8-000000067F00004005000060F30002CFC020__000000C824C09619-000000CC13D2E549":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005010F2BD40100000000-000000067F00004005010F44EB000000C000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002AEED02-000000067F00004005000060F50100000000__000000C483D0D6B8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30002EB8000-000000067F00004005000060F30002F5105E__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500E3A2A1000016321A-030000000000000000000000000000000002__000000EFDE07FFD8":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F3000135C000-000000067F00004005000060F30001407F7A__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F67839000006AEF4-000000067F0000400500F7D2DD0100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30005DA03A8-000000067F00004005000060F30005DC93F1__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010E2072-000000067F000040050081DB430100000000__000000D01F399709-000000D31E48D7C9":{"file_size":15392768,"generation":2,"shard":"0008"},"000000067F00004005000060F300004A8000-000000067F00004005000060F300004AC000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00016E0A44-000000067F00004005000060FB0001701588__0000006DDB29D589-000000722F474369":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300024D8000-000000067F00004005000060F300024DC000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003BC8000-000000067F00004005000060F30003BCC000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F00100000000-000000067F00004005000060F10000004000__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F000040050081DB430100000000-000000067F0000400500C782E40000074000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003D14206-000000067F00004005000060F30003D252C8__00000117EDA82C11-0000011B632CC319":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700006479E7-000000067F00004005000060F80100000000__000000603CA8F2F0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B9C988-000000067F00004005000060F70000BA4F5B__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D69D790000078000-000000067F0000400500D69D79000007C000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000CC8B74-000000067F00004005000060F80100000000__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":95657984,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000708000-000000067F00004005000060FB000070C000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000EA0000-000000067F000040050081DB430000EEA075__000000C462B3C2A9-000000C824C09619":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000001FD3E-000000067F00004005016EA00C0000097BDA__0000018624969469-000001880F984A29":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F3000689E295-000000067F00004005000060F3000690F2FD__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000CE0000-000000067F00004005000060F30000D31030__0000003D03FCCDB9-000000417D21ACF9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000EA0000-030000000000000000000000000000000002__000000C483D0D6B8":{"file_size":20307968,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000807A34-000000067F00004005016EA00C00008578D4__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F000040050081DB430001060000-000000067F000040050081DB430001064000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000480F32C-000000067F00004005000060F3000486837F__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700009385D4-000000067F00004005000060F80100000000__0000008DBE2855F9-000000923719A971":{"file_size":252207104,"generation":2,"shard":"0008"},"000000067F00004005000060F30000090000-000000067F00004005000060F300000C1095__000000021DC73119-000000044854EBD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000480620C-000000067F00004005000060F3000480F32C__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005FA40AD-000000067F00004005000060F30005FC519A__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00014A42B8-030000000000000000000000000000000002__000000601F43CF09-000000636DE92159":{"file_size":137322496,"generation":2,"shard":"0008"},"000000067F00004005000060F30001CD0000-000000067F00004005000060F30001CD4000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000404000-000000067F00004005016EA00C0000428000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30002079FDE-000000067F00004005000060F300020830BE__0000009DF02C1241-000000A173C00489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000487C000-000000067F00004005000060F30004880000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005010A188401FFFFFFFF-000000067F00004005010A18840300000000__00000137115BE4D9-000001398B56A519":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000218000-000000067F00004005000060F7000021C000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005EF454F-000000067F00004005000060F30005EFD576__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005DC93F1-000000067F00004005000060F30005E0A466__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"}},"disk_consistent_lsn":"1BC/B5734CD8","metadata_bytes":{"disk_consistent_lsn":"1BC/B5734CD8","prev_record_lsn":"1BC/B5734CB0","ancestor_timeline":null,"ancestor_lsn":"0/0","latest_gc_cutoff_lsn":"1BC/B5732690","initdb_lsn":"0/14EE150","pg_version":16},"lineage":{}} diff --git a/poetry.lock b/poetry.lock index 5192a574cc..9026824558 100644 --- a/poetry.lock +++ b/poetry.lock @@ -870,6 +870,96 @@ files = [ [package.dependencies] colorama = {version = "*", markers = "platform_system == \"Windows\""} +[[package]] +name = "clickhouse-connect" +version = "0.7.17" +description = "ClickHouse Database Core Driver for Python, Pandas, and Superset" +optional = false +python-versions = "~=3.8" +files = [ + {file = "clickhouse-connect-0.7.17.tar.gz", hash = "sha256:854f1f9f3e024e7f89ae5d57cd3289d7a4c3dc91a9f24c4d233014f0ea19cb2d"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:aca36f5f28be1ada2981fce87724bbf451f267c918015baec59e527de3c9c882"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:66209e4634f457604c263bea176336079d26c284e251e68a8435b0b80c1a25ff"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e4d86c5a561a2a99321c8b4af22257461b8e67142f34cfea6e70f39b45b1f406"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d200c9afa2725a96f9f3718221f641276b80c11bf504d8a2fbaafb5a05b2f0d3"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:004d867b1005445a46e6742db1054bf2a717a451372663b46e09b5e9e90a31e3"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4ef94a4a8e008882259151833c3c47cfbb9c8f08de0f100aaf3b95c366dcfb24"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ee732c3df50c8b07d16b5836ff85e6b84569922455c03837c3add5cf1388fe1f"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d9dbe1235465bb946e24b90b0ca5b8800b5d645acb2d7d6ee819448c3e2fd959"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-win32.whl", hash = "sha256:e5db0d68dfb63db0297d44dc91406bcfd7d333708d7cd55086c8550fbf870b78"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-win_amd64.whl", hash = "sha256:800750f568c097ea312887785025006d6098bffd8ed2dd6a57048fb3ced6d778"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4eb390623b3d15dc9cda78f5c68f83ef9ad11743797e70af8fabc384b015a73c"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:35f172ca950f218f63072024c81d5b4ff6e5399620c255506c321ccc7b17c9a5"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ae7918f060f7576fc931c692e0122b1b07576fabd81444af22e1f8582300d200"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff2881b93c7a1afb9c99fb59ad5fd666850421325d0931e2b77f3f4ba872303d"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a4d9b4f97271addf66aadbaf7f154f19a0ad6c22026d575a995c55ebd8576db"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e431469b1ff2d5c3e4c406d55c6afdf7102f5d2524c2ceb5481b94ac24412aa3"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:2b6f80115176559f181a6b3ecad11aa3d70ef6014c3d2905b90fcef3f27d25c2"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d8ac694f40dfafc8a3cc877116b4bc73e8877ebf66d4d96ee092484ee4c0b481"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-win32.whl", hash = "sha256:78b7a3f6b0fad4eaf8afb5f9a2e855bde53e82ea5804960e9cf779538f4606a1"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-win_amd64.whl", hash = "sha256:efd390cc045334ecc3f2a9c18cc07c041d0288b145967805fdcab65abeefa75f"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9228334a17dc0a7842222f54ba5b89fc563532424aad4f66be799df70ab37e9f"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e432a42bb788bda77e88eda2774392a60fbbb5ee2a79cb2881d182d26c45fe49"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c85152ed2879965ee1fa2bd5e31fb27d281fd5f50d6e86a401efd95cd85b29ef"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:29a126104aa5e11df570cbd89fca4988784084602ba77d17b2396b334c54fd75"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:882d8f9570549258e6eb6a97915fbf64ed29fe395d5e360866ea8d42c8283a35"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:06ebf99111171442f462fb8b357364c3e276da3e8f8557b2e8fee9eb55ab37d1"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e0cf6f99b2777b0d164bf8b65ec39104cdc0789a56bcb52d98289bbd6f5cc70e"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ee46c508fddfff3b7ac52326788e0c6dd8dfb416b6d7e02e5d30e8110749dac2"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-win32.whl", hash = "sha256:eb708b590a37d56b069a6088254ffa55d73b8cb65527339df81ef03fe67ffdec"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-win_amd64.whl", hash = "sha256:17f00dccddaeaf43733faa1fa21f7d24641454a73669fda862545ba7c88627f5"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ab5d4b37a6dcc39e94c63beac0f22d9dda914f5eb865d166c64cf04dfadb7d16"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:32aa90387f45f34cbc5a984789ed4c12760a3c0056c190ab0123ceafc36b1002"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21277b6bdd6c8ff14170bfcd52125c5c39f442ec4bafbb643ad7d0ca915f0029"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca68d8b7dee3fb4e7229e06152f5b0faaccafb4c87d9c2d48fa5bd117a3cc1c0"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:841c56282102b2fba1e0b332bb1c7a0c50992fbc321746af8d3e0e6ca2450e8b"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:8d7ffde5a4b95d8fe9ed38e08e504e497310e3d7a17691bd40bf65734648fdfc"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:055960086b6b92b6e44f5ba04c81c40c10b038588e4b3908b033c99f66125332"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:36491fec63ceb8503b6344c23477647030139f346b749dc5ee672c505939dbbe"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-win32.whl", hash = "sha256:8779a907e026db32e6bc0bc0c8d5de0e2e3afd166afc2d4adcc0603399af5539"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-win_amd64.whl", hash = "sha256:309854fa197885c6278438ddd032ab52e6fec56f162074e343c3635ca7266078"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e8009f94550178dc971aeb4f8787ba7a5b473c22647490428b7229f540a51d2b"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:70f8422f407b13a404b3670fd097855abd5adaf890c710d6678d2b46ab61ac48"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:082783eb1e8baf7b3465dd045132dc5cb5a91432c899dc4e19891c5f782d8d23"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1c30aad2a9c7584c4ee19e646a087b3bbd2d4daab3d88a2afeeae1a7f6febf9"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fc8e245a9f4f0dce39f155e626405f60f1d3cf4d1e52dd2c793ea6b603ca111b"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:802372cb8a69c9ffdf4260e9f01616c8601ba531825ed6f08834827e0b880cd1"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:193a60271a3b105cdbde96fb20b40eab8a50fca3bb1f397546f7a18b53d9aa9c"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:59d58932916792fdbd09cb961a245a0c2d87b07b8296f9138915b998f4522941"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-win32.whl", hash = "sha256:3cfd0edabb589f640636a97ffc38d1b3d760faef208d44e50829cc1ad3f0d3e5"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-win_amd64.whl", hash = "sha256:5661b4629aac228481219abf2e149119af1a71d897f191665e182d9d192d7033"}, + {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7429d309109e7e4a70fd867d69fcfea9ddcb1a1e910caa6b0e2c3776b71f4613"}, + {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5ae619151006da84a0b1585a9bcc81be32459d8061aeb2e116bad5bbaa7d108"}, + {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec0c84a0880621cb2389656a89886ef3133f0b3f8dc016eee6f25bbb49ff6f70"}, + {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:705464c23f821666b76f8f619cf2870225156276562756b3933aaa24708e0ff8"}, + {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:1822016f4b769e89264fe26cefe0bc5e50e4c3ca0747d89bb52d57dc4f1e5ffb"}, + {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:6c92b0c342c1fbfa666010e8175e05026dc570a7ef91d8fa81ce503180f318aa"}, + {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d2e106536540e906c3c866f8615fcf870a9a77c1bfab9ef4b042febfd2fdb953"}, + {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bac9a32e62384b4341ba51a451084eb3b00c6e59aaac1499145dd8b897cb585c"}, + {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0feed93b9912b7862a8c41be1febcd44b68a824a5c1059b19d5c567afdaa6273"}, + {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:2e2dd6db52e799f065fd565143fde5a872cfe903de1bee7775bc3a349856a790"}, + {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ed13add5d579a5960155f3000420544368501c9703d2fb94f103b4a6126081f6"}, + {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c257a23ed3bf1858593fb03927d9d073fbbdfa24dc2afee537c3314bd66b4e24"}, + {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d47866f64cbdc2d5cc4f8a7a8c49e3ee90c9e487091b9eda7c3a3576418e1cbe"}, + {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9b850e2f17e0a0b5a37d996d3fb728050227489d64d271d678d166abea94f26e"}, + {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:349682288987dc84ac7695f7cd6b510be8d0ec0eee7c1b72dbf2146b4e9efdb8"}, +] + +[package.dependencies] +certifi = "*" +lz4 = "*" +pytz = "*" +urllib3 = ">=1.26" +zstandard = "*" + +[package.extras] +arrow = ["pyarrow"] +numpy = ["numpy"] +orjson = ["orjson"] +pandas = ["pandas"] +sqlalchemy = ["sqlalchemy (>1.3.21,<2.0)"] +tzlocal = ["tzlocal (>=4.0)"] + [[package]] name = "colorama" version = "0.4.5" @@ -1424,6 +1514,20 @@ files = [ [package.dependencies] six = "*" +[[package]] +name = "kafka-python" +version = "2.0.2" +description = "Pure Python client for Apache Kafka" +optional = false +python-versions = "*" +files = [ + {file = "kafka-python-2.0.2.tar.gz", hash = "sha256:04dfe7fea2b63726cd6f3e79a2d86e709d608d74406638c5da33a01d45a9d7e3"}, + {file = "kafka_python-2.0.2-py2.py3-none-any.whl", hash = "sha256:2d92418c7cb1c298fa6c7f0fb3519b520d0d7526ac6cb7ae2a4fc65a51a94b6e"}, +] + +[package.extras] +crc32c = ["crc32c"] + [[package]] name = "lazy-object-proxy" version = "1.10.0" @@ -1470,6 +1574,56 @@ files = [ {file = "lazy_object_proxy-1.10.0-pp310.pp311.pp312.pp38.pp39-none-any.whl", hash = "sha256:80fa48bd89c8f2f456fc0765c11c23bf5af827febacd2f523ca5bc1893fcc09d"}, ] +[[package]] +name = "lz4" +version = "4.3.3" +description = "LZ4 Bindings for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "lz4-4.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b891880c187e96339474af2a3b2bfb11a8e4732ff5034be919aa9029484cd201"}, + {file = "lz4-4.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:222a7e35137d7539c9c33bb53fcbb26510c5748779364014235afc62b0ec797f"}, + {file = "lz4-4.3.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f76176492ff082657ada0d0f10c794b6da5800249ef1692b35cf49b1e93e8ef7"}, + {file = "lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1d18718f9d78182c6b60f568c9a9cec8a7204d7cb6fad4e511a2ef279e4cb05"}, + {file = "lz4-4.3.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6cdc60e21ec70266947a48839b437d46025076eb4b12c76bd47f8e5eb8a75dcc"}, + {file = "lz4-4.3.3-cp310-cp310-win32.whl", hash = "sha256:c81703b12475da73a5d66618856d04b1307e43428a7e59d98cfe5a5d608a74c6"}, + {file = "lz4-4.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:43cf03059c0f941b772c8aeb42a0813d68d7081c009542301637e5782f8a33e2"}, + {file = "lz4-4.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:30e8c20b8857adef7be045c65f47ab1e2c4fabba86a9fa9a997d7674a31ea6b6"}, + {file = "lz4-4.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2f7b1839f795315e480fb87d9bc60b186a98e3e5d17203c6e757611ef7dcef61"}, + {file = "lz4-4.3.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edfd858985c23523f4e5a7526ca6ee65ff930207a7ec8a8f57a01eae506aaee7"}, + {file = "lz4-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e9c410b11a31dbdc94c05ac3c480cb4b222460faf9231f12538d0074e56c563"}, + {file = "lz4-4.3.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d2507ee9c99dbddd191c86f0e0c8b724c76d26b0602db9ea23232304382e1f21"}, + {file = "lz4-4.3.3-cp311-cp311-win32.whl", hash = "sha256:f180904f33bdd1e92967923a43c22899e303906d19b2cf8bb547db6653ea6e7d"}, + {file = "lz4-4.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:b14d948e6dce389f9a7afc666d60dd1e35fa2138a8ec5306d30cd2e30d36b40c"}, + {file = "lz4-4.3.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:e36cd7b9d4d920d3bfc2369840da506fa68258f7bb176b8743189793c055e43d"}, + {file = "lz4-4.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:31ea4be9d0059c00b2572d700bf2c1bc82f241f2c3282034a759c9a4d6ca4dc2"}, + {file = "lz4-4.3.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33c9a6fd20767ccaf70649982f8f3eeb0884035c150c0b818ea660152cf3c809"}, + {file = "lz4-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bca8fccc15e3add173da91be8f34121578dc777711ffd98d399be35487c934bf"}, + {file = "lz4-4.3.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e7d84b479ddf39fe3ea05387f10b779155fc0990125f4fb35d636114e1c63a2e"}, + {file = "lz4-4.3.3-cp312-cp312-win32.whl", hash = "sha256:337cb94488a1b060ef1685187d6ad4ba8bc61d26d631d7ba909ee984ea736be1"}, + {file = "lz4-4.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:5d35533bf2cee56f38ced91f766cd0038b6abf46f438a80d50c52750088be93f"}, + {file = "lz4-4.3.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:363ab65bf31338eb364062a15f302fc0fab0a49426051429866d71c793c23394"}, + {file = "lz4-4.3.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0a136e44a16fc98b1abc404fbabf7f1fada2bdab6a7e970974fb81cf55b636d0"}, + {file = "lz4-4.3.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:abc197e4aca8b63f5ae200af03eb95fb4b5055a8f990079b5bdf042f568469dd"}, + {file = "lz4-4.3.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56f4fe9c6327adb97406f27a66420b22ce02d71a5c365c48d6b656b4aaeb7775"}, + {file = "lz4-4.3.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f0e822cd7644995d9ba248cb4b67859701748a93e2ab7fc9bc18c599a52e4604"}, + {file = "lz4-4.3.3-cp38-cp38-win32.whl", hash = "sha256:24b3206de56b7a537eda3a8123c644a2b7bf111f0af53bc14bed90ce5562d1aa"}, + {file = "lz4-4.3.3-cp38-cp38-win_amd64.whl", hash = "sha256:b47839b53956e2737229d70714f1d75f33e8ac26e52c267f0197b3189ca6de24"}, + {file = "lz4-4.3.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6756212507405f270b66b3ff7f564618de0606395c0fe10a7ae2ffcbbe0b1fba"}, + {file = "lz4-4.3.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ee9ff50557a942d187ec85462bb0960207e7ec5b19b3b48949263993771c6205"}, + {file = "lz4-4.3.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b901c7784caac9a1ded4555258207d9e9697e746cc8532129f150ffe1f6ba0d"}, + {file = "lz4-4.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b6d9ec061b9eca86e4dcc003d93334b95d53909afd5a32c6e4f222157b50c071"}, + {file = "lz4-4.3.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f4c7bf687303ca47d69f9f0133274958fd672efaa33fb5bcde467862d6c621f0"}, + {file = "lz4-4.3.3-cp39-cp39-win32.whl", hash = "sha256:054b4631a355606e99a42396f5db4d22046a3397ffc3269a348ec41eaebd69d2"}, + {file = "lz4-4.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:eac9af361e0d98335a02ff12fb56caeb7ea1196cf1a49dbf6f17828a131da807"}, + {file = "lz4-4.3.3.tar.gz", hash = "sha256:01fe674ef2889dbb9899d8a67361e0c4a2c833af5aeb37dd505727cf5d2a131e"}, +] + +[package.extras] +docs = ["sphinx (>=1.6.0)", "sphinx-bootstrap-theme"] +flake8 = ["flake8"] +tests = ["psutil", "pytest (!=3.3.0)", "pytest-cov"] + [[package]] name = "markupsafe" version = "2.1.1" @@ -2361,6 +2515,17 @@ files = [ [package.dependencies] six = ">=1.5" +[[package]] +name = "pytz" +version = "2024.1" +description = "World timezone definitions, modern and historical" +optional = false +python-versions = "*" +files = [ + {file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"}, + {file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"}, +] + [[package]] name = "pywin32" version = "301" @@ -3206,4 +3371,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "16ebd6a46768be7f67dbdb4ee5903b167d94edc9965f29252f038c67e9e907b0" +content-hash = "d569a3593b98baceb0a88e176bdad63cae99d6bfc2a81bf6741663a4abcafd72" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 2f18b5fbc6..b316c53034 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -92,6 +92,7 @@ tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true tracing.workspace = true +try-lock.workspace = true typed-json.workspace = true url.workspace = true urlencoding.workspace = true diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 67c4dd019e..90dea01bf3 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -218,7 +218,7 @@ impl RateBucketInfo { impl AuthenticationConfig { pub fn check_rate_limit( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, config: &AuthenticationConfig, secret: AuthSecret, endpoint: &EndpointId, @@ -243,7 +243,7 @@ impl AuthenticationConfig { let limit_not_exceeded = self.rate_limiter.check( ( endpoint_int, - MaskedIp::new(ctx.peer_addr, config.rate_limit_ip_subnet), + MaskedIp::new(ctx.peer_addr(), config.rate_limit_ip_subnet), ), password_weight, ); @@ -274,7 +274,7 @@ impl AuthenticationConfig { /// /// All authentication flows will emit an AuthenticationOk message if successful. async fn auth_quirks( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, api: &impl console::Api, user_info: ComputeUserInfoMaybeEndpoint, client: &mut stream::PqStream>, @@ -303,8 +303,8 @@ async fn auth_quirks( let (allowed_ips, maybe_secret) = api.get_allowed_ips_and_secret(ctx, &info).await?; // check allowed list - if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) { - return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr)); + if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) { + return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr())); } if !endpoint_rate_limiter.check(info.endpoint.clone().into(), 1) { @@ -356,7 +356,7 @@ async fn auth_quirks( } async fn authenticate_with_secret( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, secret: AuthSecret, info: ComputeUserInfo, client: &mut stream::PqStream>, @@ -421,7 +421,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)] pub async fn authenticate( self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, @@ -467,7 +467,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { impl BackendType<'_, ComputeUserInfo, &()> { pub async fn get_role_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, ) -> Result { use BackendType::*; match self { @@ -478,7 +478,7 @@ impl BackendType<'_, ComputeUserInfo, &()> { pub async fn get_allowed_ips_and_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { use BackendType::*; match self { @@ -492,7 +492,7 @@ impl BackendType<'_, ComputeUserInfo, &()> { impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> { async fn wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, ) -> Result { use BackendType::*; @@ -514,7 +514,7 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> { impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> { async fn wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, ) -> Result { use BackendType::*; @@ -571,7 +571,7 @@ mod tests { impl console::Api for Auth { async fn get_role_secret( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, _user_info: &super::ComputeUserInfo, ) -> Result { Ok(CachedRoleSecret::new_uncached(Some(self.secret.clone()))) @@ -579,7 +579,7 @@ mod tests { async fn get_allowed_ips_and_secret( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, _user_info: &super::ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), console::errors::GetAuthInfoError> { @@ -591,7 +591,7 @@ mod tests { async fn wake_compute( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, _user_info: &super::ComputeUserInfo, ) -> Result { unimplemented!() @@ -665,7 +665,7 @@ mod tests { let (mut client, server) = tokio::io::duplex(1024); let mut stream = PqStream::new(Stream::from_raw(server)); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let api = Auth { ips: vec![], secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), @@ -723,7 +723,7 @@ mod tests { )); let _creds = auth_quirks( - &mut ctx, + &ctx, &api, user_info, &mut stream, @@ -742,7 +742,7 @@ mod tests { let (mut client, server) = tokio::io::duplex(1024); let mut stream = PqStream::new(Stream::from_raw(server)); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let api = Auth { ips: vec![], secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), @@ -775,7 +775,7 @@ mod tests { )); let _creds = auth_quirks( - &mut ctx, + &ctx, &api, user_info, &mut stream, @@ -794,7 +794,7 @@ mod tests { let (mut client, server) = tokio::io::duplex(1024); let mut stream = PqStream::new(Stream::from_raw(server)); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let api = Auth { ips: vec![], secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), @@ -828,7 +828,7 @@ mod tests { )); let creds = auth_quirks( - &mut ctx, + &ctx, &api, user_info, &mut stream, diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index b98fa63120..285fa29428 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -12,7 +12,7 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; pub(super) async fn authenticate( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, creds: ComputeUserInfo, client: &mut PqStream>, config: &'static AuthenticationConfig, @@ -27,7 +27,7 @@ pub(super) async fn authenticate( } AuthSecret::Scram(secret) => { info!("auth endpoint chooses SCRAM"); - let scram = auth::Scram(&secret, &mut *ctx); + let scram = auth::Scram(&secret, ctx); let auth_outcome = tokio::time::timeout( config.scram_protocol_timeout, diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index 6b0f5e1726..56921dd949 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -18,7 +18,7 @@ use tracing::{info, warn}; /// These properties are benefical for serverless JS workers, so we /// use this mechanism for websocket connections. pub async fn authenticate_cleartext( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, info: ComputeUserInfo, client: &mut stream::PqStream>, secret: AuthSecret, @@ -28,7 +28,7 @@ pub async fn authenticate_cleartext( ctx.set_auth_method(crate::context::AuthMethod::Cleartext); // pause the timer while we communicate with the client - let paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client); + let paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client); let ep = EndpointIdInt::from(&info.endpoint); @@ -60,7 +60,7 @@ pub async fn authenticate_cleartext( /// Similar to [`authenticate_cleartext`], but there's a specific password format, /// and passwords are not yet validated (we don't know how to validate them!) pub async fn password_hack_no_authentication( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, info: ComputeUserInfoNoEndpoint, client: &mut stream::PqStream>, ) -> auth::Result { @@ -68,7 +68,7 @@ pub async fn password_hack_no_authentication( ctx.set_auth_method(crate::context::AuthMethod::Cleartext); // pause the timer while we communicate with the client - let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client); + let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client); let payload = AuthFlow::new(client) .begin(auth::PasswordHack) diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index 5932e1337c..95f4614736 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -57,7 +57,7 @@ pub fn new_psql_session_id() -> String { } pub(super) async fn authenticate( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, link_uri: &reqwest::Url, client: &mut PqStream, ) -> auth::Result { diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index d06f5614f1..8f4a392131 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -84,7 +84,7 @@ pub fn endpoint_sni( impl ComputeUserInfoMaybeEndpoint { pub fn parse( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, params: &StartupMessageParams, sni: Option<&str>, common_names: Option<&HashSet>, @@ -249,8 +249,8 @@ mod tests { fn parse_bare_minimum() -> anyhow::Result<()> { // According to postgresql, only `user` should be required. let options = StartupMessageParams::new([("user", "john_doe")]); - let mut ctx = RequestMonitoring::test(); - let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + let ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id, None); @@ -264,8 +264,8 @@ mod tests { ("database", "world"), // should be ignored ("foo", "bar"), // should be ignored ]); - let mut ctx = RequestMonitoring::test(); - let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + let ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id, None); @@ -279,9 +279,9 @@ mod tests { let sni = Some("foo.localhost"); let common_names = Some(["localhost".into()].into()); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let user_info = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id.as_deref(), Some("foo")); assert_eq!(user_info.options.get_cache_key("foo"), "foo"); @@ -296,8 +296,8 @@ mod tests { ("options", "-ckey=1 project=bar -c geqo=off"), ]); - let mut ctx = RequestMonitoring::test(); - let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + let ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id.as_deref(), Some("bar")); @@ -311,8 +311,8 @@ mod tests { ("options", "-ckey=1 endpoint=bar -c geqo=off"), ]); - let mut ctx = RequestMonitoring::test(); - let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + let ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id.as_deref(), Some("bar")); @@ -329,8 +329,8 @@ mod tests { ), ]); - let mut ctx = RequestMonitoring::test(); - let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + let ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert!(user_info.endpoint_id.is_none()); @@ -344,8 +344,8 @@ mod tests { ("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"), ]); - let mut ctx = RequestMonitoring::test(); - let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + let ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert!(user_info.endpoint_id.is_none()); @@ -359,9 +359,9 @@ mod tests { let sni = Some("baz.localhost"); let common_names = Some(["localhost".into()].into()); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let user_info = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id.as_deref(), Some("baz")); @@ -374,16 +374,16 @@ mod tests { let common_names = Some(["a.com".into(), "b.com".into()].into()); let sni = Some("p1.a.com"); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let user_info = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.endpoint_id.as_deref(), Some("p1")); let common_names = Some(["a.com".into(), "b.com".into()].into()); let sni = Some("p1.b.com"); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let user_info = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.endpoint_id.as_deref(), Some("p1")); Ok(()) @@ -397,10 +397,9 @@ mod tests { let sni = Some("second.localhost"); let common_names = Some(["localhost".into()].into()); - let mut ctx = RequestMonitoring::test(); - let err = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref()) - .expect_err("should fail"); + let ctx = RequestMonitoring::test(); + let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref()) + .expect_err("should fail"); match err { InconsistentProjectNames { domain, option } => { assert_eq!(option, "first"); @@ -417,10 +416,9 @@ mod tests { let sni = Some("project.localhost"); let common_names = Some(["example.com".into()].into()); - let mut ctx = RequestMonitoring::test(); - let err = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref()) - .expect_err("should fail"); + let ctx = RequestMonitoring::test(); + let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref()) + .expect_err("should fail"); match err { UnknownCommonName { cn } => { assert_eq!(cn, "localhost"); @@ -438,9 +436,9 @@ mod tests { let sni = Some("project.localhost"); let common_names = Some(["localhost".into()].into()); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let user_info = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.endpoint_id.as_deref(), Some("project")); assert_eq!( user_info.options.get_cache_key("project"), diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 59d1ac17f4..acf7b4f6b6 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -27,7 +27,7 @@ pub trait AuthMethod { pub struct Begin; /// Use [SCRAM](crate::scram)-based auth in [`AuthFlow`]. -pub struct Scram<'a>(pub &'a scram::ServerSecret, pub &'a mut RequestMonitoring); +pub struct Scram<'a>(pub &'a scram::ServerSecret, pub &'a RequestMonitoring); impl AuthMethod for Scram<'_> { #[inline(always)] @@ -155,7 +155,7 @@ impl AuthFlow<'_, S, Scram<'_>> { let Scram(secret, ctx) = self.state; // pause the timer while we communicate with the client - let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client); + let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client); // Initial client message contains the chosen auth method's name. let msg = self.stream.read_password_message().await?; @@ -168,10 +168,8 @@ impl AuthFlow<'_, S, Scram<'_>> { } match sasl.method { - SCRAM_SHA_256 => ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256), - SCRAM_SHA_256_PLUS => { - ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256Plus) - } + SCRAM_SHA_256 => ctx.set_auth_method(crate::context::AuthMethod::ScramSha256), + SCRAM_SHA_256_PLUS => ctx.set_auth_method(crate::context::AuthMethod::ScramSha256Plus), _ => {} } info!("client chooses {}", sasl.method); diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index d7a3eb9a4d..1038fa5116 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -205,7 +205,7 @@ async fn task_main( const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; async fn ssl_handshake( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, raw_stream: S, tls_config: Arc, tls_server_end_point: TlsServerEndPoint, @@ -256,13 +256,13 @@ async fn ssl_handshake( } async fn handle_client( - mut ctx: RequestMonitoring, + ctx: RequestMonitoring, dest_suffix: Arc, tls_config: Arc, tls_server_end_point: TlsServerEndPoint, stream: impl AsyncRead + AsyncWrite + Unpin, ) -> anyhow::Result<()> { - let mut tls_stream = ssl_handshake(&mut ctx, stream, tls_config, tls_server_end_point).await?; + let mut tls_stream = ssl_handshake(&ctx, stream, tls_config, tls_server_end_point).await?; // Cut off first part of the SNI domain // We receive required destination details in the format of diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 696940728a..b44e0ddd2f 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -5,6 +5,7 @@ use aws_config::meta::region::RegionProviderChain; use aws_config::profile::ProfileFileCredentialsProvider; use aws_config::provider_config::ProviderConfig; use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; +use aws_config::Region; use futures::future::Either; use proxy::auth; use proxy::auth::backend::AuthRateLimiter; @@ -290,9 +291,10 @@ async fn main() -> anyhow::Result<()> { let config = build_config(&args)?; info!("Authentication backend: {}", config.auth_backend); - info!("Using region: {}", config.aws_region); + info!("Using region: {}", args.aws_region); - let region_provider = RegionProviderChain::default_provider().or_else(&*config.aws_region); // Replace with your Redis region if needed + let region_provider = + RegionProviderChain::default_provider().or_else(Region::new(args.aws_region.clone())); let provider_conf = ProviderConfig::without_region().with_region(region_provider.region().await); let aws_credentials_provider = { @@ -318,7 +320,7 @@ async fn main() -> anyhow::Result<()> { }; let elasticache_credentials_provider = Arc::new(elasticache::CredentialsProvider::new( elasticache::AWSIRSAConfig::new( - config.aws_region.clone(), + args.aws_region.clone(), args.redis_cluster_name, args.redis_user_id, ), @@ -376,11 +378,14 @@ async fn main() -> anyhow::Result<()> { let cancel_map = CancelMap::default(); - let redis_publisher = match &redis_notifications_client { + let redis_rps_limit = Vec::leak(args.redis_rps_limit.clone()); + RateBucketInfo::validate(redis_rps_limit)?; + + let redis_publisher = match ®ional_redis_client { Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new( redis_publisher.clone(), args.region.clone(), - &config.redis_rps_limit, + redis_rps_limit, )?))), None => None, }; @@ -656,7 +661,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { )?; let http_config = HttpConfig { - request_timeout: args.sql_over_http.sql_over_http_timeout, pool_options: GlobalConnPoolOptions { max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint, gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch, @@ -676,9 +680,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet, }; - let mut redis_rps_limit = args.redis_rps_limit.clone(); - RateBucketInfo::validate(&mut redis_rps_limit)?; - let config = Box::leak(Box::new(ProxyConfig { tls_config, auth_backend, @@ -687,11 +688,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { http_config, authentication_config, require_client_ip: args.require_client_ip, - disable_ip_check_for_http: args.disable_ip_check_for_http, - redis_rps_limit, handshake_timeout: args.handshake_timeout, region: args.region.clone(), - aws_region: args.aws_region.clone(), wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?, connect_compute_locks, connect_to_compute_retry_config: config::RetryConfig::parse( diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 4bc10a6020..8c851790c2 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -68,7 +68,7 @@ impl EndpointsCache { ready: AtomicBool::new(false), } } - pub async fn is_valid(&self, ctx: &mut RequestMonitoring, endpoint: &EndpointId) -> bool { + pub async fn is_valid(&self, ctx: &RequestMonitoring, endpoint: &EndpointId) -> bool { if !self.ready.load(Ordering::Acquire) { return true; } diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index de04733eb9..18c82fe379 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -276,12 +276,12 @@ impl ConnCfg { /// Connect to a corresponding compute node. pub async fn connect( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, allow_self_signed_compute: bool, aux: MetricsAuxInfo, timeout: Duration, ) -> Result { - let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let (socket_addr, stream, host) = self.connect_raw(timeout).await?; drop(pause); @@ -304,14 +304,14 @@ impl ConnCfg { )?; // connect_raw() will not use TLS if sslmode is "disable" - let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let (client, connection) = self.0.connect_raw(stream, tls).await?; drop(pause); tracing::Span::current().record("pid", tracing::field::display(client.get_process_id())); let stream = connection.stream.into_inner(); info!( - cold_start_info = ctx.cold_start_info.as_str(), + cold_start_info = ctx.cold_start_info().as_str(), "connected to compute node at {host} ({socket_addr}) sslmode={:?}", self.0.get_ssl_mode() ); @@ -330,7 +330,7 @@ impl ConnCfg { params, cancel_closure, aux, - _guage: Metrics::get().proxy.db_connections.guard(ctx.protocol), + _guage: Metrics::get().proxy.db_connections.guard(ctx.protocol()), }; Ok(connection) diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 6504919760..1412095505 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -31,11 +31,8 @@ pub struct ProxyConfig { pub http_config: HttpConfig, pub authentication_config: AuthenticationConfig, pub require_client_ip: bool, - pub disable_ip_check_for_http: bool, - pub redis_rps_limit: Vec, pub region: String, pub handshake_timeout: Duration, - pub aws_region: String, pub wake_compute_retry_config: RetryConfig, pub connect_compute_locks: ApiLocks, pub connect_to_compute_retry_config: RetryConfig, @@ -55,7 +52,6 @@ pub struct TlsConfig { } pub struct HttpConfig { - pub request_timeout: tokio::time::Duration, pub pool_options: GlobalConnPoolOptions, pub cancel_set: CancelSet, pub client_conn_threshold: u64, diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 7a9637066f..15fc0134b3 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -292,7 +292,7 @@ pub struct NodeInfo { impl NodeInfo { pub async fn connect( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, timeout: Duration, ) -> Result { self.config @@ -330,20 +330,20 @@ pub(crate) trait Api { /// We still have to mock the scram to avoid leaking information that user doesn't exist. async fn get_role_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result; async fn get_allowed_ips_and_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError>; /// Wake up the compute node and return the corresponding connection info. async fn wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result; } @@ -363,7 +363,7 @@ pub enum ConsoleBackend { impl Api for ConsoleBackend { async fn get_role_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { use ConsoleBackend::*; @@ -378,7 +378,7 @@ impl Api for ConsoleBackend { async fn get_allowed_ips_and_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError> { use ConsoleBackend::*; @@ -393,7 +393,7 @@ impl Api for ConsoleBackend { async fn wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { use ConsoleBackend::*; diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index cfe491f2aa..2093da7562 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -158,7 +158,7 @@ impl super::Api for Api { #[tracing::instrument(skip_all)] async fn get_role_secret( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { Ok(CachedRoleSecret::new_uncached( @@ -168,7 +168,7 @@ impl super::Api for Api { async fn get_allowed_ips_and_secret( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { Ok(( @@ -182,7 +182,7 @@ impl super::Api for Api { #[tracing::instrument(skip_all)] async fn wake_compute( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, _user_info: &ComputeUserInfo, ) -> Result { self.do_wake_compute().map_ok(Cached::new_uncached).await diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 768cd2fdfa..7eda238b66 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -57,7 +57,7 @@ impl Api { async fn do_get_auth_info( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { if !self @@ -69,7 +69,7 @@ impl Api { info!("endpoint is not valid, skipping the request"); return Ok(AuthInfo::default()); } - let request_id = ctx.session_id.to_string(); + let request_id = ctx.session_id().to_string(); let application_name = ctx.console_application_name(); async { let request = self @@ -77,7 +77,7 @@ impl Api { .get("proxy_get_role_secret") .header("X-Request-ID", &request_id) .header("Authorization", format!("Bearer {}", &self.jwt)) - .query(&[("session_id", ctx.session_id)]) + .query(&[("session_id", ctx.session_id())]) .query(&[ ("application_name", application_name.as_str()), ("project", user_info.endpoint.as_str()), @@ -87,7 +87,7 @@ impl Api { info!(url = request.url().as_str(), "sending http request"); let start = Instant::now(); - let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Cplane); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); let response = self.endpoint.execute(request).await?; drop(pause); info!(duration = ?start.elapsed(), "received http response"); @@ -130,10 +130,10 @@ impl Api { async fn do_wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { - let request_id = ctx.session_id.to_string(); + let request_id = ctx.session_id().to_string(); let application_name = ctx.console_application_name(); async { let mut request_builder = self @@ -141,7 +141,7 @@ impl Api { .get("proxy_wake_compute") .header("X-Request-ID", &request_id) .header("Authorization", format!("Bearer {}", &self.jwt)) - .query(&[("session_id", ctx.session_id)]) + .query(&[("session_id", ctx.session_id())]) .query(&[ ("application_name", application_name.as_str()), ("project", user_info.endpoint.as_str()), @@ -156,7 +156,7 @@ impl Api { info!(url = request.url().as_str(), "sending http request"); let start = Instant::now(); - let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Cplane); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); let response = self.endpoint.execute(request).await?; drop(pause); info!(duration = ?start.elapsed(), "received http response"); @@ -192,7 +192,7 @@ impl super::Api for Api { #[tracing::instrument(skip_all)] async fn get_role_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { let normalized_ep = &user_info.endpoint.normalize(); @@ -226,7 +226,7 @@ impl super::Api for Api { async fn get_allowed_ips_and_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { let normalized_ep = &user_info.endpoint.normalize(); @@ -268,7 +268,7 @@ impl super::Api for Api { #[tracing::instrument(skip_all)] async fn wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { let key = user_info.endpoint_cache_key(); diff --git a/proxy/src/context.rs b/proxy/src/context.rs index ff79ba8275..e925f67233 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -7,13 +7,14 @@ use smol_str::SmolStr; use std::net::IpAddr; use tokio::sync::mpsc; use tracing::{field::display, info, info_span, Span}; +use try_lock::TryLock; use uuid::Uuid; use crate::{ console::messages::{ColdStartInfo, MetricsAuxInfo}, error::ErrorKind, intern::{BranchIdInt, ProjectIdInt}, - metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol}, + metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting}, DbName, EndpointId, RoleName, }; @@ -28,7 +29,15 @@ pub static LOG_CHAN_DISCONNECT: OnceCell> /// /// This data should **not** be used for connection logic, only for observability and limiting purposes. /// All connection logic should instead use strongly typed state machines, not a bunch of Options. -pub struct RequestMonitoring { +pub struct RequestMonitoring( + /// To allow easier use of the ctx object, we have interior mutability. + /// I would typically use a RefCell but that would break the `Send` requirements + /// so we need something with thread-safety. `TryLock` is a cheap alternative + /// that offers similar semantics to a `RefCell` but with synchronisation. + TryLock, +); + +struct RequestMonitoringInner { pub peer_addr: IpAddr, pub session_id: Uuid, pub protocol: Protocol, @@ -85,7 +94,7 @@ impl RequestMonitoring { role = tracing::field::Empty, ); - Self { + let inner = RequestMonitoringInner { peer_addr, session_id, protocol, @@ -110,7 +119,9 @@ impl RequestMonitoring { disconnect_sender: LOG_CHAN_DISCONNECT.get().and_then(|tx| tx.upgrade()), latency_timer: LatencyTimer::new(protocol), disconnect_timestamp: None, - } + }; + + Self(TryLock::new(inner)) } #[cfg(test)] @@ -119,48 +130,177 @@ impl RequestMonitoring { } pub fn console_application_name(&self) -> String { + let this = self.0.try_lock().expect("should not deadlock"); format!( "{}/{}", - self.application.as_deref().unwrap_or_default(), - self.protocol + this.application.as_deref().unwrap_or_default(), + this.protocol ) } - pub fn set_rejected(&mut self, rejected: bool) { - self.rejected = Some(rejected); + pub fn set_rejected(&self, rejected: bool) { + let mut this = self.0.try_lock().expect("should not deadlock"); + this.rejected = Some(rejected); } - pub fn set_cold_start_info(&mut self, info: ColdStartInfo) { + pub fn set_cold_start_info(&self, info: ColdStartInfo) { + self.0 + .try_lock() + .expect("should not deadlock") + .set_cold_start_info(info); + } + + pub fn set_db_options(&self, options: StartupMessageParams) { + let mut this = self.0.try_lock().expect("should not deadlock"); + this.set_application(options.get("application_name").map(SmolStr::from)); + if let Some(user) = options.get("user") { + this.set_user(user.into()); + } + if let Some(dbname) = options.get("database") { + this.set_dbname(dbname.into()); + } + + this.pg_options = Some(options); + } + + pub fn set_project(&self, x: MetricsAuxInfo) { + let mut this = self.0.try_lock().expect("should not deadlock"); + if this.endpoint_id.is_none() { + this.set_endpoint_id(x.endpoint_id.as_str().into()) + } + this.branch = Some(x.branch_id); + this.project = Some(x.project_id); + this.set_cold_start_info(x.cold_start_info); + } + + pub fn set_project_id(&self, project_id: ProjectIdInt) { + let mut this = self.0.try_lock().expect("should not deadlock"); + this.project = Some(project_id); + } + + pub fn set_endpoint_id(&self, endpoint_id: EndpointId) { + self.0 + .try_lock() + .expect("should not deadlock") + .set_endpoint_id(endpoint_id); + } + + pub fn set_dbname(&self, dbname: DbName) { + self.0 + .try_lock() + .expect("should not deadlock") + .set_dbname(dbname); + } + + pub fn set_user(&self, user: RoleName) { + self.0 + .try_lock() + .expect("should not deadlock") + .set_user(user); + } + + pub fn set_auth_method(&self, auth_method: AuthMethod) { + let mut this = self.0.try_lock().expect("should not deadlock"); + this.auth_method = Some(auth_method); + } + + pub fn has_private_peer_addr(&self) -> bool { + self.0 + .try_lock() + .expect("should not deadlock") + .has_private_peer_addr() + } + + pub fn set_error_kind(&self, kind: ErrorKind) { + let mut this = self.0.try_lock().expect("should not deadlock"); + // Do not record errors from the private address to metrics. + if !this.has_private_peer_addr() { + Metrics::get().proxy.errors_total.inc(kind); + } + if let Some(ep) = &this.endpoint_id { + let metric = &Metrics::get().proxy.endpoints_affected_by_errors; + let label = metric.with_labels(kind); + metric.get_metric(label).measure(ep); + } + this.error_kind = Some(kind); + } + + pub fn set_success(&self) { + let mut this = self.0.try_lock().expect("should not deadlock"); + this.success = true; + } + + pub fn log_connect(&self) { + self.0 + .try_lock() + .expect("should not deadlock") + .log_connect(); + } + + pub fn protocol(&self) -> Protocol { + self.0.try_lock().expect("should not deadlock").protocol + } + + pub fn span(&self) -> Span { + self.0.try_lock().expect("should not deadlock").span.clone() + } + + pub fn session_id(&self) -> Uuid { + self.0.try_lock().expect("should not deadlock").session_id + } + + pub fn peer_addr(&self) -> IpAddr { + self.0.try_lock().expect("should not deadlock").peer_addr + } + + pub fn cold_start_info(&self) -> ColdStartInfo { + self.0 + .try_lock() + .expect("should not deadlock") + .cold_start_info + } + + pub fn latency_timer_pause(&self, waiting_for: Waiting) -> LatencyTimerPause { + LatencyTimerPause { + ctx: self, + start: tokio::time::Instant::now(), + waiting_for, + } + } + + pub fn success(&self) { + self.0 + .try_lock() + .expect("should not deadlock") + .latency_timer + .success() + } +} + +pub struct LatencyTimerPause<'a> { + ctx: &'a RequestMonitoring, + start: tokio::time::Instant, + waiting_for: Waiting, +} + +impl Drop for LatencyTimerPause<'_> { + fn drop(&mut self) { + self.ctx + .0 + .try_lock() + .expect("should not deadlock") + .latency_timer + .unpause(self.start, self.waiting_for); + } +} + +impl RequestMonitoringInner { + fn set_cold_start_info(&mut self, info: ColdStartInfo) { self.cold_start_info = info; self.latency_timer.cold_start_info(info); } - pub fn set_db_options(&mut self, options: StartupMessageParams) { - self.set_application(options.get("application_name").map(SmolStr::from)); - if let Some(user) = options.get("user") { - self.set_user(user.into()); - } - if let Some(dbname) = options.get("database") { - self.set_dbname(dbname.into()); - } - - self.pg_options = Some(options); - } - - pub fn set_project(&mut self, x: MetricsAuxInfo) { - if self.endpoint_id.is_none() { - self.set_endpoint_id(x.endpoint_id.as_str().into()) - } - self.branch = Some(x.branch_id); - self.project = Some(x.project_id); - self.set_cold_start_info(x.cold_start_info); - } - - pub fn set_project_id(&mut self, project_id: ProjectIdInt) { - self.project = Some(project_id); - } - - pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) { + fn set_endpoint_id(&mut self, endpoint_id: EndpointId) { if self.endpoint_id.is_none() { self.span.record("ep", display(&endpoint_id)); let metric = &Metrics::get().proxy.connecting_endpoints; @@ -176,44 +316,23 @@ impl RequestMonitoring { } } - pub fn set_dbname(&mut self, dbname: DbName) { + fn set_dbname(&mut self, dbname: DbName) { self.dbname = Some(dbname); } - pub fn set_user(&mut self, user: RoleName) { + fn set_user(&mut self, user: RoleName) { self.span.record("role", display(&user)); self.user = Some(user); } - pub fn set_auth_method(&mut self, auth_method: AuthMethod) { - self.auth_method = Some(auth_method); - } - - pub fn has_private_peer_addr(&self) -> bool { + fn has_private_peer_addr(&self) -> bool { match self.peer_addr { IpAddr::V4(ip) => ip.is_private(), _ => false, } } - pub fn set_error_kind(&mut self, kind: ErrorKind) { - // Do not record errors from the private address to metrics. - if !self.has_private_peer_addr() { - Metrics::get().proxy.errors_total.inc(kind); - } - if let Some(ep) = &self.endpoint_id { - let metric = &Metrics::get().proxy.endpoints_affected_by_errors; - let label = metric.with_labels(kind); - metric.get_metric(label).measure(ep); - } - self.error_kind = Some(kind); - } - - pub fn set_success(&mut self) { - self.success = true; - } - - pub fn log_connect(&mut self) { + fn log_connect(&mut self) { let outcome = if self.success { ConnectOutcome::Success } else { @@ -256,7 +375,7 @@ impl RequestMonitoring { } } -impl Drop for RequestMonitoring { +impl Drop for RequestMonitoringInner { fn drop(&mut self) { if self.sender.is_some() { self.log_connect(); diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 543a458274..bb02a476fc 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -23,7 +23,7 @@ use utils::backoff; use crate::{config::remote_storage_from_toml, context::LOG_CHAN_DISCONNECT}; -use super::{RequestMonitoring, LOG_CHAN}; +use super::{RequestMonitoringInner, LOG_CHAN}; #[derive(clap::Args, Clone, Debug)] pub struct ParquetUploadArgs { @@ -118,8 +118,8 @@ impl<'a> serde::Serialize for Options<'a> { } } -impl From<&RequestMonitoring> for RequestData { - fn from(value: &RequestMonitoring) -> Self { +impl From<&RequestMonitoringInner> for RequestData { + fn from(value: &RequestMonitoringInner) -> Self { Self { session_id: value.session_id, peer_addr: value.peer_addr.to_string(), diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index db25ac0311..0167553e30 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -370,6 +370,7 @@ pub struct CancellationRequest { pub kind: CancellationOutcome, } +#[derive(Clone, Copy)] pub enum Waiting { Cplane, Client, @@ -398,12 +399,6 @@ pub struct LatencyTimer { outcome: ConnectOutcome, } -pub struct LatencyTimerPause<'a> { - timer: &'a mut LatencyTimer, - start: time::Instant, - waiting_for: Waiting, -} - impl LatencyTimer { pub fn new(protocol: Protocol) -> Self { Self { @@ -417,11 +412,13 @@ impl LatencyTimer { } } - pub fn pause(&mut self, waiting_for: Waiting) -> LatencyTimerPause<'_> { - LatencyTimerPause { - timer: self, - start: Instant::now(), - waiting_for, + pub fn unpause(&mut self, start: Instant, waiting_for: Waiting) { + let dur = start.elapsed(); + match waiting_for { + Waiting::Cplane => self.accumulated.cplane += dur, + Waiting::Client => self.accumulated.client += dur, + Waiting::Compute => self.accumulated.compute += dur, + Waiting::RetryTimeout => self.accumulated.retry += dur, } } @@ -438,18 +435,6 @@ impl LatencyTimer { } } -impl Drop for LatencyTimerPause<'_> { - fn drop(&mut self) { - let dur = self.start.elapsed(); - match self.waiting_for { - Waiting::Cplane => self.timer.accumulated.cplane += dur, - Waiting::Client => self.timer.accumulated.client += dur, - Waiting::Compute => self.timer.accumulated.compute += dur, - Waiting::RetryTimeout => self.timer.accumulated.retry += dur, - } - } -} - #[derive(FixedCardinalityLabel, Clone, Copy, Debug)] pub enum ConnectOutcome { Success, diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 3edefcf21a..2182f38fe7 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -113,18 +113,18 @@ pub async fn task_main( } }; - let mut ctx = RequestMonitoring::new( + let ctx = RequestMonitoring::new( session_id, peer_addr, crate::metrics::Protocol::Tcp, &config.region, ); - let span = ctx.span.clone(); + let span = ctx.span(); let startup = Box::pin( handle_client( config, - &mut ctx, + &ctx, cancellation_handler, socket, ClientMode::Tcp, @@ -240,7 +240,7 @@ impl ReportableError for ClientRequestError { pub async fn handle_client( config: &'static ProxyConfig, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, cancellation_handler: Arc, stream: S, mode: ClientMode, @@ -248,25 +248,25 @@ pub async fn handle_client( conn_gauge: NumClientConnectionsGuard<'static>, ) -> Result>, ClientRequestError> { info!( - protocol = %ctx.protocol, + protocol = %ctx.protocol(), "handling interactive connection from client" ); let metrics = &Metrics::get().proxy; - let proto = ctx.protocol; + let proto = ctx.protocol(); let _request_gauge = metrics.connection_requests.guard(proto); let tls = config.tls_config.as_ref(); let record_handshake_error = !ctx.has_private_peer_addr(); - let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Client); - let do_handshake = handshake(stream, mode.handshake_tls(tls), record_handshake_error); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client); + let do_handshake = handshake(ctx, stream, mode.handshake_tls(tls), record_handshake_error); let (mut stream, params) = match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? { HandshakeData::Startup(stream, params) => (stream, params), HandshakeData::Cancel(cancel_key_data) => { return Ok(cancellation_handler - .cancel_session(cancel_key_data, ctx.session_id) + .cancel_session(cancel_key_data, ctx.session_id()) .await .map(|()| None)?) } diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 82180aaee3..f38e43ba5a 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -46,7 +46,7 @@ pub trait ConnectMechanism { type Error: From; async fn connect_once( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, node_info: &console::CachedNodeInfo, timeout: time::Duration, ) -> Result; @@ -58,7 +58,7 @@ pub trait ConnectMechanism { pub trait ComputeConnectBackend { async fn wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, ) -> Result; fn get_keys(&self) -> Option<&ComputeCredentialKeys>; @@ -81,7 +81,7 @@ impl ConnectMechanism for TcpMechanism<'_> { #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] async fn connect_once( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, node_info: &console::CachedNodeInfo, timeout: time::Duration, ) -> Result { @@ -98,7 +98,7 @@ impl ConnectMechanism for TcpMechanism<'_> { /// Try to connect to the compute node, retrying if necessary. #[tracing::instrument(skip_all)] pub async fn connect_to_compute( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, mechanism: &M, user_info: &B, allow_self_signed_compute: bool, @@ -126,7 +126,7 @@ where .await { Ok(res) => { - ctx.latency_timer.success(); + ctx.success(); Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { outcome: ConnectOutcome::Success, @@ -178,7 +178,7 @@ where .await { Ok(res) => { - ctx.latency_timer.success(); + ctx.success(); Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { outcome: ConnectOutcome::Success, @@ -209,9 +209,7 @@ where let wait_duration = retry_after(num_retries, connect_to_compute_retry_config); num_retries += 1; - let pause = ctx - .latency_timer - .pause(crate::metrics::Waiting::RetryTimeout); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::RetryTimeout); time::sleep(wait_duration).await; drop(pause); } diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index d488aea927..c65a5558d9 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -10,6 +10,7 @@ use tracing::{info, warn}; use crate::{ auth::endpoint_sni, config::{TlsConfig, PG_ALPN_PROTOCOL}, + context::RequestMonitoring, error::ReportableError, metrics::Metrics, proxy::ERR_INSECURE_CONNECTION, @@ -67,6 +68,7 @@ pub enum HandshakeData { /// we also take an extra care of propagating only the select handshake errors to client. #[tracing::instrument(skip_all)] pub async fn handshake( + ctx: &RequestMonitoring, stream: S, mut tls: Option<&TlsConfig>, record_handshake_error: bool, @@ -80,8 +82,6 @@ pub async fn handshake( let mut stream = PqStream::new(Stream::from_raw(stream)); loop { let msg = stream.read_startup_packet().await?; - info!("received {msg:?}"); - use FeStartupPacket::*; match msg { SslRequest { direct } => match stream.get_ref() { @@ -145,16 +145,20 @@ pub async fn handshake( let conn_info = tls_stream.get_ref().1; + // try parse endpoint + let ep = conn_info + .server_name() + .and_then(|sni| endpoint_sni(sni, &tls.common_names).ok().flatten()); + if let Some(ep) = ep { + ctx.set_endpoint_id(ep); + } + // check the ALPN, if exists, as required. match conn_info.alpn_protocol() { None | Some(PG_ALPN_PROTOCOL) => {} Some(other) => { - // try parse ep for better error - let ep = conn_info.server_name().and_then(|sni| { - endpoint_sni(sni, &tls.common_names).ok().flatten() - }); let alpn = String::from_utf8_lossy(other); - warn!(?ep, %alpn, "unexpected ALPN"); + warn!(%alpn, "unexpected ALPN"); return Err(HandshakeError::ProtocolViolation); } } @@ -198,7 +202,12 @@ pub async fn handshake( .await?; } - info!(?version, session_type = "normal", "successful handshake"); + info!( + ?version, + ?params, + session_type = "normal", + "successful handshake" + ); break Ok(HandshakeData::Startup(stream, params)); } // downgrade protocol version diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 5186a9e1b0..d8308c4f2a 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -155,7 +155,7 @@ impl TestAuth for Scram { stream: &mut PqStream>, ) -> anyhow::Result<()> { let outcome = auth::AuthFlow::new(stream) - .begin(auth::Scram(&self.0, &mut RequestMonitoring::test())) + .begin(auth::Scram(&self.0, &RequestMonitoring::test())) .await? .authenticate() .await?; @@ -175,10 +175,11 @@ async fn dummy_proxy( auth: impl TestAuth + Send, ) -> anyhow::Result<()> { let (client, _) = read_proxy_protocol(client).await?; - let mut stream = match handshake(client, tls.as_ref(), false).await? { - HandshakeData::Startup(stream, _) => stream, - HandshakeData::Cancel(_) => bail!("cancellation not supported"), - }; + let mut stream = + match handshake(&RequestMonitoring::test(), client, tls.as_ref(), false).await? { + HandshakeData::Startup(stream, _) => stream, + HandshakeData::Cancel(_) => bail!("cancellation not supported"), + }; auth.authenticate(&mut stream).await?; @@ -457,7 +458,7 @@ impl ConnectMechanism for TestConnectMechanism { async fn connect_once( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, _node_info: &console::CachedNodeInfo, _timeout: std::time::Duration, ) -> Result { @@ -565,7 +566,7 @@ fn helper_create_connect_info( async fn connect_to_compute_success() { let _ = env_logger::try_init(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -573,7 +574,7 @@ async fn connect_to_compute_success() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -583,7 +584,7 @@ async fn connect_to_compute_success() { async fn connect_to_compute_retry() { let _ = env_logger::try_init(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -591,7 +592,7 @@ async fn connect_to_compute_retry() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -602,7 +603,7 @@ async fn connect_to_compute_retry() { async fn connect_to_compute_non_retry_1() { let _ = env_logger::try_init(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Fail]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -610,7 +611,7 @@ async fn connect_to_compute_non_retry_1() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) .await .unwrap_err(); mechanism.verify(); @@ -621,7 +622,7 @@ async fn connect_to_compute_non_retry_1() { async fn connect_to_compute_non_retry_2() { let _ = env_logger::try_init(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Fail, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -629,7 +630,7 @@ async fn connect_to_compute_non_retry_2() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -641,7 +642,7 @@ async fn connect_to_compute_non_retry_3() { let _ = env_logger::try_init(); tokio::time::pause(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Retry, Retry, Retry, Retry, Retry]); let user_info = helper_create_connect_info(&mechanism); @@ -656,7 +657,7 @@ async fn connect_to_compute_non_retry_3() { backoff_factor: 2.0, }; connect_to_compute( - &mut ctx, + &ctx, &mechanism, &user_info, false, @@ -673,7 +674,7 @@ async fn connect_to_compute_non_retry_3() { async fn wake_retry() { let _ = env_logger::try_init(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![WakeRetry, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -681,7 +682,7 @@ async fn wake_retry() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -692,7 +693,7 @@ async fn wake_retry() { async fn wake_non_retry() { let _ = env_logger::try_init(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![WakeRetry, WakeFail]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -700,7 +701,7 @@ async fn wake_non_retry() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) .await .unwrap_err(); mechanism.verify(); diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index cbfc9f1358..c8ec2b2db6 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -1,7 +1,7 @@ //! Man-in-the-middle tests //! //! Channel binding should prevent a proxy server -//! - that has access to create valid certificates - +//! *that has access to create valid certificates* //! from controlling the TLS connection. use std::fmt::Debug; @@ -34,9 +34,14 @@ async fn proxy_mitm( tokio::spawn(async move { // begin handshake with end_server let end_server = connect_tls(server2, client_config2.make_tls_connect().unwrap()).await; - let (end_client, startup) = match handshake(client1, Some(&server_config1), false) - .await - .unwrap() + let (end_client, startup) = match handshake( + &RequestMonitoring::test(), + client1, + Some(&server_config1), + false, + ) + .await + .unwrap() { HandshakeData::Startup(stream, params) => (stream, params), HandshakeData::Cancel(_) => panic!("cancellation not supported"), diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index fef349aac0..5b06e8f054 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -14,7 +14,7 @@ use super::connect_compute::ComputeConnectBackend; pub async fn wake_compute( num_retries: &mut u32, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, api: &B, config: RetryConfig, ) -> Result { @@ -52,9 +52,7 @@ pub async fn wake_compute( let wait_duration = retry_after(*num_retries, config); *num_retries += 1; - let pause = ctx - .latency_timer - .pause(crate::metrics::Waiting::RetryTimeout); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::RetryTimeout); tokio::time::sleep(wait_duration).await; drop(pause); } diff --git a/proxy/src/scram/countmin.rs b/proxy/src/scram/countmin.rs index f2b794e5fe..e8e7ef5c86 100644 --- a/proxy/src/scram/countmin.rs +++ b/proxy/src/scram/countmin.rs @@ -158,7 +158,7 @@ mod tests { let N = 1021 * 4096; let sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q); - let memory = std::mem::size_of::() * sketch.buckets.len(); + let memory = size_of::() * sketch.buckets.len(); let time = sketch.depth; (memory, time) } diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index efa999ed7d..115bef7375 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -334,7 +334,7 @@ async fn request_handler( &config.region, ); - let span = ctx.span.clone(); + let span = ctx.span(); info!(parent: &span, "performing websocket upgrade"); let (response, websocket) = framed_websockets::upgrade::upgrade(&mut request) @@ -367,7 +367,7 @@ async fn request_handler( crate::metrics::Protocol::Http, &config.region, ); - let span = ctx.span.clone(); + let span = ctx.span(); sql_over_http::handle(config, ctx, request, backend, http_cancellation_token) .instrument(span) diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 2e9bb7d1ed..295ea1a1c7 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -35,15 +35,15 @@ pub struct PoolingBackend { impl PoolingBackend { pub async fn authenticate( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, config: &AuthenticationConfig, conn_info: &ConnInfo, ) -> Result { let user_info = conn_info.user_info.clone(); let backend = self.config.auth_backend.as_ref().map(|_| user_info.clone()); let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?; - if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) { - return Err(AuthError::ip_address_not_allowed(ctx.peer_addr)); + if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) { + return Err(AuthError::ip_address_not_allowed(ctx.peer_addr())); } if !self .endpoint_rate_limiter @@ -100,7 +100,7 @@ impl PoolingBackend { #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] pub async fn connect_to_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, conn_info: ConnInfo, keys: ComputeCredentials, force_new: bool, @@ -222,7 +222,7 @@ impl ConnectMechanism for TokioMechanism { async fn connect_once( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, node_info: &CachedNodeInfo, timeout: Duration, ) -> Result { @@ -236,7 +236,7 @@ impl ConnectMechanism for TokioMechanism { .dbname(&self.conn_info.dbname) .connect_timeout(timeout); - let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let res = config.connect(tokio_postgres::NoTls).await; drop(pause); let (client, connection) = permit.release_result(res)?; diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index dbc58d48ec..e1dc44dc1c 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -377,7 +377,7 @@ impl GlobalConnPool { pub fn get( self: &Arc, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, conn_info: &ConnInfo, ) -> Result>, HttpConnError> { let mut client: Option> = None; @@ -409,9 +409,9 @@ impl GlobalConnPool { cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), "pool: reusing connection '{conn_info}'" ); - client.session.send(ctx.session_id)?; + client.session.send(ctx.session_id())?; ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); - ctx.latency_timer.success(); + ctx.success(); return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool))); } } @@ -465,19 +465,19 @@ impl GlobalConnPool { pub fn poll_client( global_pool: Arc>, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, conn_info: ConnInfo, client: C, mut connection: tokio_postgres::Connection, conn_id: uuid::Uuid, aux: MetricsAuxInfo, ) -> Client { - let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol); - let mut session_id = ctx.session_id; + let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol()); + let mut session_id = ctx.session_id(); let (tx, mut rx) = tokio::sync::watch::channel(session_id); let span = info_span!(parent: None, "connection", %conn_id); - let cold_start_info = ctx.cold_start_info; + let cold_start_info = ctx.cold_start_info(); span.in_scope(|| { info!(cold_start_info = cold_start_info.as_str(), %conn_info, %session_id, "new connection"); }); @@ -766,7 +766,6 @@ mod tests { opt_in: false, max_total_conns: 3, }, - request_timeout: Duration::from_secs(1), cancel_set: CancelSet::new(0), client_conn_threshold: u64::MAX, })); diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index c7fbfddf5e..e5b6536328 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -144,7 +144,7 @@ impl UserFacingError for ConnInfoError { } fn get_conn_info( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, headers: &HeaderMap, tls: &TlsConfig, ) -> Result { @@ -223,12 +223,12 @@ fn get_conn_info( // TODO: return different http error codes pub async fn handle( config: &'static ProxyConfig, - mut ctx: RequestMonitoring, + ctx: RequestMonitoring, request: Request, backend: Arc, cancel: CancellationToken, ) -> Result>, ApiError> { - let result = handle_inner(cancel, config, &mut ctx, request, backend).await; + let result = handle_inner(cancel, config, &ctx, request, backend).await; let mut response = match result { Ok(r) => { @@ -481,13 +481,16 @@ fn map_isolation_level_to_headers(level: IsolationLevel) -> Option async fn handle_inner( cancel: CancellationToken, config: &'static ProxyConfig, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, request: Request, backend: Arc, ) -> Result>, SqlOverHttpError> { - let _requeset_gauge = Metrics::get().proxy.connection_requests.guard(ctx.protocol); + let _requeset_gauge = Metrics::get() + .proxy + .connection_requests + .guard(ctx.protocol()); info!( - protocol = %ctx.protocol, + protocol = %ctx.protocol(), "handling interactive connection from client" ); @@ -543,7 +546,7 @@ async fn handle_inner( .await?; // not strictly necessary to mark success here, // but it's just insurance for if we forget it somewhere else - ctx.latency_timer.success(); + ctx.success(); Ok::<_, HttpConnError>(client) } .map_err(SqlOverHttpError::from), diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 0d5b88f07b..4fba4d141c 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -129,7 +129,7 @@ impl AsyncBufRead for WebSocketRw { pub async fn serve_websocket( config: &'static ProxyConfig, - mut ctx: RequestMonitoring, + ctx: RequestMonitoring, websocket: OnUpgrade, cancellation_handler: Arc, endpoint_rate_limiter: Arc, @@ -145,7 +145,7 @@ pub async fn serve_websocket( let res = Box::pin(handle_client( config, - &mut ctx, + &ctx, cancellation_handler, WebSocketRw::new(websocket), ClientMode::Websockets { hostname }, diff --git a/pyproject.toml b/pyproject.toml index c7f1a07512..cfb569b2ba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,7 @@ [tool.poetry] -name = "neon" -version = "0.1.0" description = "" authors = [] +package-mode = false [tool.poetry.dependencies] python = "^3.9" @@ -41,6 +40,8 @@ zstandard = "^0.21.0" httpx = {extras = ["http2"], version = "^0.26.0"} pytest-repeat = "^0.9.3" websockets = "^12.0" +clickhouse-connect = "^0.7.16" +kafka-python = "^2.0.2" [tool.poetry.group.dev.dependencies] mypy = "==1.3.0" @@ -74,6 +75,7 @@ module = [ "allure.*", "allure_commons.*", "allure_pytest.*", + "kafka.*", ] ignore_missing_imports = true diff --git a/rust-toolchain.toml b/rust-toolchain.toml index dcae25a287..3510359591 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.79.0" +channel = "1.80.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 2365fd0587..41c2d3fe08 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -170,11 +170,6 @@ struct Args { /// still needed for existing replication connection. #[arg(long)] walsenders_keep_horizon: bool, - /// Enable partial backup. If disabled, safekeeper will not upload partial - /// segments to remote storage. - /// TODO: now partial backup is always enabled, remove this flag. - #[arg(long)] - partial_backup_enabled: bool, /// Controls how long backup will wait until uploading the partial segment. #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_PARTIAL_BACKUP_TIMEOUT, verbatim_doc_comment)] partial_backup_timeout: Duration, @@ -347,7 +342,6 @@ async fn main() -> anyhow::Result<()> { sk_auth_token, current_thread_runtime: args.current_thread_runtime, walsenders_keep_horizon: args.walsenders_keep_horizon, - partial_backup_enabled: true, partial_backup_timeout: args.partial_backup_timeout, disable_periodic_broker_push: args.disable_periodic_broker_push, enable_offload: args.enable_offload, diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index cd3c7fe526..d574bb438f 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -27,7 +27,7 @@ pub const SK_FORMAT_VERSION: u32 = 9; pub const CONTROL_FILE_NAME: &str = "safekeeper.control"; // needed to atomically update the state using `rename` const CONTROL_FILE_NAME_PARTIAL: &str = "safekeeper.control.partial"; -pub const CHECKSUM_SIZE: usize = std::mem::size_of::(); +pub const CHECKSUM_SIZE: usize = size_of::(); /// Storage should keep actual state inside of it. It should implement Deref /// trait to access state fields and have persist method for updating that state. diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index f45bfb95fa..2c519433ef 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -143,7 +143,12 @@ impl postgres_backend::Handler self.tenant_id.unwrap_or(TenantId::from([0u8; 16])), self.timeline_id.unwrap_or(TimelineId::from([0u8; 16])), ); - tracing::Span::current().record("ttid", tracing::field::display(ttid)); + tracing::Span::current() + .record("ttid", tracing::field::display(ttid)) + .record( + "application_name", + tracing::field::debug(self.appname.clone()), + ); Ok(()) } else { diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 8f2920ada3..2e11a279ca 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -21,6 +21,7 @@ pub mod json_ctrl; pub mod metrics; pub mod patch_control_file; pub mod pull_timeline; +pub mod rate_limit; pub mod receive_wal; pub mod recovery; pub mod remove_wal; @@ -53,6 +54,7 @@ pub mod defaults { pub const DEFAULT_PARTIAL_BACKUP_TIMEOUT: &str = "15m"; pub const DEFAULT_CONTROL_FILE_SAVE_INTERVAL: &str = "300s"; pub const DEFAULT_PARTIAL_BACKUP_CONCURRENCY: &str = "5"; + pub const DEFAULT_EVICTION_CONCURRENCY: usize = 2; // By default, our required residency before eviction is the same as the period that passes // before uploading a partial segment, so that in normal operation the eviction can happen @@ -91,7 +93,6 @@ pub struct SafeKeeperConf { pub sk_auth_token: Option, pub current_thread_runtime: bool, pub walsenders_keep_horizon: bool, - pub partial_backup_enabled: bool, pub partial_backup_timeout: Duration, pub disable_periodic_broker_push: bool, pub enable_offload: bool, @@ -135,7 +136,6 @@ impl SafeKeeperConf { max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES, current_thread_runtime: false, walsenders_keep_horizon: false, - partial_backup_enabled: false, partial_backup_timeout: Duration::from_secs(0), disable_periodic_broker_push: false, enable_offload: false, diff --git a/safekeeper/src/rate_limit.rs b/safekeeper/src/rate_limit.rs new file mode 100644 index 0000000000..72373b5786 --- /dev/null +++ b/safekeeper/src/rate_limit.rs @@ -0,0 +1,49 @@ +use std::sync::Arc; + +use rand::Rng; + +use crate::metrics::MISC_OPERATION_SECONDS; + +/// Global rate limiter for background tasks. +#[derive(Clone)] +pub struct RateLimiter { + partial_backup: Arc, + eviction: Arc, +} + +impl RateLimiter { + /// Create a new rate limiter. + /// - `partial_backup_max`: maximum number of concurrent partial backups. + /// - `eviction_max`: maximum number of concurrent timeline evictions. + pub fn new(partial_backup_max: usize, eviction_max: usize) -> Self { + Self { + partial_backup: Arc::new(tokio::sync::Semaphore::new(partial_backup_max)), + eviction: Arc::new(tokio::sync::Semaphore::new(eviction_max)), + } + } + + /// Get a permit for partial backup. This will block if the maximum number of concurrent + /// partial backups is reached. + pub async fn acquire_partial_backup(&self) -> tokio::sync::OwnedSemaphorePermit { + let _timer = MISC_OPERATION_SECONDS + .with_label_values(&["partial_permit_acquire"]) + .start_timer(); + self.partial_backup + .clone() + .acquire_owned() + .await + .expect("semaphore is closed") + } + + /// Try to get a permit for timeline eviction. This will return None if the maximum number of + /// concurrent timeline evictions is reached. + pub fn try_acquire_eviction(&self) -> Option { + self.eviction.clone().try_acquire_owned().ok() + } +} + +/// Generate a random duration that is a fraction of the given duration. +pub fn rand_duration(duration: &std::time::Duration) -> std::time::Duration { + let randf64 = rand::thread_rng().gen_range(0.0..1.0); + duration.mul_f64(randf64) +} diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 132e5ec32f..57935d879f 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -25,6 +25,7 @@ use utils::{ use storage_broker::proto::SafekeeperTimelineInfo; use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; +use crate::rate_limit::RateLimiter; use crate::receive_wal::WalReceivers; use crate::safekeeper::{ AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, ServerInfo, Term, TermLsn, @@ -36,7 +37,7 @@ use crate::timeline_guard::ResidenceGuard; use crate::timeline_manager::{AtomicStatus, ManagerCtl}; use crate::timelines_set::TimelinesSet; use crate::wal_backup::{self}; -use crate::wal_backup_partial::{PartialRemoteSegment, RateLimiter}; +use crate::wal_backup_partial::PartialRemoteSegment; use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION}; use crate::metrics::{FullTimelineInfo, WalStorageMetrics, MISC_OPERATION_SECONDS}; diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index 7947d83eb4..ae6f3f4b7e 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -5,7 +5,6 @@ use anyhow::Context; use camino::Utf8PathBuf; use remote_storage::RemotePath; -use std::time::Instant; use tokio::{ fs::File, io::{AsyncRead, AsyncWriteExt}, @@ -15,6 +14,7 @@ use utils::crashsafe::durable_rename; use crate::{ metrics::{EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED}, + rate_limit::rand_duration, timeline_manager::{Manager, StateSnapshot}, wal_backup, wal_backup_partial::{self, PartialRemoteSegment}, @@ -50,7 +50,6 @@ impl Manager { .flush_lsn .segment_number(self.wal_seg_size) == self.last_removed_segno + 1 - && self.resident_since.elapsed() >= self.conf.eviction_min_resident } /// Evict the timeline to remote storage. @@ -112,7 +111,8 @@ impl Manager { return; } - self.resident_since = Instant::now(); + self.evict_not_before = + tokio::time::Instant::now() + rand_duration(&self.conf.eviction_min_resident); info!("successfully restored evicted timeline"); } diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index debf8c824f..482614fac7 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -23,6 +23,7 @@ use utils::lsn::Lsn; use crate::{ control_file::{FileStorage, Storage}, metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS}, + rate_limit::{rand_duration, RateLimiter}, recovery::recovery_main, remove_wal::calc_horizon_lsn, safekeeper::Term, @@ -32,7 +33,7 @@ use crate::{ timeline_guard::{AccessService, GuardId, ResidenceGuard}, timelines_set::{TimelineSetGuard, TimelinesSet}, wal_backup::{self, WalBackupTaskHandle}, - wal_backup_partial::{self, PartialRemoteSegment, RateLimiter}, + wal_backup_partial::{self, PartialRemoteSegment}, SafeKeeperConf, }; @@ -185,11 +186,11 @@ pub(crate) struct Manager { // misc pub(crate) access_service: AccessService, - pub(crate) partial_backup_rate_limiter: RateLimiter, + pub(crate) global_rate_limiter: RateLimiter, // Anti-flapping state: we evict timelines eagerly if they are inactive, but should not // evict them if they go inactive very soon after being restored. - pub(crate) resident_since: std::time::Instant, + pub(crate) evict_not_before: Instant, } /// This task gets spawned alongside each timeline and is responsible for managing the timeline's @@ -202,7 +203,7 @@ pub async fn main_task( broker_active_set: Arc, manager_tx: tokio::sync::mpsc::UnboundedSender, mut manager_rx: tokio::sync::mpsc::UnboundedReceiver, - partial_backup_rate_limiter: RateLimiter, + global_rate_limiter: RateLimiter, ) { tli.set_status(Status::Started); @@ -220,7 +221,7 @@ pub async fn main_task( conf, broker_active_set, manager_tx, - partial_backup_rate_limiter, + global_rate_limiter, ) .await; @@ -254,9 +255,29 @@ pub async fn main_task( mgr.set_status(Status::UpdatePartialBackup); mgr.update_partial_backup(&state_snapshot).await; - if mgr.conf.enable_offload && mgr.ready_for_eviction(&next_event, &state_snapshot) { - mgr.set_status(Status::EvictTimeline); - mgr.evict_timeline().await; + let now = Instant::now(); + if mgr.evict_not_before > now { + // we should wait until evict_not_before + update_next_event(&mut next_event, mgr.evict_not_before); + } + + if mgr.conf.enable_offload + && mgr.evict_not_before <= now + && mgr.ready_for_eviction(&next_event, &state_snapshot) + { + // check rate limiter and evict timeline if possible + match mgr.global_rate_limiter.try_acquire_eviction() { + Some(_permit) => { + mgr.set_status(Status::EvictTimeline); + mgr.evict_timeline().await; + } + None => { + // we can't evict timeline now, will try again later + mgr.evict_not_before = + Instant::now() + rand_duration(&mgr.conf.eviction_min_resident); + update_next_event(&mut next_event, mgr.evict_not_before); + } + } } } @@ -334,11 +355,10 @@ impl Manager { conf: SafeKeeperConf, broker_active_set: Arc, manager_tx: tokio::sync::mpsc::UnboundedSender, - partial_backup_rate_limiter: RateLimiter, + global_rate_limiter: RateLimiter, ) -> Manager { let (is_offloaded, partial_backup_uploaded) = tli.bootstrap_mgr().await; Manager { - conf, wal_seg_size: tli.get_wal_seg_size().await, walsenders: tli.get_walsenders().clone(), state_version_rx: tli.get_state_version_rx(), @@ -353,8 +373,10 @@ impl Manager { partial_backup_uploaded, access_service: AccessService::new(manager_tx), tli, - partial_backup_rate_limiter, - resident_since: std::time::Instant::now(), + global_rate_limiter, + // to smooth out evictions spike after restart + evict_not_before: Instant::now() + rand_duration(&conf.eviction_min_resident), + conf, } } @@ -522,8 +544,8 @@ impl Manager { /// Spawns partial WAL backup task if needed. async fn update_partial_backup(&mut self, state: &StateSnapshot) { - // check if partial backup is enabled and should be started - if !self.conf.is_wal_backup_enabled() || !self.conf.partial_backup_enabled { + // check if WAL backup is enabled and should be started + if !self.conf.is_wal_backup_enabled() { return; } @@ -541,7 +563,7 @@ impl Manager { self.partial_backup_task = Some(tokio::spawn(wal_backup_partial::main_task( self.wal_resident_timeline(), self.conf.clone(), - self.partial_backup_rate_limiter.clone(), + self.global_rate_limiter.clone(), ))); } diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index f57da5c7cb..6662e18817 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -2,10 +2,11 @@ //! All timelines should always be present in this map, this is done by loading them //! all from the disk on startup and keeping them in memory. +use crate::defaults::DEFAULT_EVICTION_CONCURRENCY; +use crate::rate_limit::RateLimiter; use crate::safekeeper::ServerInfo; use crate::timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError}; use crate::timelines_set::TimelinesSet; -use crate::wal_backup_partial::RateLimiter; use crate::SafeKeeperConf; use anyhow::{bail, Context, Result}; use camino::Utf8PathBuf; @@ -31,7 +32,7 @@ struct GlobalTimelinesState { conf: Option, broker_active_set: Arc, load_lock: Arc>, - partial_backup_rate_limiter: RateLimiter, + global_rate_limiter: RateLimiter, } // Used to prevent concurrent timeline loading. @@ -50,7 +51,7 @@ impl GlobalTimelinesState { ( self.get_conf().clone(), self.broker_active_set.clone(), - self.partial_backup_rate_limiter.clone(), + self.global_rate_limiter.clone(), ) } @@ -85,7 +86,7 @@ static TIMELINES_STATE: Lazy> = Lazy::new(|| { conf: None, broker_active_set: Arc::new(TimelinesSet::default()), load_lock: Arc::new(tokio::sync::Mutex::new(TimelineLoadLock)), - partial_backup_rate_limiter: RateLimiter::new(1), + global_rate_limiter: RateLimiter::new(1, 1), }) }); @@ -99,7 +100,10 @@ impl GlobalTimelines { // lock, so use explicit block let tenants_dir = { let mut state = TIMELINES_STATE.lock().unwrap(); - state.partial_backup_rate_limiter = RateLimiter::new(conf.partial_backup_concurrency); + state.global_rate_limiter = RateLimiter::new( + conf.partial_backup_concurrency, + DEFAULT_EVICTION_CONCURRENCY, + ); state.conf = Some(conf); // Iterate through all directories and load tenants for all directories diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 7ecee178f3..234273e133 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -545,7 +545,10 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { &cancel, ) .await? - .keys; + .keys + .into_iter() + .map(|o| o.key) + .collect::>(); if files.is_empty() { return Ok(()); // done } @@ -613,7 +616,7 @@ pub async fn copy_s3_segments( let uploaded_segments = &files .iter() - .filter_map(|file| file.object_name().map(ToOwned::to_owned)) + .filter_map(|o| o.key.object_name().map(ToOwned::to_owned)) .collect::>(); debug!( diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index b1efa9749f..52765b0e98 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -18,8 +18,6 @@ //! This way control file stores information about all potentially existing //! remote partial segments and can clean them up after uploading a newer version. -use std::sync::Arc; - use camino::Utf8PathBuf; use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; use remote_storage::RemotePath; @@ -30,6 +28,7 @@ use utils::lsn::Lsn; use crate::{ metrics::{MISC_OPERATION_SECONDS, PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS}, + rate_limit::{rand_duration, RateLimiter}, safekeeper::Term, timeline::WalResidentTimeline, timeline_manager::StateSnapshot, @@ -37,30 +36,6 @@ use crate::{ SafeKeeperConf, }; -#[derive(Clone)] -pub struct RateLimiter { - semaphore: Arc, -} - -impl RateLimiter { - pub fn new(permits: usize) -> Self { - Self { - semaphore: Arc::new(tokio::sync::Semaphore::new(permits)), - } - } - - async fn acquire_owned(&self) -> tokio::sync::OwnedSemaphorePermit { - let _timer = MISC_OPERATION_SECONDS - .with_label_values(&["partial_permit_acquire"]) - .start_timer(); - self.semaphore - .clone() - .acquire_owned() - .await - .expect("semaphore is closed") - } -} - #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub enum UploadStatus { /// Upload is in progress. This status should be used only for garbage collection, @@ -352,6 +327,7 @@ pub async fn main_task( ) -> Option { debug!("started"); let await_duration = conf.partial_backup_timeout; + let mut first_iteration = true; let (_, persistent_state) = tli.get_state().await; let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx(); @@ -419,6 +395,15 @@ pub async fn main_task( } } + // smoothing the load after restart, by sleeping for a random time. + // if this is not the first iteration, we will wait for the full await_duration + let await_duration = if first_iteration { + first_iteration = false; + rand_duration(&await_duration) + } else { + await_duration + }; + // fixing the segno and waiting some time to prevent reuploading the same segment too often let pending_segno = backup.segno(flush_lsn_rx.borrow().lsn); let timeout = tokio::time::sleep(await_duration); @@ -454,7 +439,7 @@ pub async fn main_task( } // limit concurrent uploads - let _upload_permit = limiter.acquire_owned().await; + let _upload_permit = limiter.acquire_partial_backup().await; let prepared = backup.prepare_upload().await; if let Some(seg) = &uploaded_segment { diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index 091571111e..16f7748eb4 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -43,7 +43,7 @@ pub async fn task_main( error!("connection handler exited: {}", err); } } - .instrument(info_span!("", cid = %conn_id, ttid = field::Empty)), + .instrument(info_span!("", cid = %conn_id, ttid = field::Empty, application_name = field::Empty)), ); } } diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index 0c6d97ddfa..771d905c90 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -181,7 +181,6 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { sk_auth_token: None, current_thread_runtime: false, walsenders_keep_horizon: false, - partial_backup_enabled: false, partial_backup_timeout: Duration::from_secs(0), disable_periodic_broker_push: false, enable_offload: false, diff --git a/safekeeper/tests/walproposer_sim/walproposer_disk.rs b/safekeeper/tests/walproposer_sim/walproposer_disk.rs index aa329bd2f0..123cd6bad6 100644 --- a/safekeeper/tests/walproposer_sim/walproposer_disk.rs +++ b/safekeeper/tests/walproposer_sim/walproposer_disk.rs @@ -172,7 +172,7 @@ fn write_walrecord_to_disk( let mut freespace = insert_freespace(curr_ptr); let mut written: usize = 0; - assert!(freespace >= std::mem::size_of::()); + assert!(freespace >= size_of::()); for mut rdata in rdatas { while rdata.len() >= freespace { diff --git a/scripts/benchmark_durations.py b/scripts/benchmark_durations.py index 01f34a1b96..4ca433679a 100755 --- a/scripts/benchmark_durations.py +++ b/scripts/benchmark_durations.py @@ -67,6 +67,7 @@ FALLBACK_DURATION = { "test_runner/performance/test_copy.py::test_copy[neon]": 13.817, "test_runner/performance/test_copy.py::test_copy[vanilla]": 11.736, "test_runner/performance/test_gc_feedback.py::test_gc_feedback": 575.735, + "test_runner/performance/test_gc_feedback.py::test_gc_feedback_with_snapshots": 575.735, "test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 14.868, "test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 14.393, "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 20.588, diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 0a4af543ab..15acd0e49c 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -642,8 +642,7 @@ async fn main() -> Result<(), Box> { logging::replace_panic_hook_with_tracing_panic_hook().forget(); // initialize sentry if SENTRY_DSN is provided let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); - info!("version: {GIT_VERSION}"); - info!("build_tag: {BUILD_TAG}"); + info!("version: {GIT_VERSION} build_tag: {BUILD_TAG}"); metrics::set_build_info_metric(GIT_VERSION, BUILD_TAG); // On any shutdown signal, log receival and exit. diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index b54dea5d47..ecaac04915 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -18,6 +18,7 @@ anyhow.workspace = true aws-config.workspace = true bytes.workspace = true camino.workspace = true +chrono.workspace = true clap.workspace = true fail.workspace = true futures.workspace = true @@ -31,6 +32,7 @@ once_cell.workspace = true pageserver_api.workspace = true pageserver_client.workspace = true postgres_connection.workspace = true +rand.workspace = true reqwest = { workspace = true, features = ["stream"] } routerify.workspace = true serde.workspace = true @@ -44,7 +46,12 @@ scopeguard.workspace = true strum.workspace = true strum_macros.workspace = true -diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] } +diesel = { version = "2.1.4", features = [ + "serde_json", + "postgres", + "r2d2", + "chrono", +] } diesel_migrations = { version = "2.1.0" } r2d2 = { version = "0.8.10" } @@ -52,4 +59,3 @@ utils = { path = "../libs/utils/" } metrics = { path = "../libs/metrics/" } control_plane = { path = "../control_plane" } workspace_hack = { version = "0.1", path = "../workspace_hack" } - diff --git a/storage_controller/migrations/2024-07-23-191537_create_metadata_health/down.sql b/storage_controller/migrations/2024-07-23-191537_create_metadata_health/down.sql new file mode 100644 index 0000000000..1ecfc8786f --- /dev/null +++ b/storage_controller/migrations/2024-07-23-191537_create_metadata_health/down.sql @@ -0,0 +1 @@ +DROP TABLE metadata_health; \ No newline at end of file diff --git a/storage_controller/migrations/2024-07-23-191537_create_metadata_health/up.sql b/storage_controller/migrations/2024-07-23-191537_create_metadata_health/up.sql new file mode 100644 index 0000000000..fa87eda119 --- /dev/null +++ b/storage_controller/migrations/2024-07-23-191537_create_metadata_health/up.sql @@ -0,0 +1,14 @@ +CREATE TABLE metadata_health ( + tenant_id VARCHAR NOT NULL, + shard_number INTEGER NOT NULL, + shard_count INTEGER NOT NULL, + PRIMARY KEY(tenant_id, shard_number, shard_count), + -- Rely on cascade behavior for delete + FOREIGN KEY(tenant_id, shard_number, shard_count) REFERENCES tenant_shards ON DELETE CASCADE, + healthy BOOLEAN NOT NULL DEFAULT TRUE, + last_scrubbed_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + + +INSERT INTO metadata_health(tenant_id, shard_number, shard_count) +SELECT tenant_id, shard_number, shard_count FROM tenant_shards; diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs index 14cda0a289..1bb9c17f30 100644 --- a/storage_controller/src/heartbeater.rs +++ b/storage_controller/src/heartbeater.rs @@ -22,7 +22,8 @@ struct HeartbeaterTask { state: HashMap, - max_unavailable_interval: Duration, + max_offline_interval: Duration, + max_warming_up_interval: Duration, jwt_token: Option, } @@ -31,7 +32,9 @@ pub(crate) enum PageserverState { Available { last_seen_at: Instant, utilization: PageserverUtilization, - new: bool, + }, + WarmingUp { + started_at: Instant, }, Offline, } @@ -57,12 +60,18 @@ pub(crate) struct Heartbeater { impl Heartbeater { pub(crate) fn new( jwt_token: Option, - max_unavailable_interval: Duration, + max_offline_interval: Duration, + max_warming_up_interval: Duration, cancel: CancellationToken, ) -> Self { let (sender, receiver) = tokio::sync::mpsc::unbounded_channel::(); - let mut heartbeater = - HeartbeaterTask::new(receiver, jwt_token, max_unavailable_interval, cancel); + let mut heartbeater = HeartbeaterTask::new( + receiver, + jwt_token, + max_offline_interval, + max_warming_up_interval, + cancel, + ); tokio::task::spawn(async move { heartbeater.run().await }); Self { sender } @@ -88,14 +97,16 @@ impl HeartbeaterTask { fn new( receiver: tokio::sync::mpsc::UnboundedReceiver, jwt_token: Option, - max_unavailable_interval: Duration, + max_offline_interval: Duration, + max_warming_up_interval: Duration, cancel: CancellationToken, ) -> Self { Self { receiver, cancel, state: HashMap::new(), - max_unavailable_interval, + max_offline_interval, + max_warming_up_interval, jwt_token, } } @@ -128,16 +139,15 @@ impl HeartbeaterTask { heartbeat_futs.push({ let jwt_token = self.jwt_token.clone(); let cancel = self.cancel.clone(); - let new_node = !self.state.contains_key(node_id); // Clone the node and mark it as available such that the request // goes through to the pageserver even when the node is marked offline. // This doesn't impact the availability observed by [`crate::service::Service`]. - let mut node = node.clone(); - node.set_availability(NodeAvailability::Active(UtilizationScore::worst())); + let mut node_clone = node.clone(); + node_clone.set_availability(NodeAvailability::Active(UtilizationScore::worst())); async move { - let response = node + let response = node_clone .with_client_retries( |client| async move { client.get_utilization().await }, &jwt_token, @@ -161,7 +171,12 @@ impl HeartbeaterTask { PageserverState::Available { last_seen_at: Instant::now(), utilization, - new: new_node, + } + } else if let NodeAvailability::WarmingUp(last_seen_at) = + node.get_availability() + { + PageserverState::WarmingUp { + started_at: last_seen_at, } } else { PageserverState::Offline @@ -187,53 +202,67 @@ impl HeartbeaterTask { } } } + + let mut warming_up = 0; + let mut offline = 0; + for state in new_state.values() { + match state { + PageserverState::WarmingUp { .. } => { + warming_up += 1; + } + PageserverState::Offline { .. } => offline += 1, + PageserverState::Available { .. } => {} + } + } + tracing::info!( - "Heartbeat round complete for {} nodes, {} offline", + "Heartbeat round complete for {} nodes, {} warming-up, {} offline", new_state.len(), - new_state - .values() - .filter(|s| match s { - PageserverState::Available { .. } => { - false - } - PageserverState::Offline => true, - }) - .count() + warming_up, + offline ); let mut deltas = Vec::new(); let now = Instant::now(); - for (node_id, ps_state) in new_state { + for (node_id, ps_state) in new_state.iter_mut() { use std::collections::hash_map::Entry::*; - let entry = self.state.entry(node_id); + let entry = self.state.entry(*node_id); let mut needs_update = false; match entry { Occupied(ref occ) => match (occ.get(), &ps_state) { (PageserverState::Offline, PageserverState::Offline) => {} (PageserverState::Available { last_seen_at, .. }, PageserverState::Offline) => { - if now - *last_seen_at >= self.max_unavailable_interval { - deltas.push((node_id, ps_state.clone())); + if now - *last_seen_at >= self.max_offline_interval { + deltas.push((*node_id, ps_state.clone())); needs_update = true; } } + (_, PageserverState::WarmingUp { started_at }) => { + if now - *started_at >= self.max_warming_up_interval { + *ps_state = PageserverState::Offline; + } + + deltas.push((*node_id, ps_state.clone())); + needs_update = true; + } _ => { - deltas.push((node_id, ps_state.clone())); + deltas.push((*node_id, ps_state.clone())); needs_update = true; } }, Vacant(_) => { // This is a new node. Don't generate a delta for it. - deltas.push((node_id, ps_state.clone())); + deltas.push((*node_id, ps_state.clone())); } } match entry { Occupied(mut occ) if needs_update => { - (*occ.get_mut()) = ps_state; + (*occ.get_mut()) = ps_state.clone(); } Vacant(vac) => { - vac.insert(ps_state); + vac.insert(ps_state.clone()); } _ => {} } diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 8fb4be93e0..e8513b31eb 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -3,14 +3,18 @@ use crate::metrics::{ METRICS_REGISTRY, }; use crate::reconciler::ReconcileError; -use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT}; +use crate::service::{LeadershipStatus, Service, STARTUP_RECONCILE_TIMEOUT}; use anyhow::Context; use futures::Future; use hyper::header::CONTENT_TYPE; use hyper::{Body, Request, Response}; use hyper::{StatusCode, Uri}; use metrics::{BuildInfo, NeonMetrics}; -use pageserver_api::controller_api::TenantCreateRequest; +use pageserver_api::controller_api::{ + MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse, + MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse, + TenantCreateRequest, +}; use pageserver_api::models::{ TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest, TenantTimeTravelRequest, TimelineCreateRequest, @@ -560,6 +564,51 @@ async fn handle_cancel_node_fill(req: Request) -> Result, A json_response(StatusCode::ACCEPTED, ()) } +async fn handle_metadata_health_update(mut req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Scrubber)?; + + let update_req = json_request::(&mut req).await?; + let state = get_state(&req); + + state.service.metadata_health_update(update_req).await?; + + json_response(StatusCode::OK, MetadataHealthUpdateResponse {}) +} + +async fn handle_metadata_health_list_unhealthy( + req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + let unhealthy_tenant_shards = state.service.metadata_health_list_unhealthy().await?; + + json_response( + StatusCode::OK, + MetadataHealthListUnhealthyResponse { + unhealthy_tenant_shards, + }, + ) +} + +async fn handle_metadata_health_list_outdated( + mut req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let list_outdated_req = json_request::(&mut req).await?; + let state = get_state(&req); + let health_records = state + .service + .metadata_health_list_outdated(list_outdated_req.not_scrubbed_for) + .await?; + + json_response( + StatusCode::OK, + MetadataHealthListOutdatedResponse { health_records }, + ) +} + async fn handle_tenant_shard_split( service: Arc, mut req: Request, @@ -607,6 +656,13 @@ async fn handle_tenant_update_policy(mut req: Request) -> Result) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + json_response(StatusCode::OK, state.service.step_down().await) +} + async fn handle_tenant_drop(req: Request) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; @@ -734,6 +790,47 @@ struct RequestMeta { at: Instant, } +pub fn prologue_leadership_status_check_middleware< + B: hyper::body::HttpBody + Send + Sync + 'static, +>() -> Middleware { + Middleware::pre(move |req| async move { + let state = get_state(&req); + let leadership_status = state.service.get_leadership_status(); + + enum AllowedRoutes<'a> { + All, + Some(Vec<&'a str>), + } + + let allowed_routes = match leadership_status { + LeadershipStatus::Leader => AllowedRoutes::All, + LeadershipStatus::SteppedDown => { + // TODO: does it make sense to allow /status here? + AllowedRoutes::Some(["/control/v1/step_down", "/status", "/metrics"].to_vec()) + } + LeadershipStatus::Candidate => { + AllowedRoutes::Some(["/ready", "/status", "/metrics"].to_vec()) + } + }; + + let uri = req.uri().to_string(); + match allowed_routes { + AllowedRoutes::All => Ok(req), + AllowedRoutes::Some(allowed) if allowed.contains(&uri.as_str()) => Ok(req), + _ => { + tracing::info!( + "Request {} not allowed due to current leadership state", + req.uri() + ); + + Err(ApiError::ResourceUnavailable( + format!("Current leadership status is {leadership_status}").into(), + )) + } + } + }) +} + fn prologue_metrics_middleware( ) -> Middleware { Middleware::pre(move |req| async move { @@ -820,6 +917,7 @@ pub fn make_router( build_info: BuildInfo, ) -> RouterBuilder { let mut router = endpoint::make_router() + .middleware(prologue_leadership_status_check_middleware()) .middleware(prologue_metrics_middleware()) .middleware(epilogue_metrics_middleware()); if auth.is_some() { @@ -938,6 +1036,28 @@ pub fn make_router( RequestName("control_v1_cancel_node_fill"), ) }) + // Metadata health operations + .post("/control/v1/metadata_health/update", |r| { + named_request_span( + r, + handle_metadata_health_update, + RequestName("control_v1_metadata_health_update"), + ) + }) + .get("/control/v1/metadata_health/unhealthy", |r| { + named_request_span( + r, + handle_metadata_health_list_unhealthy, + RequestName("control_v1_metadata_health_list_unhealthy"), + ) + }) + .post("/control/v1/metadata_health/outdated", |r| { + named_request_span( + r, + handle_metadata_health_list_outdated, + RequestName("control_v1_metadata_health_list_outdated"), + ) + }) // TODO(vlad): endpoint for cancelling drain and fill // Tenant Shard operations .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| { @@ -971,6 +1091,9 @@ pub fn make_router( RequestName("control_v1_tenant_policy"), ) }) + .put("/control/v1/step_down", |r| { + named_request_span(r, handle_step_down, RequestName("control_v1_step_down")) + }) // Tenant operations // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity. diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 789f96beb3..2799f21fdc 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -9,11 +9,14 @@ use std::time::Duration; use storage_controller::http::make_router; use storage_controller::metrics::preinitialize_metrics; use storage_controller::persistence::Persistence; +use storage_controller::service::chaos_injector::ChaosInjector; use storage_controller::service::{ - Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, + Config, Service, MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT, + RECONCILER_CONCURRENCY_DEFAULT, }; use tokio::signal::unix::SignalKind; use tokio_util::sync::CancellationToken; +use tracing::Instrument; use utils::auth::{JwtAuth, SwappableJwtAuth}; use utils::logging::{self, LogFormat}; @@ -61,7 +64,12 @@ struct Cli { /// Grace period before marking unresponsive pageserver offline #[arg(long)] - max_unavailable_interval: Option, + max_offline_interval: Option, + + /// More tolerant grace period before marking unresponsive pagserver offline used + /// around pageserver restarts + #[arg(long)] + max_warming_up_interval: Option, /// Size threshold for automatically splitting shards (disabled by default) #[arg(long)] @@ -80,6 +88,10 @@ struct Cli { // TODO: make `cfg(feature = "testing")` #[arg(long)] neon_local_repo_dir: Option, + + /// Chaos testing + #[arg(long)] + chaos_interval: Option, } enum StrictMode { @@ -254,10 +266,14 @@ async fn async_main() -> anyhow::Result<()> { jwt_token: secrets.jwt_token, control_plane_jwt_token: secrets.control_plane_jwt_token, compute_hook_url: args.compute_hook_url, - max_unavailable_interval: args - .max_unavailable_interval + max_offline_interval: args + .max_offline_interval .map(humantime::Duration::into) - .unwrap_or(MAX_UNAVAILABLE_INTERVAL_DEFAULT), + .unwrap_or(MAX_OFFLINE_INTERVAL_DEFAULT), + max_warming_up_interval: args + .max_warming_up_interval + .map(humantime::Duration::into) + .unwrap_or(MAX_WARMING_UP_INTERVAL_DEFAULT), reconciler_concurrency: args .reconciler_concurrency .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT), @@ -299,6 +315,22 @@ async fn async_main() -> anyhow::Result<()> { tracing::info!("Serving on {0}", args.listen); let server_task = tokio::task::spawn(server); + let chaos_task = args.chaos_interval.map(|interval| { + let service = service.clone(); + let cancel = CancellationToken::new(); + let cancel_bg = cancel.clone(); + ( + tokio::task::spawn( + async move { + let mut chaos_injector = ChaosInjector::new(service, interval.into()); + chaos_injector.run(cancel_bg).await + } + .instrument(tracing::info_span!("chaos_injector")), + ), + cancel, + ) + }); + // Wait until we receive a signal let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?; let mut sigquit = tokio::signal::unix::signal(SignalKind::quit())?; @@ -327,6 +359,12 @@ async fn async_main() -> anyhow::Result<()> { } } + // If we were injecting chaos, stop that so that we're not calling into Service while it shuts down + if let Some((chaos_jh, chaos_cancel)) = chaos_task { + chaos_cancel.cancel(); + chaos_jh.await.ok(); + } + service.shutdown().await; tracing::info!("Service shutdown complete"); diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index ac9f22c739..a1a4b8543d 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -13,7 +13,10 @@ use metrics::NeonMetrics; use once_cell::sync::Lazy; use std::sync::Mutex; -use crate::persistence::{DatabaseError, DatabaseOperation}; +use crate::{ + persistence::{DatabaseError, DatabaseOperation}, + service::LeadershipStatus, +}; pub(crate) static METRICS_REGISTRY: Lazy = Lazy::new(StorageControllerMetrics::default); @@ -81,6 +84,8 @@ pub(crate) struct StorageControllerMetricGroup { #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] pub(crate) storage_controller_database_query_latency: measured::HistogramVec, + + pub(crate) storage_controller_leadership_status: measured::GaugeVec, } impl StorageControllerMetrics { @@ -156,6 +161,12 @@ pub(crate) struct DatabaseQueryLatencyLabelGroup { pub(crate) operation: DatabaseOperation, } +#[derive(measured::LabelGroup)] +#[label(set = LeadershipStatusGroupSet)] +pub(crate) struct LeadershipStatusGroup { + pub(crate) status: LeadershipStatus, +} + #[derive(FixedCardinalityLabel, Clone, Copy)] pub(crate) enum ReconcileOutcome { #[label(rename = "ok")] diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index fff44aaf26..ea765ca123 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -3,7 +3,7 @@ use std::{str::FromStr, time::Duration}; use pageserver_api::{ controller_api::{ NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy, - TenantLocateResponseShard, UtilizationScore, + TenantLocateResponseShard, }, shard::TenantShardId, }; @@ -46,6 +46,8 @@ pub(crate) struct Node { /// whether/how they changed it. pub(crate) enum AvailabilityTransition { ToActive, + ToWarmingUpFromActive, + ToWarmingUpFromOffline, ToOffline, Unchanged, } @@ -90,22 +92,34 @@ impl Node { } } + pub(crate) fn get_availability(&self) -> NodeAvailability { + self.availability + } + pub(crate) fn set_availability(&mut self, availability: NodeAvailability) { + use AvailabilityTransition::*; + use NodeAvailability::WarmingUp; + match self.get_availability_transition(availability) { - AvailabilityTransition::ToActive => { + ToActive => { // Give the node a new cancellation token, effectively resetting it to un-cancelled. Any // users of previously-cloned copies of the node will still see the old cancellation // state. For example, Reconcilers in flight will have to complete and be spawned // again to realize that the node has become available. self.cancel = CancellationToken::new(); } - AvailabilityTransition::ToOffline => { + ToOffline | ToWarmingUpFromActive => { // Fire the node's cancellation token to cancel any in-flight API requests to it self.cancel.cancel(); } - AvailabilityTransition::Unchanged => {} + Unchanged | ToWarmingUpFromOffline => {} + } + + if let (WarmingUp(crnt), WarmingUp(proposed)) = (self.availability, availability) { + self.availability = WarmingUp(std::cmp::max(crnt, proposed)); + } else { + self.availability = availability; } - self.availability = availability; } /// Without modifying the availability of the node, convert the intended availability @@ -120,16 +134,10 @@ impl Node { match (self.availability, availability) { (Offline, Active(_)) => ToActive, (Active(_), Offline) => ToOffline, - // Consider the case when the storage controller handles the re-attach of a node - // before the heartbeats detect that the node is back online. We still need - // [`Service::node_configure`] to attempt reconciliations for shards with an - // unknown observed location. - // The unsavoury match arm below handles this situation. - (Active(lhs), Active(rhs)) - if lhs == UtilizationScore::worst() && rhs < UtilizationScore::worst() => - { - ToActive - } + (Active(_), WarmingUp(_)) => ToWarmingUpFromActive, + (WarmingUp(_), Offline) => ToOffline, + (WarmingUp(_), Active(_)) => ToActive, + (Offline, WarmingUp(_)) => ToWarmingUpFromOffline, _ => Unchanged, } } @@ -147,7 +155,7 @@ impl Node { pub(crate) fn may_schedule(&self) -> MaySchedule { let score = match self.availability { NodeAvailability::Active(score) => score, - NodeAvailability::Offline => return MaySchedule::No, + NodeAvailability::Offline | NodeAvailability::WarmingUp(_) => return MaySchedule::No, }; match self.scheduling { diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index d8f31e86e5..64a3e597ce 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -8,6 +8,7 @@ use self::split_state::SplitState; use diesel::pg::PgConnection; use diesel::prelude::*; use diesel::Connection; +use pageserver_api::controller_api::MetadataHealthRecord; use pageserver_api::controller_api::ShardSchedulingPolicy; use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy}; use pageserver_api::models::TenantConfig; @@ -90,6 +91,10 @@ pub(crate) enum DatabaseOperation { UpdateTenantShard, DeleteTenant, UpdateTenantConfig, + UpdateMetadataHealth, + ListMetadataHealth, + ListMetadataHealthUnhealthy, + ListMetadataHealthOutdated, } #[must_use] @@ -307,15 +312,32 @@ impl Persistence { &self, shards: Vec, ) -> DatabaseResult<()> { - use crate::schema::tenant_shards::dsl::*; + use crate::schema::metadata_health; + use crate::schema::tenant_shards; + + let now = chrono::Utc::now(); + + let metadata_health_records = shards + .iter() + .map(|t| MetadataHealthPersistence { + tenant_id: t.tenant_id.clone(), + shard_number: t.shard_number, + shard_count: t.shard_count, + healthy: true, + last_scrubbed_at: now, + }) + .collect::>(); + self.with_measured_conn( DatabaseOperation::InsertTenantShards, move |conn| -> DatabaseResult<()> { - for tenant in &shards { - diesel::insert_into(tenant_shards) - .values(tenant) - .execute(conn)?; - } + diesel::insert_into(tenant_shards::table) + .values(&shards) + .execute(conn)?; + + diesel::insert_into(metadata_health::table) + .values(&metadata_health_records) + .execute(conn)?; Ok(()) }, ) @@ -329,10 +351,10 @@ impl Persistence { self.with_measured_conn( DatabaseOperation::DeleteTenant, move |conn| -> DatabaseResult<()> { + // `metadata_health` status (if exists) is also deleted based on the cascade behavior. diesel::delete(tenant_shards) .filter(tenant_id.eq(del_tenant_id.to_string())) .execute(conn)?; - Ok(()) }, ) @@ -675,6 +697,94 @@ impl Persistence { ) .await } + + /// Stores all the latest metadata health updates durably. Updates existing entry on conflict. + /// + /// **Correctness:** `metadata_health_updates` should all belong the tenant shards managed by the storage controller. + #[allow(dead_code)] + pub(crate) async fn update_metadata_health_records( + &self, + healthy_records: Vec, + unhealthy_records: Vec, + now: chrono::DateTime, + ) -> DatabaseResult<()> { + use crate::schema::metadata_health::dsl::*; + + self.with_measured_conn( + DatabaseOperation::UpdateMetadataHealth, + move |conn| -> DatabaseResult<_> { + diesel::insert_into(metadata_health) + .values(&healthy_records) + .on_conflict((tenant_id, shard_number, shard_count)) + .do_update() + .set((healthy.eq(true), last_scrubbed_at.eq(now))) + .execute(conn)?; + + diesel::insert_into(metadata_health) + .values(&unhealthy_records) + .on_conflict((tenant_id, shard_number, shard_count)) + .do_update() + .set((healthy.eq(false), last_scrubbed_at.eq(now))) + .execute(conn)?; + Ok(()) + }, + ) + .await + } + + /// Lists all the metadata health records. + #[allow(dead_code)] + pub(crate) async fn list_metadata_health_records( + &self, + ) -> DatabaseResult> { + self.with_measured_conn( + DatabaseOperation::ListMetadataHealth, + move |conn| -> DatabaseResult<_> { + Ok( + crate::schema::metadata_health::table + .load::(conn)?, + ) + }, + ) + .await + } + + /// Lists all the metadata health records that is unhealthy. + #[allow(dead_code)] + pub(crate) async fn list_unhealthy_metadata_health_records( + &self, + ) -> DatabaseResult> { + use crate::schema::metadata_health::dsl::*; + self.with_measured_conn( + DatabaseOperation::ListMetadataHealthUnhealthy, + move |conn| -> DatabaseResult<_> { + Ok(crate::schema::metadata_health::table + .filter(healthy.eq(false)) + .load::(conn)?) + }, + ) + .await + } + + /// Lists all the metadata health records that have not been updated since an `earlier` time. + #[allow(dead_code)] + pub(crate) async fn list_outdated_metadata_health_records( + &self, + earlier: chrono::DateTime, + ) -> DatabaseResult> { + use crate::schema::metadata_health::dsl::*; + + self.with_measured_conn( + DatabaseOperation::ListMetadataHealthOutdated, + move |conn| -> DatabaseResult<_> { + let query = metadata_health.filter(last_scrubbed_at.lt(earlier)); + let res = query.load::(conn)?; + + Ok(res) + }, + ) + .await + } } /// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably @@ -744,3 +854,59 @@ pub(crate) struct NodePersistence { pub(crate) listen_pg_addr: String, pub(crate) listen_pg_port: i32, } + +/// Tenant metadata health status that are stored durably. +#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)] +#[diesel(table_name = crate::schema::metadata_health)] +pub(crate) struct MetadataHealthPersistence { + #[serde(default)] + pub(crate) tenant_id: String, + #[serde(default)] + pub(crate) shard_number: i32, + #[serde(default)] + pub(crate) shard_count: i32, + + pub(crate) healthy: bool, + pub(crate) last_scrubbed_at: chrono::DateTime, +} + +impl MetadataHealthPersistence { + pub fn new( + tenant_shard_id: TenantShardId, + healthy: bool, + last_scrubbed_at: chrono::DateTime, + ) -> Self { + let tenant_id = tenant_shard_id.tenant_id.to_string(); + let shard_number = tenant_shard_id.shard_number.0 as i32; + let shard_count = tenant_shard_id.shard_count.literal() as i32; + + MetadataHealthPersistence { + tenant_id, + shard_number, + shard_count, + healthy, + last_scrubbed_at, + } + } + + #[allow(dead_code)] + pub(crate) fn get_tenant_shard_id(&self) -> Result { + Ok(TenantShardId { + tenant_id: TenantId::from_str(self.tenant_id.as_str())?, + shard_number: ShardNumber(self.shard_number as u8), + shard_count: ShardCount::new(self.shard_count as u8), + }) + } +} + +impl From for MetadataHealthRecord { + fn from(value: MetadataHealthPersistence) -> Self { + MetadataHealthRecord { + tenant_shard_id: value + .get_tenant_shard_id() + .expect("stored tenant id should be valid"), + healthy: value.healthy, + last_scrubbed_at: value.last_scrubbed_at, + } + } +} diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 886ceae90f..254fdb364e 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -12,6 +12,7 @@ use std::collections::HashMap; use std::sync::Arc; use std::time::{Duration, Instant}; use tokio_util::sync::CancellationToken; +use utils::failpoint_support; use utils::generation::Generation; use utils::id::{NodeId, TimelineId}; use utils::lsn::Lsn; @@ -655,11 +656,8 @@ impl Reconciler { // reconcile this location. This includes locations with different configurations, as well // as locations with unknown (None) observed state. - // The general case is to increment the generation. However, there are cases - // where this is not necessary: - // - if we are only updating the TenantConf part of the location - // - if we are only changing the attachment mode (e.g. going to attachedmulti or attachedstale) - // and the location was already in the correct generation + // Incrementing generation is the safe general case, but is inefficient for changes that only + // modify some details (e.g. the tenant's config). let increment_generation = match observed { None => true, Some(ObservedStateLocation { conf: None }) => true, @@ -668,18 +666,11 @@ impl Reconciler { }) => { let generations_match = observed.generation == wanted_conf.generation; - use LocationConfigMode::*; - let mode_transition_requires_gen_inc = - match (observed.mode, wanted_conf.mode) { - // Usually the short-lived attachment modes (multi and stale) are only used - // in the case of [`Self::live_migrate`], but it is simple to handle them correctly - // here too. Locations are allowed to go Single->Stale and Multi->Single within the same generation. - (AttachedSingle, AttachedStale) => false, - (AttachedMulti, AttachedSingle) => false, - (lhs, rhs) => lhs != rhs, - }; - - !generations_match || mode_transition_requires_gen_inc + // We may skip incrementing the generation if the location is already in the expected mode and + // generation. In principle it would also be safe to skip from certain other modes (e.g. AttachedStale), + // but such states are handled inside `live_migrate`, and if we see that state here we're cleaning up + // after a restart/crash, so fall back to the universally safe path of incrementing generation. + !generations_match || (observed.mode != wanted_conf.mode) } }; @@ -749,6 +740,8 @@ impl Reconciler { self.location_config(&node, conf, None, false).await?; } + failpoint_support::sleep_millis_async!("sleep-on-reconcile-epilogue"); + Ok(()) } diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs index ff37d0fe77..cb5ba3f38b 100644 --- a/storage_controller/src/schema.rs +++ b/storage_controller/src/schema.rs @@ -1,5 +1,15 @@ // @generated automatically by Diesel CLI. +diesel::table! { + metadata_health (tenant_id, shard_number, shard_count) { + tenant_id -> Varchar, + shard_number -> Int4, + shard_count -> Int4, + healthy -> Bool, + last_scrubbed_at -> Timestamptz, + } +} + diesel::table! { nodes (node_id) { node_id -> Int8, @@ -26,4 +36,4 @@ diesel::table! { } } -diesel::allow_tables_to_appear_in_same_query!(nodes, tenant_shards,); +diesel::allow_tables_to_appear_in_same_query!(metadata_health, nodes, tenant_shards,); diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 2a6d5d3578..e391ce65e6 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -15,7 +15,8 @@ use crate::{ }, compute_hook::NotifyError, id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard}, - persistence::{AbortShardSplitStatus, TenantFilter}, + metrics::LeadershipStatusGroup, + persistence::{AbortShardSplitStatus, MetadataHealthPersistence, TenantFilter}, reconciler::{ReconcileError, ReconcileUnits}, scheduler::{MaySchedule, ScheduleContext, ScheduleMode}, tenant_shard::{ @@ -32,11 +33,11 @@ use futures::{stream::FuturesUnordered, StreamExt}; use itertools::Itertools; use pageserver_api::{ controller_api::{ - NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, - ShardSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, - TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard, - TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest, - TenantShardMigrateResponse, UtilizationScore, + MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest, + NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, TenantCreateRequest, + TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse, + TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest, + TenantShardMigrateRequest, TenantShardMigrateResponse, UtilizationScore, }, models::{SecondaryProgress, TenantConfigRequest, TopTenantShardsRequest}, }; @@ -81,6 +82,9 @@ use crate::{ ReconcilerWaiter, TenantShard, }, }; +use serde::{Deserialize, Serialize}; + +pub mod chaos_injector; // For operations that should be quick, like attaching a new tenant const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5); @@ -100,9 +104,13 @@ pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); /// How long a node may be unresponsive to heartbeats before we declare it offline. /// This must be long enough to cover node restarts as well as normal operations: in future -/// it should be separated into distinct timeouts for startup vs. normal operation -/// (``) -pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(300); +pub const MAX_OFFLINE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30); + +/// How long a node may be unresponsive to heartbeats during start up before we declare it +/// offline. This is much more lenient than [`MAX_OFFLINE_INTERVAL_DEFAULT`] since the pageserver's +/// handling of the re-attach response may take a long time and blocks heartbeats from +/// being handled on the pageserver side. +pub const MAX_WARMING_UP_INTERVAL_DEFAULT: Duration = Duration::from_secs(300); #[derive(Clone, strum_macros::Display)] enum TenantOperations { @@ -127,6 +135,24 @@ enum NodeOperations { Delete, } +/// The leadership status for the storage controller process. +/// Allowed transitions are: +/// 1. Leader -> SteppedDown +/// 2. Candidate -> Leader +#[derive(Copy, Clone, strum_macros::Display, measured::FixedCardinalityLabel)] +#[strum(serialize_all = "snake_case")] +pub(crate) enum LeadershipStatus { + /// This is the steady state where the storage controller can produce + /// side effects in the cluster. + Leader, + /// We've been notified to step down by another candidate. No reconciliations + /// take place in this state. + SteppedDown, + /// Initial state for a new storage controller instance. Will attempt to assume leadership. + #[allow(unused)] + Candidate, +} + pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128; // Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately. @@ -136,6 +162,8 @@ const MAX_DELAYED_RECONCILES: usize = 10000; // Top level state available to all HTTP handlers struct ServiceState { + leadership_status: LeadershipStatus, + tenants: BTreeMap, nodes: Arc>, @@ -198,7 +226,21 @@ impl ServiceState { scheduler: Scheduler, delayed_reconcile_rx: tokio::sync::mpsc::Receiver, ) -> Self { + let status = &crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_leadership_status; + + status.set( + LeadershipStatusGroup { + status: LeadershipStatus::Leader, + }, + 1, + ); + Self { + // TODO: Starting up as Leader is a transient state. Once we enable rolling + // upgrades on the k8s side, we should start up as Candidate. + leadership_status: LeadershipStatus::Leader, tenants, nodes: Arc::new(nodes), scheduler, @@ -216,6 +258,37 @@ impl ServiceState { ) { (&mut self.nodes, &mut self.tenants, &mut self.scheduler) } + + fn get_leadership_status(&self) -> LeadershipStatus { + self.leadership_status + } + + fn step_down(&mut self) { + self.leadership_status = LeadershipStatus::SteppedDown; + + let status = &crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_leadership_status; + + status.set( + LeadershipStatusGroup { + status: LeadershipStatus::SteppedDown, + }, + 1, + ); + status.set( + LeadershipStatusGroup { + status: LeadershipStatus::Leader, + }, + 0, + ); + status.set( + LeadershipStatusGroup { + status: LeadershipStatus::Candidate, + }, + 0, + ); + } } #[derive(Clone)] @@ -236,7 +309,12 @@ pub struct Config { /// Grace period within which a pageserver does not respond to heartbeats, but is still /// considered active. Once the grace period elapses, the next heartbeat failure will /// mark the pagseserver offline. - pub max_unavailable_interval: Duration, + pub max_offline_interval: Duration, + + /// Extended grace period within which pageserver may not respond to heartbeats. + /// This extended grace period kicks in after the node has been drained for restart + /// and/or upon handling the re-attach request from a node. + pub max_warming_up_interval: Duration, /// How many Reconcilers may be spawned concurrently pub reconciler_concurrency: usize, @@ -269,7 +347,7 @@ pub struct Service { config: Config, persistence: Arc, compute_hook: Arc, - result_tx: tokio::sync::mpsc::UnboundedSender, + result_tx: tokio::sync::mpsc::UnboundedSender, heartbeater: Heartbeater, @@ -299,9 +377,15 @@ pub struct Service { // Process shutdown will fire this token cancel: CancellationToken, + // Child token of [`Service::cancel`] used by reconcilers + reconcilers_cancel: CancellationToken, + // Background tasks will hold this gate gate: Gate, + // Reconcilers background tasks will hold this gate + reconcilers_gate: Gate, + /// This waits for initial reconciliation with pageservers to complete. Until this barrier /// passes, it isn't safe to do any actions that mutate tenants. pub(crate) startup_complete: Barrier, @@ -388,6 +472,30 @@ struct ShardUpdate { generation: Option, } +enum StopReconciliationsReason { + ShuttingDown, + SteppingDown, +} + +impl std::fmt::Display for StopReconciliationsReason { + fn fmt(&self, writer: &mut std::fmt::Formatter) -> std::fmt::Result { + let s = match self { + Self::ShuttingDown => "Shutting down", + Self::SteppingDown => "Stepping down", + }; + write!(writer, "{}", s) + } +} + +pub(crate) enum ReconcileResultRequest { + ReconcileResult(ReconcileResult), + Stop, +} + +// TODO: move this into the storcon peer client when that gets added +#[derive(Serialize, Deserialize, Debug, Default)] +pub(crate) struct GlobalObservedState(HashMap); + impl Service { pub fn get_config(&self) -> &Config { &self.config @@ -587,6 +695,9 @@ impl Service { online_nodes.insert(node_id, utilization); } PageserverState::Offline => {} + PageserverState::WarmingUp { .. } => { + unreachable!("Nodes are never marked warming-up during startup reconcile") + } } } } @@ -741,7 +852,7 @@ impl Service { const BACKGROUND_RECONCILE_PERIOD: Duration = Duration::from_secs(20); let mut interval = tokio::time::interval(BACKGROUND_RECONCILE_PERIOD); - while !self.cancel.is_cancelled() { + while !self.reconcilers_cancel.is_cancelled() { tokio::select! { _ = interval.tick() => { let reconciles_spawned = self.reconcile_all(); @@ -754,7 +865,7 @@ impl Service { } } } - _ = self.cancel.cancelled() => return + _ = self.reconcilers_cancel.cancelled() => return } } } @@ -779,63 +890,54 @@ impl Service { let res = self.heartbeater.heartbeat(nodes).await; if let Ok(deltas) = res { for (node_id, state) in deltas.0 { - let (new_node, new_availability) = match state { - PageserverState::Available { - utilization, new, .. - } => ( - new, - NodeAvailability::Active(UtilizationScore( - utilization.utilization_score, - )), + let new_availability = match state { + PageserverState::Available { utilization, .. } => NodeAvailability::Active( + UtilizationScore(utilization.utilization_score), ), - PageserverState::Offline => (false, NodeAvailability::Offline), + PageserverState::WarmingUp { started_at } => { + NodeAvailability::WarmingUp(started_at) + } + PageserverState::Offline => { + // The node might have been placed in the WarmingUp state + // while the heartbeat round was on-going. Hence, filter out + // offline transitions for WarmingUp nodes that are still within + // their grace period. + if let Ok(NodeAvailability::WarmingUp(started_at)) = + self.get_node(node_id).await.map(|n| n.get_availability()) + { + let now = Instant::now(); + if now - started_at >= self.config.max_warming_up_interval { + NodeAvailability::Offline + } else { + NodeAvailability::WarmingUp(started_at) + } + } else { + NodeAvailability::Offline + } + } }; - if new_node { - // When the heartbeats detect a newly added node, we don't wish - // to attempt to reconcile the shards assigned to it. The node - // is likely handling it's re-attach response, so reconciling now - // would be counterproductive. - // - // Instead, update the in-memory state with the details learned about the - // node. - let mut locked = self.inner.write().unwrap(); - let (nodes, _tenants, scheduler) = locked.parts_mut(); + // This is the code path for geniune availability transitions (i.e node + // goes unavailable and/or comes back online). + let res = self + .node_configure(node_id, Some(new_availability), None) + .await; - let mut new_nodes = (**nodes).clone(); - - if let Some(node) = new_nodes.get_mut(&node_id) { - node.set_availability(new_availability); - scheduler.node_upsert(node); + match res { + Ok(()) => {} + Err(ApiError::NotFound(_)) => { + // This should be rare, but legitimate since the heartbeats are done + // on a snapshot of the nodes. + tracing::info!("Node {} was not found after heartbeat round", node_id); } - - locked.nodes = Arc::new(new_nodes); - } else { - // This is the code path for geniune availability transitions (i.e node - // goes unavailable and/or comes back online). - let res = self - .node_configure(node_id, Some(new_availability), None) - .await; - - match res { - Ok(()) => {} - Err(ApiError::NotFound(_)) => { - // This should be rare, but legitimate since the heartbeats are done - // on a snapshot of the nodes. - tracing::info!( - "Node {} was not found after heartbeat round", - node_id - ); - } - Err(err) => { - // Transition to active involves reconciling: if a node responds to a heartbeat then - // becomes unavailable again, we may get an error here. - tracing::error!( - "Failed to update node {} after heartbeat round: {}", - node_id, - err - ); - } + Err(err) => { + // Transition to active involves reconciling: if a node responds to a heartbeat then + // becomes unavailable again, we may get an error here. + tracing::error!( + "Failed to update node {} after heartbeat round: {}", + node_id, + err + ); } } } @@ -934,7 +1036,7 @@ impl Service { async fn process_results( &self, - mut result_rx: tokio::sync::mpsc::UnboundedReceiver, + mut result_rx: tokio::sync::mpsc::UnboundedReceiver, mut bg_compute_hook_result_rx: tokio::sync::mpsc::Receiver< Result<(), (TenantShardId, NotifyError)>, >, @@ -944,8 +1046,8 @@ impl Service { tokio::select! { r = result_rx.recv() => { match r { - Some(result) => {self.process_result(result);}, - None => {break;} + Some(ReconcileResultRequest::ReconcileResult(result)) => {self.process_result(result);}, + None | Some(ReconcileResultRequest::Stop) => {break;} } } _ = async{ @@ -971,9 +1073,6 @@ impl Service { } }; } - - // We should only fall through on shutdown - assert!(self.cancel.is_cancelled()); } async fn process_aborts( @@ -1150,9 +1249,12 @@ impl Service { tokio::sync::mpsc::channel(MAX_DELAYED_RECONCILES); let cancel = CancellationToken::new(); + let reconcilers_cancel = cancel.child_token(); + let heartbeater = Heartbeater::new( config.jwt_token.clone(), - config.max_unavailable_interval, + config.max_offline_interval, + config.max_warming_up_interval, cancel.clone(), ); let this = Arc::new(Self { @@ -1174,7 +1276,9 @@ impl Service { abort_tx, startup_complete: startup_complete.clone(), cancel, + reconcilers_cancel, gate: Gate::default(), + reconcilers_gate: Gate::default(), tenant_op_locks: Default::default(), node_op_locks: Default::default(), }); @@ -1664,21 +1768,23 @@ impl Service { | NodeSchedulingPolicy::Filling ); - if !node.is_available() || reset_scheduling { - let mut new_nodes = (**nodes).clone(); - if let Some(node) = new_nodes.get_mut(&reattach_req.node_id) { - if !node.is_available() { - node.set_availability(NodeAvailability::Active(UtilizationScore::worst())); - } - - if reset_scheduling { - node.set_scheduling(NodeSchedulingPolicy::Active); - } - - scheduler.node_upsert(node); - let new_nodes = Arc::new(new_nodes); - *nodes = new_nodes; + let mut new_nodes = (**nodes).clone(); + if let Some(node) = new_nodes.get_mut(&reattach_req.node_id) { + if reset_scheduling { + node.set_scheduling(NodeSchedulingPolicy::Active); } + + tracing::info!("Marking {} warming-up on reattach", reattach_req.node_id); + node.set_availability(NodeAvailability::WarmingUp(std::time::Instant::now())); + + scheduler.node_upsert(node); + let new_nodes = Arc::new(new_nodes); + *nodes = new_nodes; + } else { + tracing::error!( + "Reattaching node {} was removed while processing the request", + reattach_req.node_id + ); } } @@ -2848,7 +2954,6 @@ impl Service { } // no shard needs to go first/last; the operation should be idempotent - // TODO: it would be great to ensure that all shards return the same error let mut results = self .tenant_for_shards(targets, |tenant_shard_id, node| { futures::FutureExt::boxed(detach_one( @@ -2867,6 +2972,7 @@ impl Service { .filter(|(_, res)| res != &any.1) .collect::>(); if !mismatching.is_empty() { + // this can be hit by races which should not happen because operation lock on cplane let matching = results.len() - mismatching.len(); tracing::error!( matching, @@ -4719,6 +4825,15 @@ impl Service { // TODO: in the background, we should balance work back onto this pageserver } + // No action required for the intermediate unavailable state. + // When we transition into active or offline from the unavailable state, + // the correct handling above will kick in. + AvailabilityTransition::ToWarmingUpFromActive => { + tracing::info!("Node {} transition to unavailable from active", node_id); + } + AvailabilityTransition::ToWarmingUpFromOffline => { + tracing::info!("Node {} transition to unavailable from offline", node_id); + } AvailabilityTransition::Unchanged => { tracing::debug!("Node {} no availability change during config", node_id); } @@ -5117,7 +5232,7 @@ impl Service { } }; - let Ok(gate_guard) = self.gate.enter() else { + let Ok(gate_guard) = self.reconcilers_gate.enter() else { // Gate closed: we're shutting down, drop out. return None; }; @@ -5130,7 +5245,7 @@ impl Service { &self.persistence, units, gate_guard, - &self.cancel, + &self.reconcilers_cancel, ) } @@ -5576,18 +5691,27 @@ impl Service { Ok(std::cmp::max(waiter_count, reconciles_spawned)) } + async fn stop_reconciliations(&self, reason: StopReconciliationsReason) { + // Cancel all on-going reconciles and wait for them to exit the gate. + tracing::info!("{reason}: cancelling and waiting for in-flight reconciles"); + self.reconcilers_cancel.cancel(); + self.reconcilers_gate.close().await; + + // Signal the background loop in [`Service::process_results`] to exit once + // it has proccessed the results from all the reconciles we cancelled earlier. + tracing::info!("{reason}: processing results from previously in-flight reconciles"); + self.result_tx.send(ReconcileResultRequest::Stop).ok(); + self.result_tx.closed().await; + } + pub async fn shutdown(&self) { - // Note that this already stops processing any results from reconciles: so - // we do not expect that our [`TenantShard`] objects will reach a neat - // final state. + self.stop_reconciliations(StopReconciliationsReason::ShuttingDown) + .await; + + // Background tasks hold gate guards: this notifies them of the cancellation and + // waits for them all to complete. + tracing::info!("Shutting down: cancelling and waiting for background tasks to exit"); self.cancel.cancel(); - - // The cancellation tokens in [`crate::reconciler::Reconciler`] are children - // of our cancellation token, so we do not need to explicitly cancel each of - // them. - - // Background tasks and reconcilers hold gate guards: this waits for them all - // to complete. self.gate.close().await; } @@ -5972,4 +6096,89 @@ impl Service { Ok(()) } + + /// Updates scrubber metadata health check results. + pub(crate) async fn metadata_health_update( + &self, + update_req: MetadataHealthUpdateRequest, + ) -> Result<(), ApiError> { + let now = chrono::offset::Utc::now(); + let (healthy_records, unhealthy_records) = { + let locked = self.inner.read().unwrap(); + let healthy_records = update_req + .healthy_tenant_shards + .into_iter() + // Retain only health records associated with tenant shards managed by storage controller. + .filter(|tenant_shard_id| locked.tenants.contains_key(tenant_shard_id)) + .map(|tenant_shard_id| MetadataHealthPersistence::new(tenant_shard_id, true, now)) + .collect(); + let unhealthy_records = update_req + .unhealthy_tenant_shards + .into_iter() + .filter(|tenant_shard_id| locked.tenants.contains_key(tenant_shard_id)) + .map(|tenant_shard_id| MetadataHealthPersistence::new(tenant_shard_id, false, now)) + .collect(); + + (healthy_records, unhealthy_records) + }; + + self.persistence + .update_metadata_health_records(healthy_records, unhealthy_records, now) + .await?; + Ok(()) + } + + /// Lists the tenant shards that has unhealthy metadata status. + pub(crate) async fn metadata_health_list_unhealthy( + &self, + ) -> Result, ApiError> { + let result = self + .persistence + .list_unhealthy_metadata_health_records() + .await? + .iter() + .map(|p| p.get_tenant_shard_id().unwrap()) + .collect(); + + Ok(result) + } + + /// Lists the tenant shards that have not been scrubbed for some duration. + pub(crate) async fn metadata_health_list_outdated( + &self, + not_scrubbed_for: Duration, + ) -> Result, ApiError> { + let earlier = chrono::offset::Utc::now() - not_scrubbed_for; + let result = self + .persistence + .list_outdated_metadata_health_records(earlier) + .await? + .into_iter() + .map(|record| record.into()) + .collect(); + Ok(result) + } + + pub(crate) fn get_leadership_status(&self) -> LeadershipStatus { + self.inner.read().unwrap().get_leadership_status() + } + + pub(crate) async fn step_down(&self) -> GlobalObservedState { + tracing::info!("Received step down request from peer"); + + self.inner.write().unwrap().step_down(); + // TODO: would it make sense to have a time-out for this? + self.stop_reconciliations(StopReconciliationsReason::SteppingDown) + .await; + + let mut global_observed = GlobalObservedState::default(); + let locked = self.inner.read().unwrap(); + for (tid, tenant_shard) in locked.tenants.iter() { + global_observed + .0 + .insert(*tid, tenant_shard.observed.clone()); + } + + global_observed + } } diff --git a/storage_controller/src/service/chaos_injector.rs b/storage_controller/src/service/chaos_injector.rs new file mode 100644 index 0000000000..99961d691c --- /dev/null +++ b/storage_controller/src/service/chaos_injector.rs @@ -0,0 +1,71 @@ +use std::{sync::Arc, time::Duration}; + +use rand::seq::SliceRandom; +use rand::thread_rng; +use tokio_util::sync::CancellationToken; + +use super::Service; + +pub struct ChaosInjector { + service: Arc, + interval: Duration, +} + +impl ChaosInjector { + pub fn new(service: Arc, interval: Duration) -> Self { + Self { service, interval } + } + + pub async fn run(&mut self, cancel: CancellationToken) { + let mut interval = tokio::time::interval(self.interval); + + loop { + tokio::select! { + _ = interval.tick() => {} + _ = cancel.cancelled() => { + tracing::info!("Shutting down"); + return; + } + } + + self.inject_chaos().await; + + tracing::info!("Chaos iteration..."); + } + } + + async fn inject_chaos(&mut self) { + // Pick some shards to interfere with + let batch_size = 128; + let mut inner = self.service.inner.write().unwrap(); + let (nodes, tenants, scheduler) = inner.parts_mut(); + let tenant_ids = tenants.keys().cloned().collect::>(); + let victims = tenant_ids.choose_multiple(&mut thread_rng(), batch_size); + + for victim in victims { + let shard = tenants + .get_mut(victim) + .expect("Held lock between choosing ID and this get"); + + // Pick a secondary to promote + let Some(new_location) = shard + .intent + .get_secondary() + .choose(&mut thread_rng()) + .cloned() + else { + tracing::info!("Skipping shard {victim}: no secondary location, can't migrate"); + continue; + }; + + let Some(old_location) = *shard.intent.get_attached() else { + tracing::info!("Skipping shard {victim}: currently has no attached location"); + continue; + }; + + shard.intent.demote_attached(scheduler, old_location); + shard.intent.promote_attached(scheduler, new_location); + self.service.maybe_reconcile_shard(shard, nodes); + } + } +} diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index ee2ba6c4ee..e250f29f98 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -9,6 +9,7 @@ use crate::{ persistence::TenantShardPersistence, reconciler::ReconcileUnits, scheduler::{AffinityScore, MaySchedule, RefCountUpdate, ScheduleContext}, + service::ReconcileResultRequest, }; use pageserver_api::controller_api::{ NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, @@ -17,7 +18,7 @@ use pageserver_api::{ models::{LocationConfig, LocationConfigMode, TenantConfig}, shard::{ShardIdentity, TenantShardId}, }; -use serde::Serialize; +use serde::{Deserialize, Serialize}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::{instrument, Instrument}; @@ -283,7 +284,7 @@ impl Drop for IntentState { } } -#[derive(Default, Clone, Serialize)] +#[derive(Default, Clone, Serialize, Deserialize, Debug)] pub(crate) struct ObservedState { pub(crate) locations: HashMap, } @@ -297,7 +298,7 @@ pub(crate) struct ObservedState { /// what it is (e.g. we failed partway through configuring it) /// * Instance exists with conf==Some: this tells us what we last successfully configured on this node, /// and that configuration will still be present unless something external interfered. -#[derive(Clone, Serialize)] +#[derive(Clone, Serialize, Deserialize, Debug)] pub(crate) struct ObservedStateLocation { /// If None, it means we do not know the status of this shard's location on this node, but /// we know that we might have some state on this node. @@ -1059,7 +1060,7 @@ impl TenantShard { #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] pub(crate) fn spawn_reconciler( &mut self, - result_tx: &tokio::sync::mpsc::UnboundedSender, + result_tx: &tokio::sync::mpsc::UnboundedSender, pageservers: &Arc>, compute_hook: &Arc, service_config: &service::Config, @@ -1183,7 +1184,9 @@ impl TenantShard { pending_compute_notification: reconciler.compute_notify_failure, }; - result_tx.send(result).ok(); + result_tx + .send(ReconcileResultRequest::ReconcileResult(result)) + .ok(); } .instrument(reconciler_span), ); diff --git a/storage_scrubber/Cargo.toml b/storage_scrubber/Cargo.toml index 7d5b7d10b9..d19119990b 100644 --- a/storage_scrubber/Cargo.toml +++ b/storage_scrubber/Cargo.toml @@ -10,6 +10,7 @@ aws-smithy-async.workspace = true either.workspace = true tokio-rustls.workspace = true anyhow.workspace = true +git-version.workspace = true hex.workspace = true humantime.workspace = true thiserror.workspace = true diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index 421a848f67..35ec69fd50 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -40,6 +40,11 @@ impl TimelineAnalysis { garbage_keys: Vec::new(), } } + + /// Whether a timeline is healthy. + pub(crate) fn is_healthy(&self) -> bool { + self.errors.is_empty() && self.warnings.is_empty() + } } pub(crate) async fn branch_cleanup_and_check_errors( @@ -87,7 +92,8 @@ pub(crate) async fn branch_cleanup_and_check_errors( .push(format!("index_part.json version: {}", index_part.version())) } - if &index_part.version() != IndexPart::KNOWN_VERSIONS.last().unwrap() { + let mut newest_versions = IndexPart::KNOWN_VERSIONS.iter().rev().take(3); + if !newest_versions.any(|ip| ip == &index_part.version()) { info!( "index_part.json version is not latest: {}", index_part.version() @@ -166,8 +172,11 @@ pub(crate) async fn branch_cleanup_and_check_errors( } } BlobDataParseResult::Relic => {} - BlobDataParseResult::Incorrect(parse_errors) => result.errors.extend( - parse_errors + BlobDataParseResult::Incorrect { + errors, + s3_layers: _, + } => result.errors.extend( + errors .into_iter() .map(|error| format!("parse error: {error}")), ), @@ -294,7 +303,10 @@ pub(crate) enum BlobDataParseResult { }, /// The remains of a deleted Timeline (i.e. an initdb archive only) Relic, - Incorrect(Vec), + Incorrect { + errors: Vec, + s3_layers: HashSet<(LayerName, Generation)>, + }, } pub(crate) fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String> { @@ -437,7 +449,7 @@ pub(crate) async fn list_timeline_blobs( } Ok(S3TimelineBlobData { - blob_data: BlobDataParseResult::Incorrect(errors), + blob_data: BlobDataParseResult::Incorrect { errors, s3_layers }, unused_index_keys: index_part_keys, unknown_keys, }) diff --git a/storage_scrubber/src/find_large_objects.rs b/storage_scrubber/src/find_large_objects.rs index 2ef802229d..f5bb7e088a 100644 --- a/storage_scrubber/src/find_large_objects.rs +++ b/storage_scrubber/src/find_large_objects.rs @@ -1,10 +1,13 @@ +use std::pin::pin; + use futures::{StreamExt, TryStreamExt}; use pageserver::tenant::storage_layer::LayerName; +use remote_storage::ListingMode; use serde::{Deserialize, Serialize}; use crate::{ - checks::parse_layer_object_name, init_remote, list_objects_with_retries, - metadata_stream::stream_tenants, BucketConfig, NodeKind, + checks::parse_layer_object_name, init_remote_generic, metadata_stream::stream_tenants_generic, + stream_objects_with_retries, BucketConfig, NodeKind, }; #[derive(Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] @@ -47,45 +50,38 @@ pub async fn find_large_objects( ignore_deltas: bool, concurrency: usize, ) -> anyhow::Result { - let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?; - let tenants = std::pin::pin!(stream_tenants(&s3_client, &target)); + let (remote_client, target) = + init_remote_generic(bucket_config.clone(), NodeKind::Pageserver).await?; + let tenants = pin!(stream_tenants_generic(&remote_client, &target)); let objects_stream = tenants.map_ok(|tenant_shard_id| { let mut tenant_root = target.tenant_root(&tenant_shard_id); - let s3_client = s3_client.clone(); + let remote_client = remote_client.clone(); async move { let mut objects = Vec::new(); let mut total_objects_ctr = 0u64; // We want the objects and not just common prefixes tenant_root.delimiter.clear(); - let mut continuation_token = None; - loop { - let fetch_response = - list_objects_with_retries(&s3_client, &tenant_root, continuation_token.clone()) - .await?; - for obj in fetch_response.contents().iter().filter(|o| { - if let Some(obj_size) = o.size { - min_size as i64 <= obj_size - } else { - false - } - }) { - let key = obj.key().expect("couldn't get key").to_owned(); + let mut objects_stream = pin!(stream_objects_with_retries( + &remote_client, + ListingMode::NoDelimiter, + &tenant_root + )); + while let Some(listing) = objects_stream.next().await { + let listing = listing?; + for obj in listing.keys.iter().filter(|obj| min_size <= obj.size) { + let key = obj.key.to_string(); let kind = LargeObjectKind::from_key(&key); if ignore_deltas && kind == LargeObjectKind::DeltaLayer { continue; } objects.push(LargeObject { key, - size: obj.size.unwrap() as u64, + size: obj.size, kind, }) } - total_objects_ctr += fetch_response.contents().len() as u64; - match fetch_response.next_continuation_token { - Some(new_token) => continuation_token = Some(new_token), - None => break, - } + total_objects_ctr += listing.keys.len() as u64; } Ok((tenant_shard_id, objects, total_objects_ctr)) diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs index c7e21d7e26..d6a73bf366 100644 --- a/storage_scrubber/src/garbage.rs +++ b/storage_scrubber/src/garbage.rs @@ -5,12 +5,13 @@ use std::{ collections::{HashMap, HashSet}, sync::Arc, + time::Duration, }; use anyhow::Context; use futures_util::TryStreamExt; use pageserver_api::shard::TenantShardId; -use remote_storage::{GenericRemoteStorage, ListingMode, RemotePath}; +use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePath}; use serde::{Deserialize, Serialize}; use tokio_stream::StreamExt; use tokio_util::sync::CancellationToken; @@ -18,8 +19,8 @@ use utils::id::TenantId; use crate::{ cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData}, - init_remote, init_remote_generic, - metadata_stream::{stream_tenant_timelines, stream_tenants}, + init_remote_generic, list_objects_with_retries_generic, + metadata_stream::{stream_tenant_timelines_generic, stream_tenants_generic}, BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth, }; @@ -27,6 +28,11 @@ use crate::{ enum GarbageReason { DeletedInConsole, MissingInConsole, + + // The remaining data relates to a known deletion issue, and we're sure that purging this + // will not delete any real data, for example https://github.com/neondatabase/neon/pull/7928 where + // there is nothing in a tenant path apart from a heatmap file. + KnownBug, } #[derive(Serialize, Deserialize, Debug)] @@ -72,6 +78,15 @@ impl GarbageList { } } + /// If an entity has been identified as requiring purge due to a known bug, e.g. + /// a particular type of object left behind after an incomplete deletion. + fn append_buggy(&mut self, entity: GarbageEntity) { + self.items.push(GarbageItem { + entity, + reason: GarbageReason::KnownBug, + }); + } + /// Return true if appended, false if not. False means the result was not garbage. fn maybe_append(&mut self, entity: GarbageEntity, result: Option) -> bool where @@ -138,7 +153,7 @@ async fn find_garbage_inner( node_kind: NodeKind, ) -> anyhow::Result { // Construct clients for S3 and for Console API - let (s3_client, target) = init_remote(bucket_config.clone(), node_kind).await?; + let (remote_client, target) = init_remote_generic(bucket_config.clone(), node_kind).await?; let cloud_admin_api_client = Arc::new(CloudAdminApiClient::new(console_config)); // Build a set of console-known tenants, for quickly eliminating known-active tenants without having @@ -164,7 +179,7 @@ async fn find_garbage_inner( // Enumerate Tenants in S3, and check if each one exists in Console tracing::info!("Finding all tenants in bucket {}...", bucket_config.bucket); - let tenants = stream_tenants(&s3_client, &target); + let tenants = stream_tenants_generic(&remote_client, &target); let tenants_checked = tenants.map_ok(|t| { let api_client = cloud_admin_api_client.clone(); let console_cache = console_cache.clone(); @@ -219,6 +234,66 @@ async fn find_garbage_inner( assert!(project.tenant == tenant_shard_id.tenant_id); } + // Special case: If it's missing in console, check for known bugs that would enable us to conclusively + // identify it as purge-able anyway + if console_result.is_none() { + let timelines = + stream_tenant_timelines_generic(&remote_client, &target, tenant_shard_id) + .await? + .collect::>() + .await; + if timelines.is_empty() { + // No timelines, but a heatmap: the deletion bug where we deleted everything but heatmaps + let tenant_objects = list_objects_with_retries_generic( + &remote_client, + ListingMode::WithDelimiter, + &target.tenant_root(&tenant_shard_id), + ) + .await?; + let object = tenant_objects.keys.first().unwrap(); + if object.key.get_path().as_str().ends_with("heatmap-v1.json") { + tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)"); + garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id)); + continue; + } else { + tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key); + } + } else { + // A console-unknown tenant with timelines: check if these timelines only contain initdb.tar.zst, from the initial + // rollout of WAL DR in which we never deleted these. + let mut any_non_initdb = false; + + for timeline_r in timelines { + let timeline = timeline_r?; + let timeline_objects = list_objects_with_retries_generic( + &remote_client, + ListingMode::WithDelimiter, + &target.timeline_root(&timeline), + ) + .await?; + if !timeline_objects.prefixes.is_empty() { + // Sub-paths? Unexpected + any_non_initdb = true; + } else { + let object = timeline_objects.keys.first().unwrap(); + if object.key.get_path().as_str().ends_with("initdb.tar.zst") { + tracing::info!("Timeline {timeline} contains only initdb.tar.zst"); + } else { + any_non_initdb = true; + } + } + } + + if any_non_initdb { + tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains timelines, one or more of which are more than just initdb"); + } else { + tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains only timelines that only contain initdb"); + garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id)); + continue; + } + } + } + if garbage.maybe_append(GarbageEntity::Tenant(tenant_shard_id), console_result) { tracing::debug!("Tenant {tenant_shard_id} is garbage"); } else { @@ -256,7 +331,8 @@ async fn find_garbage_inner( // Construct a stream of all timelines within active tenants let active_tenants = tokio_stream::iter(active_tenants.iter().map(Ok)); - let timelines = active_tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, *t)); + let timelines = + active_tenants.map_ok(|t| stream_tenant_timelines_generic(&remote_client, &target, *t)); let timelines = timelines.try_buffer_unordered(S3_CONCURRENCY); let timelines = timelines.try_flatten(); @@ -324,7 +400,7 @@ impl std::fmt::Display for PurgeMode { pub async fn get_tenant_objects( s3_client: &GenericRemoteStorage, tenant_shard_id: TenantShardId, -) -> anyhow::Result> { +) -> anyhow::Result> { tracing::debug!("Listing objects in tenant {tenant_shard_id}"); let tenant_root = super::remote_tenant_path(&tenant_shard_id); @@ -345,13 +421,10 @@ pub async fn get_tenant_objects( pub async fn get_timeline_objects( s3_client: &GenericRemoteStorage, ttid: TenantShardTimelineId, -) -> anyhow::Result> { +) -> anyhow::Result> { tracing::debug!("Listing objects in timeline {ttid}"); let timeline_root = super::remote_timeline_path_id(&ttid); - // TODO: apply extra validation based on object modification time. Don't purge - // timelines whose index_part.json has been touched recently. - let list = s3_client .list( Some(&timeline_root), @@ -372,7 +445,7 @@ const MAX_KEYS_PER_DELETE: usize = 1000; /// `num_deleted` returns number of deleted keys. async fn do_delete( remote_client: &GenericRemoteStorage, - keys: &mut Vec, + keys: &mut Vec, dry_run: bool, drain: bool, progress_tracker: &mut DeletionProgressTracker, @@ -382,6 +455,8 @@ async fn do_delete( let request_keys = keys.split_off(keys.len() - (std::cmp::min(MAX_KEYS_PER_DELETE, keys.len()))); + let request_keys: Vec = request_keys.into_iter().map(|o| o.key).collect(); + let num_deleted = request_keys.len(); if dry_run { tracing::info!("Dry-run deletion of objects: "); @@ -420,6 +495,7 @@ impl DeletionProgressTracker { pub async fn purge_garbage( input_path: String, mode: PurgeMode, + min_age: Duration, dry_run: bool, ) -> anyhow::Result<()> { let list_bytes = tokio::fs::read(&input_path).await?; @@ -430,7 +506,7 @@ pub async fn purge_garbage( input_path ); - let remote_client = + let (remote_client, _target) = init_remote_generic(garbage_list.bucket_config.clone(), garbage_list.node_kind).await?; assert_eq!( @@ -457,6 +533,7 @@ pub async fn purge_garbage( .filter(|i| match (&mode, &i.reason) { (PurgeMode::DeletedAndMissing, _) => true, (PurgeMode::DeletedOnly, GarbageReason::DeletedInConsole) => true, + (PurgeMode::DeletedOnly, GarbageReason::KnownBug) => true, (PurgeMode::DeletedOnly, GarbageReason::MissingInConsole) => false, }); @@ -485,6 +562,37 @@ pub async fn purge_garbage( let mut progress_tracker = DeletionProgressTracker::default(); while let Some(result) = get_objects_results.next().await { let mut object_list = result?; + + // Extra safety check: even if a collection of objects is garbage, check max() of modification + // times before purging, so that if we incorrectly marked a live tenant as garbage then we would + // notice that its index has been written recently and would omit deleting it. + if object_list.is_empty() { + // Simplify subsequent code by ensuring list always has at least one item + // Usually, this only occurs if there is parallel deletions racing us, as there is no empty prefixes + continue; + } + let max_mtime = object_list.iter().map(|o| o.last_modified).max().unwrap(); + let age = max_mtime.elapsed(); + match age { + Err(_) => { + tracing::warn!("Bad last_modified time"); + continue; + } + Ok(a) if a < min_age => { + // Failed age check. This doesn't mean we did something wrong: a tenant might really be garbage and recently + // written, but out of an abundance of caution we still don't purge it. + tracing::info!( + "Skipping tenant with young objects {}..{}", + object_list.first().as_ref().unwrap().key, + object_list.last().as_ref().unwrap().key + ); + continue; + } + Ok(_) => { + // Passed age check + } + } + objects_to_delete.append(&mut object_list); if objects_to_delete.len() >= MAX_KEYS_PER_DELETE { do_delete( diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index 5c64e7e459..1fc94cc174 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -16,22 +16,26 @@ use std::sync::Arc; use std::time::Duration; use anyhow::{anyhow, Context}; +use aws_config::retry::{RetryConfigBuilder, RetryMode}; use aws_sdk_s3::config::Region; use aws_sdk_s3::error::DisplayErrorContext; use aws_sdk_s3::Client; use camino::{Utf8Path, Utf8PathBuf}; use clap::ValueEnum; +use futures::{Stream, StreamExt}; use pageserver::tenant::remote_timeline_client::{remote_tenant_path, remote_timeline_path}; use pageserver::tenant::TENANTS_SEGMENT_NAME; use pageserver_api::shard::TenantShardId; use remote_storage::{ - GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config, - DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, + GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorageConfig, RemoteStorageKind, + S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, }; use reqwest::Url; use serde::{Deserialize, Serialize}; +use storage_controller_client::control_api; use tokio::io::AsyncReadExt; +use tokio_util::sync::CancellationToken; use tracing::error; use tracing_appender::non_blocking::WorkerGuard; use tracing_subscriber::{fmt, prelude::*, EnvFilter}; @@ -253,6 +257,12 @@ pub struct ControllerClientConfig { pub controller_jwt: String, } +impl ControllerClientConfig { + pub fn build_client(self) -> control_api::Client { + control_api::Client::new(self.controller_api, Some(self.controller_jwt)) + } +} + pub struct ConsoleConfig { pub token: String, pub base_url: Url, @@ -305,8 +315,15 @@ pub fn init_logging(file_name: &str) -> Option { } async fn init_s3_client(bucket_region: Region) -> Client { + let mut retry_config_builder = RetryConfigBuilder::new(); + + retry_config_builder + .set_max_attempts(Some(3)) + .set_mode(Some(RetryMode::Adaptive)); + let config = aws_config::defaults(aws_config::BehaviorVersion::v2024_03_28()) .region(bucket_region) + .retry_config(retry_config_builder.build()) .load() .await; Client::new(&config) @@ -319,27 +336,35 @@ fn default_prefix_in_bucket(node_kind: NodeKind) -> &'static str { } } +fn make_root_target( + bucket_name: String, + prefix_in_bucket: String, + node_kind: NodeKind, +) -> RootTarget { + let s3_target = S3Target { + bucket_name, + prefix_in_bucket, + delimiter: "/".to_string(), + }; + match node_kind { + NodeKind::Pageserver => RootTarget::Pageserver(s3_target), + NodeKind::Safekeeper => RootTarget::Safekeeper(s3_target), + } +} + async fn init_remote( bucket_config: BucketConfig, node_kind: NodeKind, ) -> anyhow::Result<(Arc, RootTarget)> { let bucket_region = Region::new(bucket_config.region); - let delimiter = "/".to_string(); let s3_client = Arc::new(init_s3_client(bucket_region).await); let default_prefix = default_prefix_in_bucket(node_kind).to_string(); - let s3_root = match node_kind { - NodeKind::Pageserver => RootTarget::Pageserver(S3Target { - bucket_name: bucket_config.bucket, - prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or(default_prefix), - delimiter, - }), - NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target { - bucket_name: bucket_config.bucket, - prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or(default_prefix), - delimiter, - }), - }; + let s3_root = make_root_target( + bucket_config.bucket, + bucket_config.prefix_in_bucket.unwrap_or(default_prefix), + node_kind, + ); Ok((s3_client, s3_root)) } @@ -347,12 +372,12 @@ async fn init_remote( async fn init_remote_generic( bucket_config: BucketConfig, node_kind: NodeKind, -) -> anyhow::Result { +) -> anyhow::Result<(GenericRemoteStorage, RootTarget)> { let endpoint = env::var("AWS_ENDPOINT_URL").ok(); let default_prefix = default_prefix_in_bucket(node_kind).to_string(); let prefix_in_bucket = Some(bucket_config.prefix_in_bucket.unwrap_or(default_prefix)); let storage = S3Config { - bucket_name: bucket_config.bucket, + bucket_name: bucket_config.bucket.clone(), bucket_region: bucket_config.region, prefix_in_bucket, endpoint, @@ -366,7 +391,13 @@ async fn init_remote_generic( storage: RemoteStorageKind::AwsS3(storage), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }; - GenericRemoteStorage::from_config(&storage_config).await + + // We already pass the prefix to the remote client above + let prefix_in_root_target = String::new(); + let s3_root = make_root_target(bucket_config.bucket, prefix_in_root_target, node_kind); + + let client = GenericRemoteStorage::from_config(&storage_config).await?; + Ok((client, s3_root)) } async fn list_objects_with_retries( @@ -404,6 +435,84 @@ async fn list_objects_with_retries( Err(anyhow!("unreachable unless MAX_RETRIES==0")) } +/// Listing possibly large amounts of keys in a streaming fashion. +fn stream_objects_with_retries<'a>( + storage_client: &'a GenericRemoteStorage, + listing_mode: ListingMode, + s3_target: &'a S3Target, +) -> impl Stream> + 'a { + async_stream::stream! { + let mut trial = 0; + let cancel = CancellationToken::new(); + let prefix_str = &s3_target + .prefix_in_bucket + .strip_prefix("/") + .unwrap_or(&s3_target.prefix_in_bucket); + let prefix = RemotePath::from_string(prefix_str)?; + let mut list_stream = + storage_client.list_streaming(Some(&prefix), listing_mode, None, &cancel); + while let Some(res) = list_stream.next().await { + if let Err(err) = res { + let yield_err = if err.is_permanent() { + true + } else { + let backoff_time = 1 << trial.max(5); + tokio::time::sleep(Duration::from_secs(backoff_time)).await; + trial += 1; + trial == MAX_RETRIES - 1 + }; + if yield_err { + yield Err(err) + .with_context(|| format!("Failed to list objects {MAX_RETRIES} times")); + break; + } + } else { + trial = 0; + yield res.map_err(anyhow::Error::from); + } + } + } +} + +/// If you want to list a bounded amount of prefixes or keys. For larger numbers of keys/prefixes, +/// use [`stream_objects_with_retries`] instead. +async fn list_objects_with_retries_generic( + remote_client: &GenericRemoteStorage, + listing_mode: ListingMode, + s3_target: &S3Target, +) -> anyhow::Result { + let cancel = CancellationToken::new(); + let prefix_str = &s3_target + .prefix_in_bucket + .strip_prefix("/") + .unwrap_or(&s3_target.prefix_in_bucket); + let prefix = RemotePath::from_string(prefix_str)?; + for trial in 0..MAX_RETRIES { + match remote_client + .list(Some(&prefix), listing_mode, None, &cancel) + .await + { + Ok(response) => return Ok(response), + Err(e) => { + if trial == MAX_RETRIES - 1 { + return Err(e) + .with_context(|| format!("Failed to list objects {MAX_RETRIES} times")); + } + error!( + "list_objects_v2 query failed: bucket_name={}, prefix={}, delimiter={}, error={}", + s3_target.bucket_name, + s3_target.prefix_in_bucket, + s3_target.delimiter, + DisplayErrorContext(e), + ); + let backoff_time = 1 << trial.max(5); + tokio::time::sleep(Duration::from_secs(backoff_time)).await; + } + } + } + panic!("MAX_RETRIES is not allowed to be 0"); +} + async fn download_object_with_retries( s3_client: &Client, bucket_name: &str, diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index b3ed6f6451..cbc836755a 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -1,7 +1,8 @@ use anyhow::{anyhow, bail}; use camino::Utf8PathBuf; +use pageserver_api::controller_api::{MetadataHealthUpdateRequest, MetadataHealthUpdateResponse}; use pageserver_api::shard::TenantShardId; -use reqwest::Url; +use reqwest::{Method, Url}; use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; use storage_scrubber::pageserver_physical_gc::GcMode; use storage_scrubber::scan_pageserver_metadata::scan_metadata; @@ -16,6 +17,11 @@ use storage_scrubber::{ use clap::{Parser, Subcommand}; use utils::id::TenantId; +use utils::{project_build_tag, project_git_version}; + +project_git_version!(GIT_VERSION); +project_build_tag!(BUILD_TAG); + #[derive(Parser)] #[command(author, version, about, long_about = None)] #[command(arg_required_else_help(true))] @@ -50,6 +56,8 @@ enum Command { input_path: String, #[arg(short, long, default_value_t = PurgeMode::DeletedOnly)] mode: PurgeMode, + #[arg(long = "min-age")] + min_age: humantime::Duration, }, #[command(verbatim_doc_comment)] ScanMetadata { @@ -59,6 +67,8 @@ enum Command { json: bool, #[arg(long = "tenant-id", num_args = 0..)] tenant_ids: Vec, + #[arg(long = "post", default_value_t = false)] + post_to_storage_controller: bool, #[arg(long, default_value = None)] /// For safekeeper node_kind only, points to db with debug dump dump_db_connstr: Option, @@ -96,6 +106,8 @@ enum Command { async fn main() -> anyhow::Result<()> { let cli = Cli::parse(); + tracing::info!("version: {}, build_tag {}", GIT_VERSION, BUILD_TAG); + let bucket_config = BucketConfig::from_env()?; let command_log_name = match &cli.command { @@ -114,11 +126,20 @@ async fn main() -> anyhow::Result<()> { chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S") )); + let controller_client_conf = cli.controller_api.map(|controller_api| { + ControllerClientConfig { + controller_api, + // Default to no key: this is a convenience when working in a development environment + controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()), + } + }); + match cli.command { Command::ScanMetadata { json, tenant_ids, node_kind, + post_to_storage_controller, dump_db_connstr, dump_db_table, } => { @@ -157,6 +178,9 @@ async fn main() -> anyhow::Result<()> { } Ok(()) } else { + if controller_client_conf.is_none() && post_to_storage_controller { + return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run")); + } match scan_metadata(bucket_config.clone(), tenant_ids).await { Err(e) => { tracing::error!("Failed: {e}"); @@ -168,22 +192,37 @@ async fn main() -> anyhow::Result<()> { } else { println!("{}", summary.summary_string()); } + + if post_to_storage_controller { + if let Some(conf) = controller_client_conf { + let controller_client = conf.build_client(); + let body = summary.build_health_update_request(); + controller_client + .dispatch::( + Method::POST, + "control/v1/metadata_health/update".to_string(), + Some(body), + ) + .await?; + } + } + if summary.is_fatal() { - Err(anyhow::anyhow!("Fatal scrub errors detected")) + tracing::error!("Fatal scrub errors detected"); } else if summary.is_empty() { // Strictly speaking an empty bucket is a valid bucket, but if someone ran the // scrubber they were likely expecting to scan something, and if we see no timelines // at all then it's likely due to some configuration issues like a bad prefix - Err(anyhow::anyhow!( + tracing::error!( "No timelines found in bucket {} prefix {}", bucket_config.bucket, bucket_config .prefix_in_bucket .unwrap_or("".to_string()) - )) - } else { - Ok(()) + ); } + + Ok(()) } } } @@ -196,9 +235,11 @@ async fn main() -> anyhow::Result<()> { let console_config = ConsoleConfig::from_env()?; find_garbage(bucket_config, console_config, depth, node_kind, output_path).await } - Command::PurgeGarbage { input_path, mode } => { - purge_garbage(input_path, mode, !cli.delete).await - } + Command::PurgeGarbage { + input_path, + mode, + min_age, + } => purge_garbage(input_path, mode, min_age.into(), !cli.delete).await, Command::TenantSnapshot { tenant_id, output_path, @@ -213,14 +254,6 @@ async fn main() -> anyhow::Result<()> { min_age, mode, } => { - let controller_client_conf = cli.controller_api.map(|controller_api| { - ControllerClientConfig { - controller_api, - // Default to no key: this is a convenience when working in a development environment - controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()), - } - }); - match (&controller_client_conf, mode) { (Some(_), _) => { // Any mode may run when controller API is set diff --git a/storage_scrubber/src/metadata_stream.rs b/storage_scrubber/src/metadata_stream.rs index c05874f556..54812ffc94 100644 --- a/storage_scrubber/src/metadata_stream.rs +++ b/storage_scrubber/src/metadata_stream.rs @@ -1,12 +1,41 @@ -use anyhow::Context; +use std::str::FromStr; + +use anyhow::{anyhow, Context}; use async_stream::{stream, try_stream}; use aws_sdk_s3::{types::ObjectIdentifier, Client}; +use futures::StreamExt; +use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePath}; use tokio_stream::Stream; -use crate::{list_objects_with_retries, RootTarget, S3Target, TenantShardTimelineId}; +use crate::{ + list_objects_with_retries, stream_objects_with_retries, RootTarget, S3Target, + TenantShardTimelineId, +}; use pageserver_api::shard::TenantShardId; use utils::id::{TenantId, TimelineId}; +/// Given a remote storage and a target, output a stream of TenantIds discovered via listing prefixes +pub fn stream_tenants_generic<'a>( + remote_client: &'a GenericRemoteStorage, + target: &'a RootTarget, +) -> impl Stream> + 'a { + try_stream! { + let tenants_target = target.tenants_root(); + let mut tenants_stream = + std::pin::pin!(stream_objects_with_retries(remote_client, ListingMode::WithDelimiter, &tenants_target)); + while let Some(chunk) = tenants_stream.next().await { + let chunk = chunk?; + let entry_ids = chunk.prefixes.iter() + .map(|prefix| prefix.get_path().file_name().ok_or_else(|| anyhow!("no final component in path '{prefix}'"))); + for dir_name_res in entry_ids { + let dir_name = dir_name_res?; + let id = TenantShardId::from_str(dir_name)?; + yield id; + } + } + } +} + /// Given an S3 bucket, output a stream of TenantIds discovered via ListObjectsv2 pub fn stream_tenants<'a>( s3_client: &'a Client, @@ -160,6 +189,63 @@ pub async fn stream_tenant_timelines<'a>( }) } +/// Given a `TenantShardId`, output a stream of the timelines within that tenant, discovered +/// using a listing. The listing is done before the stream is built, so that this +/// function can be used to generate concurrency on a stream using buffer_unordered. +pub async fn stream_tenant_timelines_generic<'a>( + remote_client: &'a GenericRemoteStorage, + target: &'a RootTarget, + tenant: TenantShardId, +) -> anyhow::Result> + 'a> { + let mut timeline_ids: Vec> = Vec::new(); + let timelines_target = target.timelines_root(&tenant); + + let mut objects_stream = std::pin::pin!(stream_objects_with_retries( + remote_client, + ListingMode::WithDelimiter, + &timelines_target + )); + loop { + tracing::debug!("Listing in {tenant}"); + let fetch_response = match objects_stream.next().await { + None => break, + Some(Err(e)) => { + timeline_ids.push(Err(e)); + break; + } + Some(Ok(r)) => r, + }; + + let new_entry_ids = fetch_response + .prefixes + .iter() + .filter_map(|prefix| -> Option<&str> { + prefix + .get_path() + .as_str() + .strip_prefix(&timelines_target.prefix_in_bucket)? + .strip_suffix('/') + }) + .map(|entry_id_str| { + entry_id_str + .parse::() + .with_context(|| format!("Incorrect entry id str: {entry_id_str}")) + }); + + for i in new_entry_ids { + timeline_ids.push(i); + } + } + + tracing::debug!("Yielding for {}", tenant); + Ok(stream! { + for i in timeline_ids { + let id = i?; + yield Ok(TenantShardTimelineId::new(tenant, id)); + } + }) +} + pub(crate) fn stream_listing<'a>( s3_client: &'a Client, target: &'a S3Target, @@ -190,3 +276,33 @@ pub(crate) fn stream_listing<'a>( } } } + +pub(crate) fn stream_listing_generic<'a>( + remote_client: &'a GenericRemoteStorage, + target: &'a S3Target, +) -> impl Stream)>> + 'a { + let listing_mode = if target.delimiter.is_empty() { + ListingMode::NoDelimiter + } else { + ListingMode::WithDelimiter + }; + try_stream! { + let mut objects_stream = std::pin::pin!(stream_objects_with_retries( + remote_client, + listing_mode, + target, + )); + while let Some(list) = objects_stream.next().await { + let list = list?; + if target.delimiter.is_empty() { + for key in list.keys { + yield (key.key.clone(), Some(key)); + } + } else { + for key in list.prefixes { + yield (key, None); + } + } + } + } +} diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index e977fd49f7..ff230feae3 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -389,10 +389,13 @@ async fn gc_ancestor( // Post-deletion tenant location: don't try and GC it. continue; } - BlobDataParseResult::Incorrect(reasons) => { + BlobDataParseResult::Incorrect { + errors, + s3_layers: _, // TODO(yuchen): could still check references to these s3 layers? + } => { // Our primary purpose isn't to report on bad data, but log this rather than skipping silently tracing::warn!( - "Skipping ancestor GC for timeline {ttid}, bad metadata: {reasons:?}" + "Skipping ancestor GC for timeline {ttid}, bad metadata: {errors:?}" ); continue; } @@ -518,9 +521,12 @@ pub async fn pageserver_physical_gc( // Post-deletion tenant location: don't try and GC it. return Ok(summary); } - BlobDataParseResult::Incorrect(reasons) => { + BlobDataParseResult::Incorrect { + errors, + s3_layers: _, + } => { // Our primary purpose isn't to report on bad data, but log this rather than skipping silently - tracing::warn!("Skipping timeline {ttid}, bad metadata: {reasons:?}"); + tracing::warn!("Skipping timeline {ttid}, bad metadata: {errors:?}"); return Ok(summary); } }; @@ -567,13 +573,7 @@ pub async fn pageserver_physical_gc( } // Execute cross-shard GC, using the accumulator's full view of all the shards built in the per-shard GC - let Some(controller_client) = controller_client_conf.as_ref().map(|c| { - let ControllerClientConfig { - controller_api, - controller_jwt, - } = c; - control_api::Client::new(controller_api.clone(), Some(controller_jwt.clone())) - }) else { + let Some(controller_client) = controller_client_conf.map(|c| c.build_client()) else { tracing::info!("Skipping ancestor layer GC, because no `--controller-api` was specified"); return Ok(summary); }; diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs index fbd60f93bb..b9630056e1 100644 --- a/storage_scrubber/src/scan_pageserver_metadata.rs +++ b/storage_scrubber/src/scan_pageserver_metadata.rs @@ -9,12 +9,13 @@ use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimeline use aws_sdk_s3::Client; use futures_util::{StreamExt, TryStreamExt}; use pageserver::tenant::remote_timeline_client::remote_layer_path; +use pageserver_api::controller_api::MetadataHealthUpdateRequest; use pageserver_api::shard::TenantShardId; use serde::Serialize; use utils::id::TenantId; use utils::shard::ShardCount; -#[derive(Serialize)] +#[derive(Serialize, Default)] pub struct MetadataSummary { tenant_count: usize, timeline_count: usize, @@ -23,19 +24,16 @@ pub struct MetadataSummary { with_warnings: HashSet, with_orphans: HashSet, indices_by_version: HashMap, + + #[serde(skip)] + pub(crate) healthy_tenant_shards: HashSet, + #[serde(skip)] + pub(crate) unhealthy_tenant_shards: HashSet, } impl MetadataSummary { fn new() -> Self { - Self { - tenant_count: 0, - timeline_count: 0, - timeline_shard_count: 0, - with_errors: HashSet::new(), - with_warnings: HashSet::new(), - with_orphans: HashSet::new(), - indices_by_version: HashMap::new(), - } + Self::default() } fn update_data(&mut self, data: &S3TimelineBlobData) { @@ -54,6 +52,13 @@ impl MetadataSummary { } fn update_analysis(&mut self, id: &TenantShardTimelineId, analysis: &TimelineAnalysis) { + if analysis.is_healthy() { + self.healthy_tenant_shards.insert(id.tenant_shard_id); + } else { + self.healthy_tenant_shards.remove(&id.tenant_shard_id); + self.unhealthy_tenant_shards.insert(id.tenant_shard_id); + } + if !analysis.errors.is_empty() { self.with_errors.insert(*id); } @@ -101,6 +106,13 @@ Index versions: {version_summary} pub fn is_empty(&self) -> bool { self.timeline_shard_count == 0 } + + pub fn build_health_update_request(&self) -> MetadataHealthUpdateRequest { + MetadataHealthUpdateRequest { + healthy_tenant_shards: self.healthy_tenant_shards.clone(), + unhealthy_tenant_shards: self.unhealthy_tenant_shards.clone(), + } + } } /// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics. @@ -278,13 +290,21 @@ pub async fn scan_metadata( } } - if let BlobDataParseResult::Parsed { - index_part: _index_part, - index_part_generation: _index_part_generation, - s3_layers, - } = &data.blob_data - { - tenant_objects.push(ttid, s3_layers.clone()); + match &data.blob_data { + BlobDataParseResult::Parsed { + index_part: _index_part, + index_part_generation: _index_part_generation, + s3_layers, + } => { + tenant_objects.push(ttid, s3_layers.clone()); + } + BlobDataParseResult::Relic => (), + BlobDataParseResult::Incorrect { + errors: _, + s3_layers, + } => { + tenant_objects.push(ttid, s3_layers.clone()); + } } tenant_timeline_results.push((ttid, data)); } diff --git a/storage_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs index 553adf8f46..08a4541c5c 100644 --- a/storage_scrubber/src/scan_safekeeper_metadata.rs +++ b/storage_scrubber/src/scan_safekeeper_metadata.rs @@ -1,10 +1,10 @@ use std::{collections::HashSet, str::FromStr, sync::Arc}; -use aws_sdk_s3::Client; use futures::stream::{StreamExt, TryStreamExt}; use once_cell::sync::OnceCell; use pageserver_api::shard::TenantShardId; use postgres_ffi::{XLogFileName, PG_TLI}; +use remote_storage::GenericRemoteStorage; use serde::Serialize; use tokio_postgres::types::PgLsn; use tracing::{error, info, trace}; @@ -14,8 +14,9 @@ use utils::{ }; use crate::{ - cloud_admin_api::CloudAdminApiClient, init_remote, metadata_stream::stream_listing, - BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId, + cloud_admin_api::CloudAdminApiClient, init_remote_generic, + metadata_stream::stream_listing_generic, BucketConfig, ConsoleConfig, NodeKind, RootTarget, + TenantShardTimelineId, }; /// Generally we should ask safekeepers, but so far we use everywhere default 16MB. @@ -106,7 +107,7 @@ pub async fn scan_safekeeper_metadata( let timelines = client.query(&query, &[]).await?; info!("loaded {} timelines", timelines.len()); - let (s3_client, target) = init_remote(bucket_config, NodeKind::Safekeeper).await?; + let (remote_client, target) = init_remote_generic(bucket_config, NodeKind::Safekeeper).await?; let console_config = ConsoleConfig::from_env()?; let cloud_admin_api_client = CloudAdminApiClient::new(console_config); @@ -119,7 +120,7 @@ pub async fn scan_safekeeper_metadata( let backup_lsn: Lsn = Lsn(u64::from(backup_lsn_pg)); let ttid = TenantTimelineId::new(tenant_id, timeline_id); check_timeline( - &s3_client, + &remote_client, &target, &cloud_admin_api_client, ttid, @@ -156,7 +157,7 @@ struct TimelineCheckResult { /// errors are logged to stderr; returns Ok(true) if timeline is consistent, /// Ok(false) if not, Err if failed to check. async fn check_timeline( - s3_client: &Client, + remote_client: &GenericRemoteStorage, root: &RootTarget, api_client: &CloudAdminApiClient, ttid: TenantTimelineId, @@ -187,12 +188,13 @@ async fn check_timeline( // we need files, so unset it. timeline_dir_target.delimiter = String::new(); - let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target)); + let mut stream = std::pin::pin!(stream_listing_generic(remote_client, &timeline_dir_target)); while let Some(obj) = stream.next().await { - let obj = obj?; - let key = obj.key(); + let (key, _obj) = obj?; let seg_name = key + .get_path() + .as_str() .strip_prefix(&timeline_dir_target.prefix_in_bucket) .expect("failed to extract segment name"); expected_segfiles.remove(seg_name); diff --git a/storage_scrubber/src/tenant_snapshot.rs b/storage_scrubber/src/tenant_snapshot.rs index 5a75f8d40e..1866e6ec80 100644 --- a/storage_scrubber/src/tenant_snapshot.rs +++ b/storage_scrubber/src/tenant_snapshot.rs @@ -269,7 +269,7 @@ impl SnapshotDownloader { .context("Downloading timeline")?; } BlobDataParseResult::Relic => {} - BlobDataParseResult::Incorrect(_) => { + BlobDataParseResult::Incorrect { .. } => { tracing::error!("Bad metadata in timeline {ttid}"); } }; diff --git a/test_runner/README.md b/test_runner/README.md index 7d95634ea8..e2f26a19ce 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -81,7 +81,7 @@ should go. Useful parameters and commands: `--preserve-database-files` to preserve pageserver (layer) and safekeer (segment) timeline files on disk -after running a test suite. Such files might be large, so removed by default; but might be useful for debugging or creation of svg images with layer file contents. +after running a test suite. Such files might be large, so removed by default; but might be useful for debugging or creation of svg images with layer file contents. If `NeonEnvBuilder#preserve_database_files` set to `True` for a particular test, the whole `repo` directory will be attached to Allure report (thus uploaded to S3) as `everything.tar.zst` for this test. Let stdout, stderr and `INFO` log messages go to the terminal instead of capturing them: `./scripts/pytest -s --log-cli-level=INFO ...` diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index 038f557cc8..0c36cd6ef7 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -222,6 +222,8 @@ class NeonBenchmarker: function by the zenbenchmark fixture """ + PROPERTY_PREFIX = "neon_benchmarker_" + def __init__(self, property_recorder: Callable[[str, object], None]): # property recorder here is a pytest fixture provided by junitxml module # https://docs.pytest.org/en/6.2.x/reference.html#pytest.junitxml.record_property @@ -238,7 +240,7 @@ class NeonBenchmarker: Record a benchmark result. """ # just to namespace the value - name = f"neon_benchmarker_{metric_name}" + name = f"{self.PROPERTY_PREFIX}_{metric_name}" self.property_recorder( name, { @@ -249,6 +251,18 @@ class NeonBenchmarker: }, ) + @classmethod + def records( + cls, user_properties: list[tuple[str, object]] + ) -> Iterator[tuple[str, dict[str, object]]]: + """ + Yield all records related to benchmarks + """ + for property_name, recorded_property in user_properties: + if property_name.startswith(cls.PROPERTY_PREFIX): + assert isinstance(recorded_property, dict) + yield recorded_property["name"], recorded_property + @contextmanager def record_duration(self, metric_name: str) -> Iterator[None]: """ @@ -425,10 +439,11 @@ def zenbenchmark( yield benchmarker results = {} - for _, recorded_property in request.node.user_properties: + for _, recorded_property in NeonBenchmarker.records(request.node.user_properties): name = recorded_property["name"] value = str(recorded_property["value"]) - if (unit := recorded_property["unit"].strip()) != "": + unit = str(recorded_property["unit"]).strip() + if unit != "": value += f" {unit}" results[name] = value @@ -477,7 +492,7 @@ def pytest_terminal_summary( for test_report in terminalreporter.stats.get("passed", []): result_entry = [] - for _, recorded_property in test_report.user_properties: + for _, recorded_property in NeonBenchmarker.records(test_report.user_properties): if not is_header_printed: terminalreporter.section("Benchmark results", "-") is_header_printed = True diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 4836d42db5..cda70be8da 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -133,6 +133,8 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = ( *histogram("pageserver_remote_operation_seconds"), *histogram("pageserver_io_operations_seconds"), "pageserver_tenant_states_count", + "pageserver_circuit_breaker_broken_total", + "pageserver_circuit_breaker_unbroken_total", ) PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( @@ -148,6 +150,7 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_pitr_history_size", "pageserver_layer_bytes", "pageserver_layer_count", + "pageserver_visible_physical_size", "pageserver_storage_operations_seconds_count_total", "pageserver_storage_operations_seconds_sum_total", "pageserver_evictions_total", diff --git a/test_runner/fixtures/neon_api.py b/test_runner/fixtures/neon_api.py index 658ed119a1..0636cfad06 100644 --- a/test_runner/fixtures/neon_api.py +++ b/test_runner/fixtures/neon_api.py @@ -285,9 +285,9 @@ class NeonApiEndpoint: self.project_id = project_id eps = neon_api.get_endpoints(project_id)["endpoints"] self.endpoint_id = eps[0]["id"] - self.connstr = neon_api.get_connection_uri(project_id, endpoint_id=self.endpoint_id)[ - "uri" - ] + self.connstr = neon_api.get_connection_uri( + project_id, endpoint_id=self.endpoint_id, pooled=False + )["uri"] pw = self.connstr.split("@")[0].split(":")[-1] self.pgbench_env = { "PGHOST": eps[0]["host"], diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 9e39457c06..c6f4404784 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -449,6 +449,7 @@ class TokenScope(str, Enum): GENERATIONS_API = "generations_api" SAFEKEEPER_DATA = "safekeeperdata" TENANT = "tenant" + SCRUBBER = "scrubber" class NeonEnvBuilder: @@ -523,7 +524,7 @@ class NeonEnvBuilder: self.preserve_database_files = preserve_database_files self.initial_tenant = initial_tenant or TenantId.generate() self.initial_timeline = initial_timeline or TimelineId.generate() - self.scrub_on_exit = False + self.enable_scrub_on_exit = True self.test_output_dir = test_output_dir self.test_overlay_dir = test_overlay_dir self.overlay_mounts_created_by_us: List[Tuple[str, Path]] = [] @@ -542,21 +543,6 @@ class NeonEnvBuilder: f"Overriding pageserver default compaction algorithm to {self.pageserver_default_tenant_config_compaction_algorithm}" ) - self.pageserver_get_vectored_impl: Optional[str] = None - if os.getenv("PAGESERVER_GET_VECTORED_IMPL", "") == "vectored": - self.pageserver_get_vectored_impl = "vectored" - log.debug('Overriding pageserver get_vectored_impl config to "vectored"') - - self.pageserver_get_impl: Optional[str] = None - if os.getenv("PAGESERVER_GET_IMPL", "") == "vectored": - self.pageserver_get_impl = "vectored" - log.debug('Overriding pageserver get_impl config to "vectored"') - - self.pageserver_validate_vectored_get: Optional[bool] = None - if (validate := os.getenv("PAGESERVER_VALIDATE_VEC_GET")) is not None: - self.pageserver_validate_vectored_get = bool(validate) - log.debug(f'Overriding pageserver validate_vectored_get config to "{validate}"') - self.pageserver_aux_file_policy = pageserver_aux_file_policy self.safekeeper_extra_opts = safekeeper_extra_opts @@ -852,6 +838,13 @@ class NeonEnvBuilder: ) ident_state_dir.rmdir() # should be empty since we moved `upper` out + def disable_scrub_on_exit(self): + """ + Some tests intentionally leave the remote storage contents empty or corrupt, + so it doesn't make sense to do the usual scrub at the end of the test. + """ + self.enable_scrub_on_exit = False + def overlay_cleanup_teardown(self): """ Unmount the overlayfs mounts created by `self.overlay_mount()`. @@ -877,23 +870,6 @@ class NeonEnvBuilder: # assert all overlayfs mounts in our test directory are gone assert [] == list(overlayfs.iter_mounts_beneath(self.test_overlay_dir)) - def enable_scrub_on_exit(self): - """ - Call this if you would like the fixture to automatically run - storage_scrubber at the end of the test, as a bidirectional test - that the scrubber is working properly, and that the code within - the test didn't produce any invalid remote state. - """ - - if not isinstance(self.pageserver_remote_storage, S3Storage): - # The scrubber can't talk to e.g. LocalFS -- it needs - # an HTTP endpoint (mock is fine) to connect to. - raise RuntimeError( - "Cannot scrub with remote_storage={self.pageserver_remote_storage}, require an S3 endpoint" - ) - - self.scrub_on_exit = True - def enable_pageserver_remote_storage( self, remote_storage_kind: RemoteStorageKind, @@ -995,9 +971,17 @@ class NeonEnvBuilder: ) cleanup_error = None - if self.scrub_on_exit: + # If we are running with S3Storage (required by the scrubber), check that whatever the test + # did does not generate any corruption + if ( + isinstance(self.env.pageserver_remote_storage, S3Storage) + and self.enable_scrub_on_exit + ): try: - self.env.storage_scrubber.scan_metadata() + healthy, _ = self.env.storage_scrubber.scan_metadata() + if not healthy: + e = Exception("Remote storage metadata corrupted") + cleanup_error = e except Exception as e: log.error(f"Error during remote storage scrub: {e}") cleanup_error = e @@ -1162,12 +1146,6 @@ class NeonEnv: } if self.pageserver_virtual_file_io_engine is not None: ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine - if config.pageserver_get_vectored_impl is not None: - ps_cfg["get_vectored_impl"] = config.pageserver_get_vectored_impl - if config.pageserver_get_impl is not None: - ps_cfg["get_impl"] = config.pageserver_get_impl - if config.pageserver_validate_vectored_get is not None: - ps_cfg["validate_vectored_get"] = config.pageserver_validate_vectored_get if config.pageserver_default_tenant_config_compaction_algorithm is not None: tenant_config = ps_cfg.setdefault("tenant_config", {}) tenant_config[ @@ -1420,7 +1398,7 @@ def _shared_simple_env( pg_distrib_dir=pg_distrib_dir, pg_version=pg_version, run_id=run_id, - preserve_database_files=pytestconfig.getoption("--preserve-database-files"), + preserve_database_files=cast(bool, pytestconfig.getoption("--preserve-database-files")), test_name=request.node.name, test_output_dir=test_output_dir, pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, @@ -1467,6 +1445,7 @@ def neon_env_builder( pageserver_virtual_file_io_engine: str, pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]], pageserver_aux_file_policy: Optional[AuxFileStore], + record_property: Callable[[str, object], None], ) -> Iterator[NeonEnvBuilder]: """ Fixture to create a Neon environment for test. @@ -1495,7 +1474,7 @@ def neon_env_builder( pg_version=pg_version, broker=default_broker, run_id=run_id, - preserve_database_files=pytestconfig.getoption("--preserve-database-files"), + preserve_database_files=cast(bool, pytestconfig.getoption("--preserve-database-files")), pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, test_name=request.node.name, test_output_dir=test_output_dir, @@ -1504,6 +1483,9 @@ def neon_env_builder( pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, ) as builder: yield builder + # Propogate `preserve_database_files` to make it possible to use in other fixtures, + # like `test_output_dir` fixture for attaching all database files to Allure report. + record_property("preserve_database_files", builder.preserve_database_files) @dataclass @@ -1964,11 +1946,15 @@ class NeonCli(AbstractNeonCli): remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None, allow_multiple=False, + basebackup_request_tries: Optional[int] = None, ) -> "subprocess.CompletedProcess[str]": args = [ "endpoint", "start", ] + extra_env_vars = {} + if basebackup_request_tries is not None: + extra_env_vars["NEON_COMPUTE_TESTING_BASEBACKUP_TRIES"] = str(basebackup_request_tries) if remote_ext_config is not None: args.extend(["--remote-ext-config", remote_ext_config]) @@ -1981,7 +1967,7 @@ class NeonCli(AbstractNeonCli): if allow_multiple: args.extend(["--allow-multiple"]) - res = self.raw_cli(args) + res = self.raw_cli(args, extra_env_vars) res.check_returncode() return res @@ -2148,6 +2134,23 @@ class StorageControllerApiException(Exception): self.status_code = status_code +# See libs/pageserver_api/src/controller_api.rs +# for the rust definitions of the enums below +# TODO: Replace with `StrEnum` when we upgrade to python 3.11 +class PageserverAvailability(str, Enum): + ACTIVE = "Active" + UNAVAILABLE = "Unavailable" + OFFLINE = "Offline" + + +class PageserverSchedulingPolicy(str, Enum): + ACTIVE = "Active" + DRAINING = "Draining" + FILLING = "Filling" + PAUSE = "Pause" + PAUSE_FOR_RESTART = "PauseForRestart" + + class NeonStorageController(MetricsGetter, LogUtils): def __init__(self, env: NeonEnv, auth_enabled: bool): self.env = env @@ -2531,26 +2534,54 @@ class NeonStorageController(MetricsGetter, LogUtils): ) log.info("storage controller passed consistency check") + def node_registered(self, node_id: int) -> bool: + """ + Returns true if the storage controller can confirm + it knows of pageserver with 'node_id' + """ + try: + self.node_status(node_id) + except StorageControllerApiException as e: + if e.status_code == 404: + return False + else: + raise e + + return True + def poll_node_status( - self, node_id: int, desired_scheduling_policy: str, max_attempts: int, backoff: int + self, + node_id: int, + desired_availability: Optional[PageserverAvailability], + desired_scheduling_policy: Optional[PageserverSchedulingPolicy], + max_attempts: int, + backoff: int, ): """ - Poll the node status until it reaches 'desired_scheduling_policy' or 'max_attempts' have been exhausted + Poll the node status until it reaches 'desired_scheduling_policy' and 'desired_availability' + or 'max_attempts' have been exhausted """ - log.info(f"Polling {node_id} for {desired_scheduling_policy} scheduling policy") + log.info( + f"Polling {node_id} for {desired_scheduling_policy} scheduling policy and {desired_availability} availability" + ) while max_attempts > 0: try: status = self.node_status(node_id) policy = status["scheduling"] - if policy == desired_scheduling_policy: + availability = status["availability"] + if (desired_scheduling_policy is None or policy == desired_scheduling_policy) and ( + desired_availability is None or availability == desired_availability + ): return else: max_attempts -= 1 - log.info(f"Status call returned {policy=} ({max_attempts} attempts left)") + log.info( + f"Status call returned {policy=} {availability=} ({max_attempts} attempts left)" + ) if max_attempts == 0: raise AssertionError( - f"Status for {node_id=} did not reach {desired_scheduling_policy=}" + f"Status for {node_id=} did not reach {desired_scheduling_policy=} {desired_availability=}" ) time.sleep(backoff) @@ -2563,6 +2594,62 @@ class NeonStorageController(MetricsGetter, LogUtils): time.sleep(backoff) + def metadata_health_update(self, healthy: List[TenantShardId], unhealthy: List[TenantShardId]): + body: Dict[str, Any] = { + "healthy_tenant_shards": [str(t) for t in healthy], + "unhealthy_tenant_shards": [str(t) for t in unhealthy], + } + + self.request( + "POST", + f"{self.env.storage_controller_api}/control/v1/metadata_health/update", + json=body, + headers=self.headers(TokenScope.SCRUBBER), + ) + + def metadata_health_list_unhealthy(self): + response = self.request( + "GET", + f"{self.env.storage_controller_api}/control/v1/metadata_health/unhealthy", + headers=self.headers(TokenScope.ADMIN), + ) + return response.json() + + def metadata_health_list_outdated(self, duration: str): + body: Dict[str, Any] = {"not_scrubbed_for": duration} + + response = self.request( + "POST", + f"{self.env.storage_controller_api}/control/v1/metadata_health/outdated", + json=body, + headers=self.headers(TokenScope.ADMIN), + ) + return response.json() + + def metadata_health_is_healthy(self, outdated_duration: str = "1h") -> bool: + """Metadata is healthy if there is no unhealthy or outdated health records.""" + + unhealthy = self.metadata_health_list_unhealthy() + outdated = self.metadata_health_list_outdated(outdated_duration) + + healthy = ( + len(unhealthy["unhealthy_tenant_shards"]) == 0 and len(outdated["health_records"]) == 0 + ) + if not healthy: + log.info(f"{unhealthy=}, {outdated=}") + return healthy + + def step_down(self): + log.info("Asking storage controller to step down") + response = self.request( + "PUT", + f"{self.env.storage_controller_api}/control/v1/step_down", + headers=self.headers(TokenScope.ADMIN), + ) + + response.raise_for_status() + return response.json() + def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]): if isinstance(config_strings, tuple): pairs = [config_strings] @@ -2694,6 +2781,14 @@ class NeonPageserver(PgProtocol, LogUtils): self.id, extra_env_vars=extra_env_vars, timeout_in_seconds=timeout_in_seconds ) self.running = True + + if self.env.storage_controller.running and self.env.storage_controller.node_registered( + self.id + ): + self.env.storage_controller.poll_node_status( + self.id, PageserverAvailability.ACTIVE, None, max_attempts=20, backoff=1 + ) + return self def stop(self, immediate: bool = False) -> "NeonPageserver": @@ -3724,6 +3819,7 @@ class Endpoint(PgProtocol, LogUtils): pageserver_id: Optional[int] = None, safekeepers: Optional[List[int]] = None, allow_multiple: bool = False, + basebackup_request_tries: Optional[int] = None, ) -> "Endpoint": """ Start the Postgres instance. @@ -3745,6 +3841,7 @@ class Endpoint(PgProtocol, LogUtils): remote_ext_config=remote_ext_config, pageserver_id=pageserver_id, allow_multiple=allow_multiple, + basebackup_request_tries=basebackup_request_tries, ) self._running.release(1) @@ -3891,6 +3988,7 @@ class Endpoint(PgProtocol, LogUtils): remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None, allow_multiple=False, + basebackup_request_tries: Optional[int] = None, ) -> "Endpoint": """ Create an endpoint, apply config, and start Postgres. @@ -3911,6 +4009,7 @@ class Endpoint(PgProtocol, LogUtils): remote_ext_config=remote_ext_config, pageserver_id=pageserver_id, allow_multiple=allow_multiple, + basebackup_request_tries=basebackup_request_tries, ) log.info(f"Postgres startup took {time.time() - started_at} seconds") @@ -3954,6 +4053,7 @@ class EndpointFactory: config_lines: Optional[List[str]] = None, remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None, + basebackup_request_tries: Optional[int] = None, ) -> Endpoint: ep = Endpoint( self.env, @@ -3972,6 +4072,7 @@ class EndpointFactory: lsn=lsn, remote_ext_config=remote_ext_config, pageserver_id=pageserver_id, + basebackup_request_tries=basebackup_request_tries, ) def create( @@ -4313,13 +4414,19 @@ class StorageScrubber: assert stdout is not None return stdout - def scan_metadata(self) -> Any: - stdout = self.scrubber_cli( - ["scan-metadata", "--node-kind", "pageserver", "--json"], timeout=30 - ) + def scan_metadata(self, post_to_storage_controller: bool = False) -> Tuple[bool, Any]: + """ + Returns the health status and the metadata summary. + """ + args = ["scan-metadata", "--node-kind", "pageserver", "--json"] + if post_to_storage_controller: + args.append("--post") + stdout = self.scrubber_cli(args, timeout=30) try: - return json.loads(stdout) + summary = json.loads(stdout) + healthy = not summary["with_errors"] and not summary["with_warnings"] + return healthy, summary except: log.error("Failed to decode JSON output from `scan-metadata`. Dumping stdout:") log.error(stdout) @@ -4440,7 +4547,23 @@ def test_output_dir( yield test_dir - allure_attach_from_dir(test_dir) + # Allure artifacts creation might involve the creation of `.tar.zst` archives, + # which aren't going to be used if Allure results collection is not enabled + # (i.e. --alluredir is not set). + # Skip `allure_attach_from_dir` in this case + if not request.config.getoption("--alluredir"): + return + + preserve_database_files = False + for k, v in request.node.user_properties: + # NB: the neon_env_builder fixture uses this fixture (test_output_dir). + # So, neon_env_builder's cleanup runs before here. + # The cleanup propagates NeonEnvBuilder.preserve_database_files into this user property. + if k == "preserve_database_files": + assert isinstance(v, bool) + preserve_database_files = v + + allure_attach_from_dir(test_dir, preserve_database_files) class FileAndThreadLock: diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index c6df6b5baf..65d6ff5d62 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -61,6 +61,7 @@ class HistoricLayerInfo: remote: bool # None for image layers, true if pageserver thinks this is an L0 delta layer l0: Optional[bool] + visible: bool @classmethod def from_json(cls, d: Dict[str, Any]) -> HistoricLayerInfo: @@ -79,6 +80,7 @@ class HistoricLayerInfo: lsn_end=d.get("lsn_end"), remote=d["remote"], l0=l0_ness, + visible=d["access_stats"]["visible"], ) @@ -556,6 +558,22 @@ class PageserverHttpClient(requests.Session, MetricsGetter): assert isinstance(res_json, dict) return res_json + def timeline_block_gc(self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId): + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/block_gc", + ) + log.info(f"Got GC request response code: {res.status_code}") + self.verbose_error(res) + + def timeline_unblock_gc( + self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId + ): + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/unblock_gc", + ) + log.info(f"Got GC request response code: {res.status_code}") + self.verbose_error(res) + def timeline_compact( self, tenant_id: Union[TenantId, TenantShardId], @@ -663,6 +681,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): force_image_layer_creation=False, wait_until_uploaded=False, compact: Optional[bool] = None, + **kwargs, ): self.is_testing_enabled_or_skip() query = {} @@ -680,6 +699,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint", params=query, + **kwargs, ) log.info(f"Got checkpoint request response code: {res.status_code}") self.verbose_error(res) @@ -837,7 +857,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): timeline_id: TimelineId, batch_size: int | None = None, **kwargs, - ) -> List[TimelineId]: + ) -> Set[TimelineId]: params = {} if batch_size is not None: params["batch_size"] = batch_size @@ -848,7 +868,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): ) self.verbose_error(res) json = res.json() - return list(map(TimelineId, json["reparented_timelines"])) + return set(map(TimelineId, json["reparented_timelines"])) def evict_layer( self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 0989dc1893..4dc9f7caae 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -240,9 +240,18 @@ ATTACHMENT_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg] ) -def allure_attach_from_dir(dir: Path): +def allure_attach_from_dir(dir: Path, preserve_database_files: bool = False): """Attach all non-empty files from `dir` that matches `ATTACHMENT_NAME_REGEX` to Allure report""" + if preserve_database_files: + zst_file = dir.with_suffix(".tar.zst") + with zst_file.open("wb") as zst: + cctx = zstandard.ZstdCompressor() + with cctx.stream_writer(zst) as compressor: + with tarfile.open(fileobj=compressor, mode="w") as tar: + tar.add(dir, arcname="") + allure.attach.file(zst_file, "everything.tar.zst", "application/zstd", "tar.zst") + for attachment in Path(dir).glob("**/*"): if ATTACHMENT_NAME_REGEX.fullmatch(attachment.name) and attachment.stat().st_size > 0: name = str(attachment.relative_to(dir)) @@ -380,7 +389,10 @@ WaitUntilRet = TypeVar("WaitUntilRet") def wait_until( - number_of_iterations: int, interval: float, func: Callable[[], WaitUntilRet] + number_of_iterations: int, + interval: float, + func: Callable[[], WaitUntilRet], + show_intermediate_error=False, ) -> WaitUntilRet: """ Wait until 'func' returns successfully, without exception. Returns the @@ -393,6 +405,8 @@ def wait_until( except Exception as e: log.info("waiting for %s iteration %s failed", func, i + 1) last_exception = e + if show_intermediate_error: + log.info(e) time.sleep(interval) continue return res diff --git a/test_runner/logical_repl/README.md b/test_runner/logical_repl/README.md new file mode 100644 index 0000000000..8eca056dda --- /dev/null +++ b/test_runner/logical_repl/README.md @@ -0,0 +1,22 @@ +# Logical replication tests + +## Clickhouse + +```bash +export BENCHMARK_CONNSTR=postgres://user:pass@ep-abc-xyz-123.us-east-2.aws.neon.build/neondb + +docker compose -f clickhouse/docker-compose.yml up -d +pytest -m remote_cluster -k test_clickhouse +docker compose -f clickhouse/docker-compose.yml down +``` + +## Debezium + +```bash +export BENCHMARK_CONNSTR=postgres://user:pass@ep-abc-xyz-123.us-east-2.aws.neon.build/neondb + +docker compose -f debezium/docker-compose.yml up -d +pytest -m remote_cluster -k test_debezium +docker compose -f debezium/docker-compose.yml down + +``` \ No newline at end of file diff --git a/test_runner/logical_repl/clickhouse/docker-compose.yml b/test_runner/logical_repl/clickhouse/docker-compose.yml new file mode 100644 index 0000000000..e00038b811 --- /dev/null +++ b/test_runner/logical_repl/clickhouse/docker-compose.yml @@ -0,0 +1,9 @@ +services: + clickhouse: + image: clickhouse/clickhouse-server + user: "101:101" + container_name: clickhouse + hostname: clickhouse + ports: + - 127.0.0.1:8123:8123 + - 127.0.0.1:9000:9000 diff --git a/test_runner/logical_repl/debezium/docker-compose.yml b/test_runner/logical_repl/debezium/docker-compose.yml new file mode 100644 index 0000000000..fee127a2fd --- /dev/null +++ b/test_runner/logical_repl/debezium/docker-compose.yml @@ -0,0 +1,24 @@ +services: + zookeeper: + image: quay.io/debezium/zookeeper:2.7 + kafka: + image: quay.io/debezium/kafka:2.7 + environment: + ZOOKEEPER_CONNECT: "zookeeper:2181" + KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092 + KAFKA_BROKER_ID: 1 + KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 + KAFKA_JMX_PORT: 9991 + ports: + - 127.0.0.1:9092:9092 + debezium: + image: quay.io/debezium/connect:2.7 + environment: + BOOTSTRAP_SERVERS: kafka:9092 + GROUP_ID: 1 + CONFIG_STORAGE_TOPIC: debezium-config + OFFSET_STORAGE_TOPIC: debezium-offset + STATUS_STORAGE_TOPIC: debezium-status + DEBEZIUM_CONFIG_CONNECTOR_CLASS: io.debezium.connector.postgresql.PostgresConnector + ports: + - 127.0.0.1:8083:8083 diff --git a/test_runner/logical_repl/test_clickhouse.py b/test_runner/logical_repl/test_clickhouse.py new file mode 100644 index 0000000000..c5ed9bc8af --- /dev/null +++ b/test_runner/logical_repl/test_clickhouse.py @@ -0,0 +1,82 @@ +""" +Test the logical replication in Neon with ClickHouse as a consumer +""" + +import hashlib +import os +import time + +import clickhouse_connect +import psycopg2 +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import RemotePostgres +from fixtures.utils import wait_until + + +def query_clickhouse( + client, + query: str, + digest: str, +) -> None: + """ + Run the query on the client + return answer if successful, raise an exception otherwise + """ + log.debug("Query: %s", query) + res = client.query(query) + log.debug(res.result_rows) + m = hashlib.sha1() + m.update(repr(tuple(res.result_rows)).encode()) + hash_res = m.hexdigest() + log.debug("Hash: %s", hash_res) + if hash_res == digest: + return + raise ValueError("Hash mismatch") + + +@pytest.mark.remote_cluster +def test_clickhouse(remote_pg: RemotePostgres): + """ + Test the logical replication having ClickHouse as a client + """ + clickhouse_host = "clickhouse" if ("CI" in os.environ) else "127.0.0.1" + conn_options = remote_pg.conn_options() + conn = psycopg2.connect(remote_pg.connstr()) + cur = conn.cursor() + cur.execute("DROP TABLE IF EXISTS table1") + cur.execute("CREATE TABLE table1 (id integer primary key, column1 varchar(10));") + cur.execute("INSERT INTO table1 (id, column1) VALUES (1, 'abc'), (2, 'def');") + conn.commit() + client = clickhouse_connect.get_client(host=clickhouse_host) + client.command("SET allow_experimental_database_materialized_postgresql=1") + client.command( + "CREATE DATABASE db1_postgres ENGINE = " + f"MaterializedPostgreSQL('{conn_options['host']}', " + f"'{conn_options['dbname']}', " + f"'{conn_options['user']}', '{conn_options['password']}') " + "SETTINGS materialized_postgresql_tables_list = 'table1';" + ) + wait_until( + 120, + 0.5, + lambda: query_clickhouse( + client, + "select * from db1_postgres.table1 order by 1", + "ee600d8f7cd05bd0b169fa81f44300a9dd10085a", + ), + ) + cur.execute("INSERT INTO table1 (id, column1) VALUES (3, 'ghi'), (4, 'jkl');") + conn.commit() + wait_until( + 120, + 0.5, + lambda: query_clickhouse( + client, + "select * from db1_postgres.table1 order by 1", + "9eba2daaf7e4d7d27ac849525f68b562ab53947d", + ), + ) + log.debug("Sleeping before final checking if Neon is still alive") + time.sleep(3) + cur.execute("SELECT 1") diff --git a/test_runner/logical_repl/test_debezium.py b/test_runner/logical_repl/test_debezium.py new file mode 100644 index 0000000000..700b731418 --- /dev/null +++ b/test_runner/logical_repl/test_debezium.py @@ -0,0 +1,189 @@ +""" +Test the logical replication in Neon with Debezium as a consumer +""" + +import json +import os +import time + +import psycopg2 +import pytest +import requests +from fixtures.log_helper import log +from fixtures.neon_fixtures import RemotePostgres +from fixtures.utils import wait_until +from kafka import KafkaConsumer + + +class DebeziumAPI: + """ + The class for Debezium API calls + """ + + def __init__(self): + self.__host = "debezium" if ("CI" in os.environ) else "127.0.0.1" + self.__base_url = f"http://{self.__host}:8083" + self.__connectors_url = f"{self.__base_url}/connectors" + + def __request(self, method, addurl="", **kwargs): + return requests.request( + method, + self.__connectors_url + addurl, + headers={"Accept": "application/json", "Content-type": "application/json"}, + timeout=60, + **kwargs, + ) + + def create_pg_connector(self, remote_pg: RemotePostgres, dbz_conn_name: str): + """ + Create a Postgres connector in debezium + """ + conn_options = remote_pg.conn_options() + payload = { + "name": dbz_conn_name, + "config": { + "connector.class": "io.debezium.connector.postgresql.PostgresConnector", + "tasks.max": "1", + "database.hostname": conn_options["host"], + "database.port": "5432", + "database.user": conn_options["user"], + "database.password": conn_options["password"], + "database.dbname": conn_options["dbname"], + "plugin.name": "pgoutput", + "topic.prefix": "dbserver1", + "schema.include.list": "inventory", + }, + } + return self.__request("POST", json=payload) + + def list_connectors(self): + """ + Returns a list of all connectors existent in Debezium. + """ + resp = self.__request("GET") + assert resp.ok + return json.loads(resp.text) + + def del_connector(self, connector): + """ + Deletes the specified connector + """ + return self.__request("DELETE", f"/{connector}") + + +@pytest.fixture(scope="function") +def debezium(remote_pg: RemotePostgres): + """ + Prepare the Debezium API handler, connection + """ + conn = psycopg2.connect(remote_pg.connstr()) + cur = conn.cursor() + cur.execute("DROP SCHEMA IF EXISTS inventory CASCADE") + cur.execute("CREATE SCHEMA inventory") + cur.execute( + "CREATE TABLE inventory.customers (" + "id SERIAL NOT NULL PRIMARY KEY," + "first_name character varying(255) NOT NULL," + "last_name character varying(255) NOT NULL," + "email character varying(255) NOT NULL)" + ) + conn.commit() + dbz = DebeziumAPI() + assert len(dbz.list_connectors()) == 0 + dbz_conn_name = "inventory-connector" + resp = dbz.create_pg_connector(remote_pg, dbz_conn_name) + log.debug("%s %s %s", resp.status_code, resp.ok, resp.text) + assert resp.status_code == 201 + assert len(dbz.list_connectors()) == 1 + consumer = KafkaConsumer( + "dbserver1.inventory.customers", + bootstrap_servers=["kafka:9092"], + auto_offset_reset="earliest", + enable_auto_commit=False, + ) + yield conn, consumer + resp = dbz.del_connector(dbz_conn_name) + assert resp.status_code == 204 + + +def get_kafka_msg(consumer, ts_ms, before=None, after=None) -> None: + """ + Gets the message from Kafka and checks its validity + Arguments: + consumer: the consumer object + ts_ms: timestamp in milliseconds of the change of db, the corresponding message must have + the later timestamp + before: a dictionary, if not None, the before field from the kafka message must + have the same values for the same keys + after: a dictionary, if not None, the after field from the kafka message must + have the same values for the same keys + """ + msg = consumer.poll() + assert msg, "Empty message" + for val in msg.values(): + r = json.loads(val[-1].value) + log.info(r["payload"]) + assert ts_ms < r["payload"]["ts_ms"], "Incorrect timestamp" + for param, pname in ((before, "before"), (after, "after")): + if param is not None: + for k, v in param.items(): + assert r["payload"][pname][k] == v, f"{pname} mismatches" + + +@pytest.mark.remote_cluster +def test_debezium(debezium): + """ + Test the logical replication having Debezium as a subscriber + """ + conn, consumer = debezium + cur = conn.cursor() + ts_ms = time.time() * 1000 + log.info("Insert 1 ts_ms: %s", ts_ms) + cur.execute( + "insert into inventory.customers (first_name, last_name, email) " + "values ('John', 'Dow','johndow@example.com')" + ) + conn.commit() + wait_until( + 100, + 0.5, + lambda: get_kafka_msg( + consumer, + ts_ms, + after={"first_name": "John", "last_name": "Dow", "email": "johndow@example.com"}, + ), + show_intermediate_error=True, + ) + ts_ms = time.time() * 1000 + log.info("Insert 2 ts_ms: %s", ts_ms) + cur.execute( + "insert into inventory.customers (first_name, last_name, email) " + "values ('Alex', 'Row','alexrow@example.com')" + ) + conn.commit() + wait_until( + 100, + 0.5, + lambda: get_kafka_msg( + consumer, + ts_ms, + after={"first_name": "Alex", "last_name": "Row", "email": "alexrow@example.com"}, + ), + show_intermediate_error=True, + ) + ts_ms = time.time() * 1000 + log.info("Update ts_ms: %s", ts_ms) + cur.execute("update inventory.customers set first_name = 'Alexander' where id = 2") + conn.commit() + wait_until( + 100, + 0.5, + lambda: get_kafka_msg( + consumer, + ts_ms, + after={"first_name": "Alexander"}, + ), + show_intermediate_error=True, + ) + time.sleep(3) + cur.execute("select 1") diff --git a/test_runner/performance/test_gc_feedback.py b/test_runner/performance/test_gc_feedback.py index 4c326111c2..9861259c16 100644 --- a/test_runner/performance/test_gc_feedback.py +++ b/test_runner/performance/test_gc_feedback.py @@ -6,21 +6,8 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder -@pytest.mark.timeout(10000) -def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker): - """ - Test that GC is able to collect all old layers even if them are forming - "stairs" and there are not three delta layers since last image layer. - - Information about image layers needed to collect old layers should - be propagated by GC to compaction task which should take in in account - when make a decision which new image layers needs to be created. - - NB: this test demonstrates the problem. The source tree contained the - `gc_feedback` mechanism for about 9 months, but, there were problems - with it and it wasn't enabled at runtime. - This PR removed the code: https://github.com/neondatabase/neon/pull/6863 - """ +def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, mode: str): + assert mode == "normal" or mode == "with_snapshots" env = neon_env_builder.init_start() client = env.pageserver.http_client() @@ -74,6 +61,9 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma physical_size = client.timeline_detail(tenant_id, timeline_id)["current_physical_size"] log.info(f"Physical storage size {physical_size}") + if mode == "with_snapshots": + if step == n_steps / 2: + env.neon_cli.create_branch("child") max_num_of_deltas_above_image = 0 max_total_num_of_deltas = 0 @@ -149,3 +139,37 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma log.info(f"Writing layer map to {layer_map_path}") with layer_map_path.open("w") as f: f.write(json.dumps(client.timeline_layer_map_info(tenant_id, timeline_id))) + + +@pytest.mark.timeout(10000) +def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker): + """ + Test that GC is able to collect all old layers even if them are forming + "stairs" and there are not three delta layers since last image layer. + + Information about image layers needed to collect old layers should + be propagated by GC to compaction task which should take in in account + when make a decision which new image layers needs to be created. + + NB: this test demonstrates the problem. The source tree contained the + `gc_feedback` mechanism for about 9 months, but, there were problems + with it and it wasn't enabled at runtime. + This PR removed the code: https://github.com/neondatabase/neon/pull/6863 + + And the bottom-most GC-compaction epic resolves the problem. + https://github.com/neondatabase/neon/issues/8002 + """ + gc_feedback_impl(neon_env_builder, zenbenchmark, "normal") + + +@pytest.mark.timeout(10000) +def test_gc_feedback_with_snapshots( + neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker +): + """ + Compared with `test_gc_feedback`, we create a branch without written data (=snapshot) in the middle + of the benchmark, and the bottom-most compaction should collect as much garbage as possible below the GC + horizon. Ideally, there should be images (in an image layer) covering the full range at the branch point, + and images covering the full key range (in a delta layer) at the GC horizon. + """ + gc_feedback_impl(neon_env_builder, zenbenchmark, "with_snapshots") diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index 53bb29a659..4b4ffc1fee 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -100,24 +100,32 @@ def test_subscriber_lag( pub_connstr = benchmark_project_pub.connstr sub_connstr = benchmark_project_sub.connstr - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) + if benchmark_project_pub.is_new: + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) + if benchmark_project_sub.is_new: + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) pub_conn = psycopg2.connect(pub_connstr) sub_conn = psycopg2.connect(sub_connstr) pub_conn.autocommit = True sub_conn.autocommit = True with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: - if benchmark_project_pub.is_new: - pub_cur.execute("create publication pub1 for table pgbench_accounts, pgbench_history") + pub_cur.execute("SELECT 1 FROM pg_catalog.pg_publication WHERE pubname = 'pub1'") + pub_exists = len(pub_cur.fetchall()) != 0 - if benchmark_project_sub.is_new: + if not pub_exists: + pub_cur.execute("CREATE PUBLICATION pub1 FOR TABLE pgbench_accounts, pgbench_history") + + sub_cur.execute("SELECT 1 FROM pg_catalog.pg_subscription WHERE subname = 'sub1'") + sub_exists = len(sub_cur.fetchall()) != 0 + if not sub_exists: sub_cur.execute("truncate table pgbench_accounts") sub_cur.execute("truncate table pgbench_history") - sub_cur.execute(f"create subscription sub1 connection '{pub_connstr}' publication pub1") + sub_cur.execute(f"CREATE SUBSCRIPTION sub1 CONNECTION '{pub_connstr}' PUBLICATION pub1") initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur) + pub_conn.close() sub_conn.close() @@ -195,10 +203,15 @@ def test_publisher_restart( pub_conn.autocommit = True sub_conn.autocommit = True with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: - if benchmark_project_pub.is_new: + pub_cur.execute("SELECT 1 FROM pg_catalog.pg_publication WHERE pubname = 'pub1'") + pub_exists = len(pub_cur.fetchall()) != 0 + + if not pub_exists: pub_cur.execute("create publication pub1 for table pgbench_accounts, pgbench_history") - if benchmark_project_sub.is_new: + sub_cur.execute("SELECT 1 FROM pg_catalog.pg_subscription WHERE subname = 'sub1'") + sub_exists = len(sub_cur.fetchall()) != 0 + if not sub_exists: sub_cur.execute("truncate table pgbench_accounts") sub_cur.execute("truncate table pgbench_history") diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index 3a6113706f..04785f7184 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -8,7 +8,12 @@ import pytest from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + PageserverAvailability, + PageserverSchedulingPolicy, +) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pg_version import PgVersion @@ -106,7 +111,8 @@ def test_storage_controller_many_tenants( # Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts. # TODO: tune this down as restarts get faster (https://github.com/neondatabase/neon/pull/7553), to # guard against regressions in restart time. - "max_unavailable": "300s" + "max_offline": "30s", + "max_warming_up": "300s", } neon_env_builder.control_plane_compute_hook_api = ( compute_reconfigure_listener.control_plane_compute_hook_api @@ -211,7 +217,11 @@ def test_storage_controller_many_tenants( # A reconciler operation: migrate a shard. shard_number = rng.randint(0, shard_count - 1) tenant_shard_id = TenantShardId(tenant_id, shard_number, shard_count) - dest_ps_id = rng.choice([ps.id for ps in env.pageservers]) + + # Migrate it to its secondary location + desc = env.storage_controller.tenant_describe(tenant_id) + dest_ps_id = desc["shards"][shard_number]["node_secondary"][0] + f = executor.submit( env.storage_controller.tenant_shard_migrate, tenant_shard_id, dest_ps_id ) @@ -225,7 +235,11 @@ def test_storage_controller_many_tenants( for f in futs: f.result() - # Consistency check is safe here: all the previous operations waited for reconcile before completing + # Some of the operations above (notably migrations) might leave the controller in a state where it has + # some work to do, for example optimizing shard placement after we do a random migration. Wait for the system + # to reach a quiescent state before doing following checks. + env.storage_controller.reconcile_until_idle() + env.storage_controller.consistency_check() check_memory() @@ -274,7 +288,11 @@ def test_storage_controller_many_tenants( ) env.storage_controller.poll_node_status( - ps.id, "PauseForRestart", max_attempts=24, backoff=5 + ps.id, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.PAUSE_FOR_RESTART, + max_attempts=24, + backoff=5, ) shard_counts = get_consistent_node_shard_counts(env, total_shards) @@ -285,12 +303,24 @@ def test_storage_controller_many_tenants( assert sum(shard_counts.values()) == total_shards ps.restart() - env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=24, backoff=1) + env.storage_controller.poll_node_status( + ps.id, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.ACTIVE, + max_attempts=24, + backoff=1, + ) env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2 ) - env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=24, backoff=5) + env.storage_controller.poll_node_status( + ps.id, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.ACTIVE, + max_attempts=24, + backoff=5, + ) shard_counts = get_consistent_node_shard_counts(env, total_shards) log.info(f"Shard counts after filling node {ps.id}: {shard_counts}") diff --git a/test_runner/regress/test_bad_connection.py b/test_runner/regress/test_bad_connection.py index 82a3a05c2b..392b73c1f7 100644 --- a/test_runner/regress/test_bad_connection.py +++ b/test_runner/regress/test_bad_connection.py @@ -10,7 +10,12 @@ from fixtures.neon_fixtures import NeonEnvBuilder @pytest.mark.timeout(600) def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() - env.pageserver.allowed_errors.append(".*simulated connection error.*") + env.pageserver.allowed_errors.append(".*simulated connection error.*") # this is never hit + + # the real reason (Simulated Connection Error) is on the next line, and we cannot filter this out. + env.pageserver.allowed_errors.append( + ".*ERROR error in page_service connection task: Postgres query error" + ) # Enable failpoint before starting everything else up so that we exercise the retry # on fetching basebackup @@ -69,3 +74,7 @@ def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): cur.fetchall() times_executed += 1 log.info(f"Workload executed {times_executed} times") + + # do a graceful shutdown which would had caught the allowed_errors before + # https://github.com/neondatabase/neon/pull/8632 + env.pageserver.stop() diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 03d6946c15..fc74707639 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -18,7 +18,6 @@ from fixtures.pageserver.utils import wait_until_tenant_active from fixtures.utils import query_scalar from performance.test_perf_pgbench import get_scales_matrix from requests import RequestException -from requests.exceptions import RetryError # Test branch creation @@ -151,7 +150,7 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE env.pageserver.allowed_errors.extend( [ ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*", - ".*page_service_conn_main.*: query handler for 'basebackup .* is not active, state: Loading", + ".*page_service_conn_main.*: query handler for 'basebackup .* ERROR: Not found: Timeline", ] ) ps_http = env.pageserver.http_client() @@ -176,10 +175,12 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE env.neon_cli.map_branch(initial_branch, env.initial_tenant, env.initial_timeline) - with pytest.raises(RuntimeError, match="is not active, state: Loading"): - env.endpoints.create_start(initial_branch, tenant_id=env.initial_tenant) + with pytest.raises(RuntimeError, match="ERROR: Not found: Timeline"): + env.endpoints.create_start( + initial_branch, tenant_id=env.initial_tenant, basebackup_request_tries=2 + ) + ps_http.configure_failpoints(("before-upload-index-pausable", "off")) finally: - # FIXME: paused uploads bother shutdown env.pageserver.stop(immediate=True) t.join() @@ -193,8 +194,11 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder env = neon_env_builder.init_configs() env.start() - env.pageserver.allowed_errors.append( - ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*" + env.pageserver.allowed_errors.extend( + [ + ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*", + ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: .*Cannot branch off the timeline that's not present in pageserver.*", + ] ) ps_http = env.pageserver.http_client() @@ -216,7 +220,10 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder branch_id = TimelineId.generate() - with pytest.raises(RetryError, match="too many 503 error responses"): + with pytest.raises( + PageserverApiException, + match="Cannot branch off the timeline that's not present in pageserver", + ): ps_http.timeline_create( env.pg_version, env.initial_tenant, @@ -389,6 +396,11 @@ def test_duplicate_creation(neon_env_builder: NeonEnvBuilder): repeat_result = ps_http.timeline_create( env.pg_version, env.initial_tenant, success_timeline, timeout=60 ) + # remote_consistent_lsn_visible will be published only after we've + # confirmed the generation, which is not part of what we await during + # timeline creation (uploads). mask it out here to avoid flakyness. + del success_result["remote_consistent_lsn_visible"] + del repeat_result["remote_consistent_lsn_visible"] assert repeat_result == success_result finally: env.pageserver.stop(immediate=True) diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 61afd820ca..5ec9a22ba1 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -17,22 +17,17 @@ from fixtures.pg_version import PgVersion # Test restarting page server, while safekeeper and compute node keep # running. def test_local_corruption(neon_env_builder: NeonEnvBuilder): - if neon_env_builder.pageserver_get_impl == "vectored": - reconstruct_function_name = "get_values_reconstruct_data" - else: - reconstruct_function_name = "get_value_reconstruct_data" - env = neon_env_builder.init_start() env.pageserver.allowed_errors.extend( [ - f".*{reconstruct_function_name} for layer .*", + ".*get_values_reconstruct_data for layer .*", ".*could not find data for key.*", ".*is not active. Current state: Broken.*", ".*will not become active. Current state: Broken.*", ".*failed to load metadata.*", ".*load failed.*load local timeline.*", - ".*layer loading failed permanently: load layer: .*", + ".*: layer load failed, assuming permanent failure:.*", ] ) @@ -79,7 +74,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): # (We don't check layer file contents on startup, when loading the timeline) # # This will change when we implement checksums for layers - with pytest.raises(Exception, match=f"{reconstruct_function_name} for layer ") as err: + with pytest.raises(Exception, match="get_values_reconstruct_data for layer ") as err: pg1.start() log.info( f"As expected, compute startup failed for timeline {tenant1}/{timeline1} with corrupt layers: {err}" diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 65649e0c0a..afa5f6873c 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -3,18 +3,15 @@ import re import shutil import subprocess import tempfile +from dataclasses import dataclass from pathlib import Path from typing import List, Optional import pytest import toml -from fixtures.common_types import Lsn +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import ( - NeonEnv, - NeonEnvBuilder, - PgBin, -) +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( timeline_delete_wait_completed, @@ -22,7 +19,8 @@ from fixtures.pageserver.utils import ( wait_for_upload, ) from fixtures.pg_version import PgVersion -from fixtures.remote_storage import RemoteStorageKind +from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage +from fixtures.workload import Workload # # A test suite that help to prevent unintentionally breaking backward or forward compatibility between Neon releases. @@ -227,12 +225,6 @@ def test_forward_compatibility( ) try: - # Previous version neon_local and pageserver are not aware - # of the new config. - # TODO: remove these once the previous version of neon local supports them - neon_env_builder.pageserver_get_impl = None - neon_env_builder.pageserver_validate_vectored_get = None - neon_env_builder.num_safekeepers = 3 # Use previous version's production binaries (pageserver, safekeeper, pg_distrib_dir, etc.). @@ -415,3 +407,132 @@ def dump_differs( break return differs + + +@dataclass +class HistoricDataSet: + name: str + tenant_id: TenantId + pg_version: PgVersion + url: str + + def __str__(self): + return self.name + + +HISTORIC_DATA_SETS = [ + # From before we enabled image layer compression. + # - IndexPart::LATEST_VERSION 7 + # - STORAGE_FORMAT_VERSION 3 + HistoricDataSet( + "2024-07-18", + TenantId("17bf64a53509714687664b3a84e9b3ba"), + PgVersion.V16, + "https://neon-github-public-dev.s3.eu-central-1.amazonaws.com/compatibility-data-snapshots/2024-07-18-pgv16.tar.zst", + ), +] + + +@pytest.mark.parametrize("dataset", HISTORIC_DATA_SETS) +@pytest.mark.xdist_group("compatibility") +def test_historic_storage_formats( + neon_env_builder: NeonEnvBuilder, + test_output_dir: Path, + pg_version: PgVersion, + dataset: HistoricDataSet, +): + """ + This test is like test_backward_compatibility, but it looks back further to examples of our storage format from long ago. + """ + + ARTIFACT_CACHE_DIR = "./artifact_cache" + + import tarfile + from contextlib import closing + + import requests + import zstandard + + artifact_unpack_path = ARTIFACT_CACHE_DIR / Path("unpacked") / Path(dataset.name) + + # Note: we assume that when running across a matrix of PG versions, the matrix includes all the versions needed by + # HISTORIC_DATA_SETS. If we ever remove a PG version from the matrix, then historic datasets built using that version + # will no longer be covered by this test. + if pg_version != dataset.pg_version: + pytest.skip(f"Dataset {dataset} is for different PG version, skipping") + + with closing(requests.get(dataset.url, stream=True)) as r: + unzstd = zstandard.ZstdDecompressor() + with unzstd.stream_reader(r.raw) as stream: + with tarfile.open(mode="r|", fileobj=stream) as tf: + tf.extractall(artifact_unpack_path) + + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.pg_version = dataset.pg_version + env = neon_env_builder.init_configs() + env.start() + assert isinstance(env.pageserver_remote_storage, S3Storage) + + # Link artifact data into test's remote storage. We don't want the whole repo dir, just the remote storage part: we are not testing + # compat of local disk data across releases (test_backward_compat does that), we're testing really long-lived data in S3 like layer files and indices. + # + # The code generating the snapshot uses local_fs, but this test uses S3Storage, so we are copying a tree of files into a bucket. We use + # S3Storage so that the scrubber can run (the scrubber doesn't speak local_fs) + artifact_pageserver_path = ( + artifact_unpack_path / Path("repo") / Path("local_fs_remote_storage") / Path("pageserver") + ) + for root, _dirs, files in os.walk(artifact_pageserver_path): + for file in files: + local_path = os.path.join(root, file) + remote_key = ( + env.pageserver_remote_storage.prefix_in_bucket + + str(local_path)[len(str(artifact_pageserver_path)) :] + ) + log.info(f"Uploading {local_path} -> {remote_key}") + env.pageserver_remote_storage.client.upload_file( + local_path, env.pageserver_remote_storage.bucket_name, remote_key + ) + + # Check the scrubber handles this old data correctly (can read it and doesn't consider it corrupt) + # + # Do this _before_ importing to the pageserver, as that import may start writing immediately + healthy, metadata_summary = env.storage_scrubber.scan_metadata() + assert healthy + assert metadata_summary["tenant_count"] >= 1 + assert metadata_summary["timeline_count"] >= 1 + + env.neon_cli.import_tenant(dataset.tenant_id) + + # Discover timelines + timelines = env.pageserver.http_client().timeline_list(dataset.tenant_id) + # All our artifacts should contain at least one timeline + assert len(timelines) > 0 + + # TODO: ensure that the snapshots we're importing contain a sensible variety of content, at the very + # least they should include a mixture of deltas and image layers. Preferably they should also + # contain some "exotic" stuff like aux files from logical replication. + + # Check we can start an endpoint and read the SQL that the artifact is meant to contain + reference_sql_dump = artifact_unpack_path / Path("dump.sql") + ep = env.endpoints.create_start("main", tenant_id=dataset.tenant_id) + pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) + pg_bin.run_capture( + ["pg_dumpall", f"--dbname={ep.connstr()}", f"--file={test_output_dir / 'dump.sql'}"] + ) + assert not dump_differs( + reference_sql_dump, + test_output_dir / "dump.sql", + test_output_dir / "dump.filediff", + ) + ep.stop() + + # Check we can also do writes to the database + existing_timeline_id = TimelineId(timelines[0]["timeline_id"]) + workload = Workload(env, dataset.tenant_id, existing_timeline_id) + workload.init() + workload.write_rows(100) + + # Check that compaction works + env.pageserver.http_client().timeline_compact( + dataset.tenant_id, existing_timeline_id, force_image_layer_creation=True + ) diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 91c7b97fdd..85616c3fe2 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -591,7 +591,7 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder): abs_diff = abs(ratio - expected_ratio) assert original_count > count_now - expectation = 0.06 + expectation = 0.065 log.info( f"tenant {tenant_id} layer count {original_count} -> {count_now}, ratio: {ratio}, expecting {abs_diff} < {expectation}" ) diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 041942cda3..73af7950f1 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -214,12 +214,11 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): # Having written a mixture of generation-aware and legacy index_part.json, # ensure the scrubber handles the situation as expected. - metadata_summary = env.storage_scrubber.scan_metadata() + healthy, metadata_summary = env.storage_scrubber.scan_metadata() assert metadata_summary["tenant_count"] == 1 # Scrubber should have seen our timeline assert metadata_summary["timeline_count"] == 1 assert metadata_summary["timeline_shard_count"] == 1 - assert not metadata_summary["with_errors"] - assert not metadata_summary["with_warnings"] + assert healthy def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): @@ -596,19 +595,26 @@ def test_multi_attach( for ps in pageservers: ps.stop() - # Returning to a normal healthy state: all pageservers will start, but only the one most - # recently attached via the control plane will re-attach on startup + # Returning to a normal healthy state: all pageservers will start for ps in pageservers: ps.start() - with pytest.raises(PageserverApiException): - _detail = http_clients[0].timeline_detail(tenant_id, timeline_id) - with pytest.raises(PageserverApiException): - _detail = http_clients[1].timeline_detail(tenant_id, timeline_id) - _detail = http_clients[2].timeline_detail(tenant_id, timeline_id) + # Pageservers are marked offline by the storage controller during the rolling restart + # above. This may trigger a reschedulling, so there's no guarantee that the tenant + # shard ends up attached to the most recent ps. + raised = 0 + serving_ps_idx = None + for idx, http_client in enumerate(http_clients): + try: + _detail = http_client.timeline_detail(tenant_id, timeline_id) + serving_ps_idx = idx + except PageserverApiException: + raised += 1 + + assert raised == 2 and serving_ps_idx is not None # All data we wrote while multi-attached remains readable - workload.validate(pageservers[2].id) + workload.validate(pageservers[serving_ps_idx].id) def test_upgrade_generationless_local_file_paths( diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index 4ce53df214..68a45f957c 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -13,7 +13,10 @@ from fixtures.utils import wait_until # running. def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.enable_scrub_on_exit() + + # We inject a delay of 15 seconds for tenant activation below. + # Hence, bump the max delay here to not skip over the activation. + neon_env_builder.pageserver_config_override = 'background_task_maximum_delay="20s"' env = neon_env_builder.init_start() @@ -70,7 +73,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): # pageserver does if a compute node connects and sends a request for the tenant # while it's still in Loading state. (It waits for the loading to finish, and then # processes the request.) - tenant_load_delay_ms = 5000 + tenant_load_delay_ms = 15000 env.pageserver.stop() env.pageserver.start( extra_env_vars={"FAILPOINTS": f"before-attaching-tenant=return({tenant_load_delay_ms})"} @@ -157,7 +160,6 @@ def test_pageserver_chaos( pytest.skip("times out in debug builds") neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.enable_scrub_on_exit() if shard_count is not None: neon_env_builder.num_pageservers = shard_count diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 58d61eab0d..8746b88a75 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -2,10 +2,11 @@ import json import os import random import time -from typing import Any, Dict, Optional +from pathlib import Path +from typing import Any, Dict, Optional, Union import pytest -from fixtures.common_types import TenantId, TimelineId +from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver from fixtures.pageserver.common_types import parse_layer_file_name @@ -122,7 +123,12 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver, "scheduling": "Stop", }, ) - env.storage_controller.allowed_errors.append(".*Scheduling is disabled by policy Stop.*") + env.storage_controller.allowed_errors.extend( + [ + ".*Scheduling is disabled by policy Stop.*", + ".*Skipping reconcile for policy Stop.*", + ] + ) # We use a fixed seed to make the test reproducible: we want a randomly # chosen order, but not to change the order every time we run the test. @@ -385,6 +391,9 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): # (reproduce https://github.com/neondatabase/neon/issues/6802) pageserver_b.http_client().tenant_delete(tenant_id) + # We deleted our only tenant, and the scrubber fails if it detects nothing + neon_env_builder.disable_scrub_on_exit() + def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): """ @@ -429,6 +438,35 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): validate_heatmap(heatmap_second) +def list_elegible_layers( + pageserver, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId +) -> list[Path]: + """ + The subset of layer filenames that are elegible for secondary download: at time of writing this + is all resident layers which are also visible. + """ + candidates = pageserver.list_layers(tenant_id, timeline_id) + + layer_map = pageserver.http_client().layer_map_info(tenant_id, timeline_id) + + # Map of layer filenames to their visibility the "layer name" is not the same as the filename: add suffix to resolve one to the other + visible_map = dict( + (f"{layer.layer_file_name}-v1-00000001", layer.visible) + for layer in layer_map.historic_layers + ) + + def is_visible(layer_file_name): + try: + return visible_map[str(layer_file_name)] + except KeyError: + # Unexpected: tests should call this when pageservers are in a quiet state such that the layer map + # matches what's on disk. + log.warn(f"Lookup {layer_file_name} from {list(visible_map.keys())}") + raise + + return list(c for c in candidates if is_visible(c)) + + def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): """ Test the overall data flow in secondary mode: @@ -483,7 +521,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ps_secondary.http_client().tenant_secondary_download(tenant_id) - assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers( + assert list_elegible_layers(ps_attached, tenant_id, timeline_id) == ps_secondary.list_layers( tenant_id, timeline_id ) @@ -501,9 +539,9 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ps_secondary.http_client().tenant_secondary_download(tenant_id) try: - assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers( - tenant_id, timeline_id - ) + assert list_elegible_layers( + ps_attached, tenant_id, timeline_id + ) == ps_secondary.list_layers(tenant_id, timeline_id) except: # Do a full listing of the secondary location on errors, to help debug of # https://github.com/neondatabase/neon/issues/6966 @@ -524,8 +562,8 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): # ================================================================== try: log.info("Evicting a layer...") - layer_to_evict = ps_attached.list_layers(tenant_id, timeline_id)[0] - some_other_layer = ps_attached.list_layers(tenant_id, timeline_id)[1] + layer_to_evict = list_elegible_layers(ps_attached, tenant_id, timeline_id)[0] + some_other_layer = list_elegible_layers(ps_attached, tenant_id, timeline_id)[1] log.info(f"Victim layer: {layer_to_evict.name}") ps_attached.http_client().evict_layer( tenant_id, timeline_id, layer_name=layer_to_evict.name @@ -543,9 +581,9 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ps_secondary.http_client().tenant_secondary_download(tenant_id) assert layer_to_evict not in ps_attached.list_layers(tenant_id, timeline_id) - assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers( - tenant_id, timeline_id - ) + assert list_elegible_layers( + ps_attached, tenant_id, timeline_id + ) == ps_secondary.list_layers(tenant_id, timeline_id) except: # On assertion failures, log some details to help with debugging heatmap = env.pageserver_remote_storage.heatmap_content(tenant_id) @@ -555,7 +593,8 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): # Scrub the remote storage # ======================== # This confirms that the scrubber isn't upset by the presence of the heatmap - env.storage_scrubber.scan_metadata() + healthy, _ = env.storage_scrubber.scan_metadata() + assert healthy # Detach secondary and delete tenant # =================================== @@ -584,6 +623,9 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ) workload.stop() + # We deleted our only tenant, and the scrubber fails if it detects nothing + neon_env_builder.disable_scrub_on_exit() + def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): """ diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index d5b5ac3f75..6f7ea0092a 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -138,7 +138,6 @@ def test_pg_regress( neon_env_builder.num_pageservers = shard_count neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.enable_scrub_on_exit() env = neon_env_builder.init_start( initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count, @@ -202,7 +201,6 @@ def test_isolation( if shard_count is not None: neon_env_builder.num_pageservers = shard_count neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.enable_scrub_on_exit() env = neon_env_builder.init_start( initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count ) @@ -265,7 +263,6 @@ def test_sql_regress( if shard_count is not None: neon_env_builder.num_pageservers = shard_count neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.enable_scrub_on_exit() env = neon_env_builder.init_start( initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count ) diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 09f941f582..2e5260ca78 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -12,7 +12,6 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, wait_for_last_flush_lsn, ) -from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.pageserver.utils import ( timeline_delete_wait_completed, @@ -313,6 +312,7 @@ def test_remote_storage_upload_queue_retries( def churn_while_failpoints_active(result): overwrite_data_and_wait_for_it_to_arrive_at_pageserver("c") + # this call will wait for the failpoints to be turned off client.timeline_checkpoint(tenant_id, timeline_id) client.timeline_compact(tenant_id, timeline_id) overwrite_data_and_wait_for_it_to_arrive_at_pageserver("d") @@ -332,8 +332,8 @@ def test_remote_storage_upload_queue_retries( # Exponential back-off in upload queue, so, gracious timeouts. wait_until(30, 1, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0)) - wait_until(30, 1, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 2)) - wait_until(30, 1, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="delete"), 0)) + wait_until(30, 1, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 1)) + wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0)) # unblock churn operations configure_storage_sync_failpoints("off") @@ -769,11 +769,11 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv create_thread.join() -def test_compaction_waits_for_upload( +def test_paused_upload_stalls_checkpoint( neon_env_builder: NeonEnvBuilder, ): """ - This test forces a race between upload and compaction. + This test checks that checkpoints block on uploads to remote storage. """ neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) @@ -788,6 +788,10 @@ def test_compaction_waits_for_upload( } ) + env.pageserver.allowed_errors.append( + f".*PUT.* path=/v1/tenant/{env.initial_tenant}/timeline.* request was dropped before completing" + ) + tenant_id = env.initial_tenant timeline_id = env.initial_timeline @@ -808,76 +812,9 @@ def test_compaction_waits_for_upload( endpoint.safe_psql("CREATE TABLE foo AS SELECT x FROM generate_series(1, 10000) g(x)") wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) - client.timeline_checkpoint(tenant_id, timeline_id) - deltas_at_first = len(client.layer_map_info(tenant_id, timeline_id).delta_layers()) - assert ( - deltas_at_first == 2 - ), "are you fixing #5863? just add one more checkpoint after 'CREATE TABLE bar ...' statement." - - endpoint.safe_psql("CREATE TABLE bar AS SELECT x FROM generate_series(1, 10000) g(x)") - endpoint.safe_psql("UPDATE foo SET x = 0 WHERE x = 1") - wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) - - layers_before_last_checkpoint = client.layer_map_info(tenant_id, timeline_id).historic_by_name() - upload_stuck_layers = layers_before_last_checkpoint - layers_at_creation.historic_by_name() - - assert len(upload_stuck_layers) > 0 - - for name in upload_stuck_layers: - assert env.pageserver.layer_exists( - tenant_id, timeline_id, parse_layer_file_name(name) - ), "while uploads are stuck the layers should be present on disk" - - # now this will do the L0 => L1 compaction and want to remove - # upload_stuck_layers and the original initdb L0 - client.timeline_checkpoint(tenant_id, timeline_id) - - # as uploads are paused, the upload_stuck_layers should still be with us - for name in upload_stuck_layers: - assert env.pageserver.layer_exists( - tenant_id, timeline_id, parse_layer_file_name(name) - ), "uploads are stuck still over compaction" - - compacted_layers = client.layer_map_info(tenant_id, timeline_id).historic_by_name() - overlap = compacted_layers.intersection(upload_stuck_layers) - assert len(overlap) == 0, "none of the L0's should remain after L0 => L1 compaction" - assert ( - len(compacted_layers) == 1 - ), "there should be one L1 after L0 => L1 compaction (without #5863 being fixed)" - - def layer_deletes_completed(): - m = client.get_metric_value("pageserver_layer_completed_deletes_total") - if m is None: - return 0 - return int(m) - - # if initdb created an initial delta layer, it might already be gc'd - # because it was uploaded before the failpoint was enabled. however, the - # deletion is not guaranteed to be complete. - assert layer_deletes_completed() <= 1 - - client.configure_failpoints(("before-upload-layer-pausable", "off")) - - # Ensure that this actually terminates - wait_upload_queue_empty(client, tenant_id, timeline_id) - - def until_layer_deletes_completed(): - deletes = layer_deletes_completed() - log.info(f"layer_deletes: {deletes}") - # ensure that initdb delta layer AND the previously stuck are now deleted - assert deletes >= len(upload_stuck_layers) + 1 - - wait_until(10, 1, until_layer_deletes_completed) - - for name in upload_stuck_layers: - assert not env.pageserver.layer_exists( - tenant_id, timeline_id, parse_layer_file_name(name) - ), "l0 should now be removed because of L0 => L1 compaction and completed uploads" - - # We should not have hit the error handling path in uploads where a uploaded file is gone - assert not env.pageserver.log_contains( - "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more." - ) + with pytest.raises(ReadTimeout): + client.timeline_checkpoint(tenant_id, timeline_id, timeout=5) + client.configure_failpoints(("before-upload-layer-pausable", "off")) def wait_upload_queue_empty( diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 90c6e26d01..1011a6fd22 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -47,9 +47,6 @@ def test_sharding_smoke( # Use S3-compatible remote storage so that we can scrub: this test validates # that the scrubber doesn't barf when it sees a sharded tenant. neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.enable_scrub_on_exit() - - neon_env_builder.preserve_database_files = True env = neon_env_builder.init_start( initial_tenant_shard_count=shard_count, initial_tenant_shard_stripe_size=stripe_size @@ -127,8 +124,8 @@ def test_sharding_smoke( # Check the scrubber isn't confused by sharded content, then disable # it during teardown because we'll have deleted by then - env.storage_scrubber.scan_metadata() - neon_env_builder.scrub_on_exit = False + healthy, _ = env.storage_scrubber.scan_metadata() + assert healthy env.storage_controller.pageserver_api().tenant_delete(tenant_id) assert_prefix_empty( @@ -200,14 +197,15 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint: # disable background compaction and GC. We invoke it manually when we want it to happen. "gc_period": "0s", "compaction_period": "0s", - # create image layers eagerly, so that GC can remove some layers - "image_creation_threshold": 1, + # Disable automatic creation of image layers, as we will create them explicitly when we want them + "image_creation_threshold": 9999, "image_layer_creation_check_threshold": 0, } neon_env_builder.storage_controller_config = { # Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts. - "max_unavailable": "300s" + "max_offline": "30s", + "max_warming_up": "300s", } env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) @@ -226,7 +224,7 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint: # Do a full image layer generation before splitting, so that when we compact after splitting # we should only see sizes decrease (from post-split drops/rewrites), not increase (from image layer generation) - env.get_tenant_pageserver(tenant_id).http_client().timeline_compact( + env.get_tenant_pageserver(tenant_id).http_client().timeline_checkpoint( tenant_id, timeline_id, force_image_layer_creation=True, wait_until_uploaded=True ) @@ -372,9 +370,6 @@ def test_sharding_split_smoke( # Use S3-compatible remote storage so that we can scrub: this test validates # that the scrubber doesn't barf when it sees a sharded tenant. neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.enable_scrub_on_exit() - - neon_env_builder.preserve_database_files = True non_default_tenant_config = {"gc_horizon": 77 * 1024 * 1024} diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 741f16685e..eb2cdccdb9 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -3,7 +3,7 @@ import threading import time from collections import defaultdict from datetime import datetime, timezone -from typing import Any, Dict, List, Union +from typing import Any, Dict, List, Optional, Set, Tuple, Union import pytest from fixtures.common_types import TenantId, TenantShardId, TimelineId @@ -12,6 +12,8 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, + PageserverAvailability, + PageserverSchedulingPolicy, PgBin, StorageControllerApiException, TokenScope, @@ -918,6 +920,8 @@ def test_storage_controller_tenant_deletion( class Failure: pageserver_id: int + offline_timeout: int + must_detect_after: int def apply(self, env: NeonEnv): raise NotImplementedError() @@ -930,9 +934,11 @@ class Failure: class NodeStop(Failure): - def __init__(self, pageserver_ids, immediate): + def __init__(self, pageserver_ids, immediate, offline_timeout, must_detect_after): self.pageserver_ids = pageserver_ids self.immediate = immediate + self.offline_timeout = offline_timeout + self.must_detect_after = must_detect_after def apply(self, env: NeonEnv): for ps_id in self.pageserver_ids: @@ -948,10 +954,42 @@ class NodeStop(Failure): return self.pageserver_ids +class NodeRestartWithSlowReattach(Failure): + def __init__(self, pageserver_id, offline_timeout, must_detect_after): + self.pageserver_id = pageserver_id + self.offline_timeout = offline_timeout + self.must_detect_after = must_detect_after + self.thread = None + + def apply(self, env: NeonEnv): + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.stop(immediate=False) + + def start_ps(): + pageserver.start( + extra_env_vars={"FAILPOINTS": "control-plane-client-re-attach=return(30000)"} + ) + + self.thread = threading.Thread(target=start_ps) + self.thread.start() + + def clear(self, env: NeonEnv): + if self.thread is not None: + self.thread.join() + + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.http_client().configure_failpoints(("control-plane-client-re-attach", "off")) + + def nodes(self): + return [self.pageserver_id] + + class PageserverFailpoint(Failure): - def __init__(self, failpoint, pageserver_id): + def __init__(self, failpoint, pageserver_id, offline_timeout, must_detect_after): self.failpoint = failpoint self.pageserver_id = pageserver_id + self.offline_timeout = offline_timeout + self.must_detect_after = must_detect_after def apply(self, env: NeonEnv): pageserver = env.get_pageserver(self.pageserver_id) @@ -987,15 +1025,28 @@ def build_node_to_tenants_map(env: NeonEnv) -> dict[int, list[TenantId]]: @pytest.mark.parametrize( "failure", [ - NodeStop(pageserver_ids=[1], immediate=False), - NodeStop(pageserver_ids=[1], immediate=True), - NodeStop(pageserver_ids=[1, 2], immediate=True), - PageserverFailpoint(pageserver_id=1, failpoint="get-utilization-http-handler"), + NodeStop(pageserver_ids=[1], immediate=False, offline_timeout=20, must_detect_after=5), + NodeStop(pageserver_ids=[1], immediate=True, offline_timeout=20, must_detect_after=5), + NodeStop(pageserver_ids=[1, 2], immediate=True, offline_timeout=20, must_detect_after=5), + PageserverFailpoint( + pageserver_id=1, + failpoint="get-utilization-http-handler", + offline_timeout=20, + must_detect_after=5, + ), + # Instrument a scenario where the node is slow to re-attach. The re-attach request itself + # should serve as a signal to the storage controller to use a more lenient heartbeat timeout. + NodeRestartWithSlowReattach(pageserver_id=1, offline_timeout=60, must_detect_after=15), ], ) def test_storage_controller_heartbeats( neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, failure: Failure ): + neon_env_builder.storage_controller_config = { + "max_offline": "10s", + "max_warming_up": "20s", + } + neon_env_builder.num_pageservers = 2 env = neon_env_builder.init_configs() env.start() @@ -1061,9 +1112,12 @@ def test_storage_controller_heartbeats( if node["id"] in offline_node_ids: assert node["availability"] == "Offline" - # A node is considered offline if the last successful heartbeat - # was more than 10 seconds ago (hardcoded in the storage controller). - wait_until(20, 1, nodes_offline) + start = time.time() + wait_until(failure.offline_timeout, 1, nodes_offline) + detected_after = time.time() - start + log.info(f"Detected node failures after {detected_after}s") + + assert detected_after >= failure.must_detect_after # .. expecting the tenant on the offline node to be migrated def tenant_migrated(): @@ -1546,7 +1600,13 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2 ) - env.storage_controller.poll_node_status(ps.id, "PauseForRestart", max_attempts=6, backoff=5) + env.storage_controller.poll_node_status( + ps.id, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.PAUSE_FOR_RESTART, + max_attempts=6, + backoff=5, + ) shard_counts = get_node_shard_counts(env, tenant_ids) log.info(f"Shard counts after draining node {ps.id}: {shard_counts}") @@ -1556,12 +1616,24 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): assert sum(shard_counts.values()) == total_shards ps.restart() - env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=10, backoff=1) + env.storage_controller.poll_node_status( + ps.id, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.ACTIVE, + max_attempts=10, + backoff=1, + ) env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2 ) - env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=6, backoff=5) + env.storage_controller.poll_node_status( + ps.id, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.ACTIVE, + max_attempts=6, + backoff=5, + ) shard_counts = get_node_shard_counts(env, tenant_ids) log.info(f"Shard counts after filling node {ps.id}: {shard_counts}") @@ -1606,11 +1678,23 @@ def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder): backoff=2, ) - env.storage_controller.poll_node_status(ps_id_to_drain, "Draining", max_attempts=6, backoff=2) + env.storage_controller.poll_node_status( + ps_id_to_drain, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.DRAINING, + max_attempts=6, + backoff=2, + ) env.storage_controller.cancel_node_drain(ps_id_to_drain) - env.storage_controller.poll_node_status(ps_id_to_drain, "Active", max_attempts=6, backoff=2) + env.storage_controller.poll_node_status( + ps_id_to_drain, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.ACTIVE, + max_attempts=6, + backoff=2, + ) @pytest.mark.parametrize("while_offline", [True, False]) @@ -1699,3 +1783,198 @@ def test_storage_controller_node_deletion( assert victim.id not in [n["id"] for n in env.storage_controller.node_list()] env.storage_controller.reconcile_all() # FIXME: workaround for optimizations happening on startup, see FIXME above. env.storage_controller.consistency_check() + + +@pytest.mark.parametrize("shard_count", [None, 2]) +def test_storage_controller_metadata_health( + neon_env_builder: NeonEnvBuilder, + shard_count: Optional[int], +): + """ + Create three tenants A, B, C. + + Phase 1: + - A: Post healthy status. + - B: Post unhealthy status. + - C: No updates. + + Phase 2: + - B: Post healthy status. + - C: Post healthy status. + + Phase 3: + - A: Post unhealthy status. + + Phase 4: + - Delete tenant A, metadata health status should be deleted as well. + """ + + def update_and_query_metadata_health( + env: NeonEnv, + healthy: List[TenantShardId], + unhealthy: List[TenantShardId], + outdated_duration: str = "1h", + ) -> Tuple[Set[str], Set[str]]: + """ + Update metadata health. Then list tenant shards with unhealthy and + outdated metadata health status. + """ + if healthy or unhealthy: + env.storage_controller.metadata_health_update(healthy, unhealthy) + result = env.storage_controller.metadata_health_list_unhealthy() + unhealthy_res = set(result["unhealthy_tenant_shards"]) + result = env.storage_controller.metadata_health_list_outdated(outdated_duration) + outdated_res = set(record["tenant_shard_id"] for record in result["health_records"]) + + return unhealthy_res, outdated_res + + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_start() + + # Mock tenant (`initial_tenant``) with healthy scrubber scan result + tenant_a_shard_ids = ( + env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=shard_count) + if shard_count is not None + else [TenantShardId(env.initial_tenant, 0, 0)] + ) + + # Mock tenant with unhealthy scrubber scan result + tenant_b, _ = env.neon_cli.create_tenant(shard_count=shard_count) + tenant_b_shard_ids = ( + env.storage_controller.tenant_shard_split(tenant_b, shard_count=shard_count) + if shard_count is not None + else [TenantShardId(tenant_b, 0, 0)] + ) + + # Mock tenant that never gets a health update from scrubber + tenant_c, _ = env.neon_cli.create_tenant(shard_count=shard_count) + + tenant_c_shard_ids = ( + env.storage_controller.tenant_shard_split(tenant_c, shard_count=shard_count) + if shard_count is not None + else [TenantShardId(tenant_c, 0, 0)] + ) + + # Metadata health table also updated as tenant shards are created. + assert env.storage_controller.metadata_health_is_healthy() + + # post "fake" updates to storage controller db + + unhealthy, outdated = update_and_query_metadata_health( + env, healthy=tenant_a_shard_ids, unhealthy=tenant_b_shard_ids + ) + + log.info(f"After Phase 1: {unhealthy=}, {outdated=}") + assert len(unhealthy) == len(tenant_b_shard_ids) + for t in tenant_b_shard_ids: + assert str(t) in unhealthy + assert len(outdated) == 0 + + unhealthy, outdated = update_and_query_metadata_health( + env, healthy=tenant_b_shard_ids + tenant_c_shard_ids, unhealthy=[] + ) + + log.info(f"After Phase 2: {unhealthy=}, {outdated=}") + assert len(unhealthy) == 0 + assert len(outdated) == 0 + + unhealthy, outdated = update_and_query_metadata_health( + env, healthy=[], unhealthy=tenant_a_shard_ids + ) + + log.info(f"After Phase 3: {unhealthy=}, {outdated=}") + assert len(unhealthy) == len(tenant_a_shard_ids) + for t in tenant_a_shard_ids: + assert str(t) in unhealthy + assert len(outdated) == 0 + + # Phase 4: Delete A + env.storage_controller.pageserver_api().tenant_delete(env.initial_tenant) + + # A's unhealthy metadata health status should be deleted as well. + assert env.storage_controller.metadata_health_is_healthy() + + # All shards from B and C are not fresh if set outdated duration to 0 seconds. + unhealthy, outdated = update_and_query_metadata_health( + env, healthy=[], unhealthy=tenant_a_shard_ids, outdated_duration="0s" + ) + assert len(unhealthy) == 0 + for t in tenant_b_shard_ids + tenant_c_shard_ids: + assert str(t) in outdated + + +def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder): + """ + Test the `/control/v1/step_down` storage controller API. Upon receiving such + a request, the storage controller cancels any on-going reconciles and replies + with 503 to all requests apart from `/control/v1/step_down`, `/status` and `/metrics`. + """ + env = neon_env_builder.init_configs() + env.start() + + tid = TenantId.generate() + tsid = str(TenantShardId(tid, shard_number=0, shard_count=0)) + env.storage_controller.tenant_create(tid) + + env.storage_controller.reconcile_until_idle() + env.storage_controller.configure_failpoints(("sleep-on-reconcile-epilogue", "return(10000)")) + + # Make a change to the tenant config to trigger a slow reconcile + virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) + virtual_ps_http.patch_tenant_config_client_side(tid, {"compaction_threshold": 5}, None) + env.storage_controller.allowed_errors.append( + ".*Accepted configuration update but reconciliation failed.*" + ) + + observed_state = env.storage_controller.step_down() + log.info(f"Storage controller stepped down with {observed_state=}") + + # Validate that we waited for the slow reconcile to complete + # and updated the observed state in the storcon before stepping down. + node_id = str(env.pageserver.id) + assert tsid in observed_state + assert node_id in observed_state[tsid]["locations"] + assert "conf" in observed_state[tsid]["locations"][node_id] + assert "tenant_conf" in observed_state[tsid]["locations"][node_id]["conf"] + + tenant_conf = observed_state[tsid]["locations"][node_id]["conf"]["tenant_conf"] + assert "compaction_threshold" in tenant_conf + assert tenant_conf["compaction_threshold"] == 5 + + # Validate that we propagated the change to the pageserver + ps_tenant_conf = env.pageserver.http_client().tenant_config(tid) + assert "compaction_threshold" in ps_tenant_conf.effective_config + assert ps_tenant_conf.effective_config["compaction_threshold"] == 5 + + # Validate that the storcon is not replying to the usual requests + # once it has stepped down. + with pytest.raises(StorageControllerApiException, match="stepped_down"): + env.storage_controller.tenant_list() + + # Validate that we can step down multiple times and the observed state + # doesn't change. + observed_state_again = env.storage_controller.step_down() + assert observed_state == observed_state_again + + assert ( + env.storage_controller.get_metric_value( + "storage_controller_leadership_status", filter={"status": "leader"} + ) + == 0 + ) + + assert ( + env.storage_controller.get_metric_value( + "storage_controller_leadership_status", filter={"status": "stepped_down"} + ) + == 1 + ) + + assert ( + env.storage_controller.get_metric_value( + "storage_controller_leadership_status", filter={"status": "candidate"} + ) + == 0 + ) diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 7c411a6b84..388f6a9e92 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -13,6 +13,7 @@ from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, ) +from fixtures.pg_version import PgVersion from fixtures.remote_storage import S3Storage, s3_storage from fixtures.utils import wait_until from fixtures.workload import Workload @@ -191,7 +192,9 @@ def test_scrubber_physical_gc_ancestors( "checkpoint_distance": f"{1024 * 1024}", "compaction_threshold": "1", "compaction_target_size": f"{1024 * 1024}", - "image_creation_threshold": "2", + # Disable automatic creation of image layers, as future image layers can result in layers in S3 that + # aren't referenced by children, earlier than the test expects such layers to exist + "image_creation_threshold": "9999", "image_layer_creation_check_threshold": "0", # Disable background compaction, we will do it explicitly "compaction_period": "0s", @@ -241,7 +244,7 @@ def test_scrubber_physical_gc_ancestors( workload.churn_rows(100) for shard in shards: ps = env.get_tenant_pageserver(shard) - ps.http_client().timeline_compact(shard, timeline_id) + ps.http_client().timeline_compact(shard, timeline_id, force_image_layer_creation=True) ps.http_client().timeline_gc(shard, timeline_id, 0) # We will use a min_age_secs=1 threshold for deletion, let it pass @@ -263,10 +266,85 @@ def test_scrubber_physical_gc_ancestors( # attach it, to drop any local state, then check it's still readable. workload.stop() drop_local_state(env, tenant_id) - workload.validate() +def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder): + """ + When we delete a timeline after a shard split, the child shards do not directly delete the + layers in the ancestor shards. They rely on the scrubber to clean up. + """ + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.num_pageservers = 2 + + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant( + tenant_id, + timeline_id, + shard_count=None, + conf={ + # Small layers and low compaction thresholds, so that when we split we can expect some to + # be dropped by child shards + "checkpoint_distance": f"{1024 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{1024 * 1024}", + "image_creation_threshold": "2", + "image_layer_creation_check_threshold": "0", + # Disable background compaction, we will do it explicitly + "compaction_period": "0s", + # No PITR, so that as soon as child shards generate an image layer, it covers ancestor deltas + # and makes them GC'able + "pitr_interval": "0s", + }, + ) + + # Make sure the original shard has some layers + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(100) + + new_shard_count = 4 + shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count) + + # Create a second timeline so that when we delete the first one, child shards still have some content in S3. + # + # This is a limitation of the scrubber: if a shard isn't in S3 (because it has no timelines), then the scrubber + # doesn't know about it, and won't perceive its ancestors as ancestors. + other_timeline_id = TimelineId.generate() + env.storage_controller.pageserver_api().timeline_create( + PgVersion.NOT_SET, tenant_id, other_timeline_id + ) + + # Write after split so that child shards have some indices in S3 + workload.write_rows(100, upload=False) + for shard in shards: + ps = env.get_tenant_pageserver(shard) + log.info(f"Waiting for shard {shard} on pageserver {ps.id}") + ps.http_client().timeline_checkpoint( + shard, timeline_id, compact=False, wait_until_uploaded=True + ) + + # The timeline still exists in child shards and they reference its layers, so scrubbing + # now shouldn't delete anything. + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + assert gc_summary["ancestor_layers_deleted"] == 0 + + # Delete the timeline + env.storage_controller.pageserver_api().timeline_delete(tenant_id, timeline_id) + + # Subsequently doing physical GC should clean up the ancestor layers + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + assert gc_summary["ancestor_layers_deleted"] > 0 + + def test_scrubber_physical_gc_ancestors_split(neon_env_builder: NeonEnvBuilder): """ Exercise ancestor GC while a tenant is partly split: this test ensures that if we have some child shards @@ -438,9 +516,10 @@ def test_scrubber_scan_pageserver_metadata( assert len(index.layer_metadata) > 0 it = iter(index.layer_metadata.items()) - scan_summary = env.storage_scrubber.scan_metadata() - assert not scan_summary["with_warnings"] - assert not scan_summary["with_errors"] + healthy, scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True) + assert healthy + + assert env.storage_controller.metadata_health_is_healthy() # Delete a layer file that is listed in the index. layer, metadata = next(it) @@ -451,7 +530,19 @@ def test_scrubber_scan_pageserver_metadata( ) log.info(f"delete response: {delete_response}") - # Check scan summary. Expect it to be a L0 layer so only emit warnings. - scan_summary = env.storage_scrubber.scan_metadata() + # Check scan summary without posting to storage controller. Expect it to be a L0 layer so only emit warnings. + _, scan_summary = env.storage_scrubber.scan_metadata() log.info(f"{pprint.pformat(scan_summary)}") assert len(scan_summary["with_warnings"]) > 0 + + assert env.storage_controller.metadata_health_is_healthy() + + # Now post to storage controller, expect seeing one unhealthy health record + _, scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True) + log.info(f"{pprint.pformat(scan_summary)}") + assert len(scan_summary["with_warnings"]) > 0 + + unhealthy = env.storage_controller.metadata_health_list_unhealthy()["unhealthy_tenant_shards"] + assert len(unhealthy) == 1 and unhealthy[0] == str(tenant_shard_id) + + neon_env_builder.disable_scrub_on_exit() diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 6d20b3d0de..c01b3a2e89 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -315,6 +315,9 @@ def test_tenant_delete_races_timeline_creation( # Zero tenants remain (we deleted the default tenant) assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0 + # We deleted our only tenant, and the scrubber fails if it detects nothing + neon_env_builder.disable_scrub_on_exit() + def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder): """ @@ -338,13 +341,13 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder) wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn) env.stop() - result = env.storage_scrubber.scan_metadata() - assert result["with_warnings"] == [] + healthy, _ = env.storage_scrubber.scan_metadata() + assert healthy env.start() ps_http = env.pageserver.http_client() ps_http.tenant_delete(tenant_id) env.stop() - env.storage_scrubber.scan_metadata() - assert result["with_warnings"] == [] + healthy, _ = env.storage_scrubber.scan_metadata() + assert healthy diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py index b62398d427..840c7159ad 100644 --- a/test_runner/regress/test_threshold_based_eviction.py +++ b/test_runner/regress/test_threshold_based_eviction.py @@ -48,13 +48,12 @@ def test_threshold_based_eviction( tenant_id, timeline_id = env.initial_tenant, env.initial_timeline ps_http = env.pageserver.http_client() - assert ps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == { - "kind": "NoEviction" - } + vps_http = env.storage_controller.pageserver_api() + assert vps_http.tenant_config(tenant_id).effective_config["eviction_policy"] is None eviction_threshold = 10 eviction_period = 2 - ps_http.set_tenant_config( + vps_http.set_tenant_config( tenant_id, { "eviction_policy": { @@ -64,7 +63,7 @@ def test_threshold_based_eviction( }, }, ) - assert ps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == { + assert vps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == { "kind": "LayerAccessThreshold", "threshold": f"{eviction_threshold}s", "period": f"{eviction_period}s", @@ -73,7 +72,7 @@ def test_threshold_based_eviction( # restart because changing tenant config is not instant env.pageserver.restart() - assert ps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == { + assert vps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == { "kind": "LayerAccessThreshold", "threshold": f"{eviction_threshold}s", "period": f"{eviction_period}s", @@ -81,7 +80,7 @@ def test_threshold_based_eviction( # create a bunch of L1s, only the least of which will need to be resident compaction_threshold = 3 # create L1 layers quickly - ps_http.patch_tenant_config_client_side( + vps_http.patch_tenant_config_client_side( tenant_id, inserts={ # Disable gc and compaction to avoid on-demand downloads from their side. @@ -154,7 +153,7 @@ def test_threshold_based_eviction( while time.time() - started_waiting_at < observation_window: current = ( time.time(), - MapInfoProjection(ps_http.layer_map_info(tenant_id, timeline_id)), + MapInfoProjection(vps_http.layer_map_info(tenant_id, timeline_id)), ) last = map_info_changes[-1] if map_info_changes else (0, None) if last[1] is None or current[1] != last[1]: diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index da37f469b3..6d96dda391 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -485,6 +485,9 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage), ) + # We deleted our only tenant, and the scrubber fails if it detects nothing + neon_env_builder.disable_scrub_on_exit() + @pytest.mark.parametrize( "stuck_failpoint", @@ -703,6 +706,9 @@ def test_timeline_delete_works_for_remote_smoke( # Assume it is mock server inconsistency and check twice. wait_until(2, 0.5, lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage)) + # We deleted our only tenant, and the scrubber fails if it detects nothing + neon_env_builder.disable_scrub_on_exit() + def test_delete_orphaned_objects( neon_env_builder: NeonEnvBuilder, diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 38f8dfa885..b3767a2766 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -165,7 +165,7 @@ def test_ancestor_detach_branched_from( ) all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id) - assert all_reparented == [] + assert all_reparented == set() if restart_after: env.pageserver.stop() @@ -534,7 +534,7 @@ def test_compaction_induced_by_detaches_in_history( for _, timeline_id in skip_main: reparented = client.detach_ancestor(env.initial_tenant, timeline_id) - assert reparented == [], "we have no earlier branches at any level" + assert reparented == set(), "we have no earlier branches at any level" post_detach_l0s = list(filter(lambda x: x.l0, delta_layers(branch_timeline_id))) assert len(post_detach_l0s) == 5, "should had inherited 4 L0s, have 5 in total" @@ -774,7 +774,7 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): else: break - assert reparented == [], "too many retries (None) or unexpected reparentings" + assert reparented == set(), "too many retries (None) or unexpected reparentings" for shard_info in shards: node_id = int(shard_info["node_id"]) diff --git a/test_runner/regress/test_timeline_gc_blocking.py b/test_runner/regress/test_timeline_gc_blocking.py new file mode 100644 index 0000000000..24de894687 --- /dev/null +++ b/test_runner/regress/test_timeline_gc_blocking.py @@ -0,0 +1,67 @@ +import time + +from fixtures.neon_fixtures import ( + NeonEnvBuilder, +) +from fixtures.pageserver.utils import wait_timeline_detail_404 + + +def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start( + initial_tenant_conf={"gc_period": "1s", "lsn_lease_length": "0s"} + ) + ps = env.pageserver + http = ps.http_client() + + foo_branch = env.neon_cli.create_branch("foo", "main", env.initial_tenant) + + gc_active_line = ".* gc_loop.*: [12] timelines need GC" + gc_skipped_line = ".* gc_loop.*: Skipping GC: .*" + init_gc_skipped = ".*: initialized with gc blocked.*" + + tenant_before = http.tenant_status(env.initial_tenant) + + wait_for_another_gc_round() + _, offset = ps.assert_log_contains(gc_active_line) + + assert ps.log_contains(gc_skipped_line, offset) is None + + http.timeline_block_gc(env.initial_tenant, foo_branch) + + tenant_after = http.tenant_status(env.initial_tenant) + assert tenant_before != tenant_after + gc_blocking = tenant_after["gc_blocking"] + assert gc_blocking == "BlockingReasons { timelines: 1, reasons: EnumSet(Manual) }" + + wait_for_another_gc_round() + _, offset = ps.assert_log_contains(gc_skipped_line, offset) + + ps.restart() + ps.quiesce_tenants() + + _, offset = env.pageserver.assert_log_contains(init_gc_skipped, offset) + + wait_for_another_gc_round() + _, offset = ps.assert_log_contains(gc_skipped_line, offset) + + # deletion unblocks gc + http.timeline_delete(env.initial_tenant, foo_branch) + wait_timeline_detail_404(http, env.initial_tenant, foo_branch, 10, 1.0) + + wait_for_another_gc_round() + _, offset = ps.assert_log_contains(gc_active_line, offset) + + http.timeline_block_gc(env.initial_tenant, env.initial_timeline) + + wait_for_another_gc_round() + _, offset = ps.assert_log_contains(gc_skipped_line, offset) + + # removing the manual block also unblocks gc + http.timeline_unblock_gc(env.initial_tenant, env.initial_timeline) + + wait_for_another_gc_round() + _, offset = ps.assert_log_contains(gc_active_line, offset) + + +def wait_for_another_gc_round(): + time.sleep(2) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 5e9a42f6b4..1f220eec9e 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -936,6 +936,9 @@ def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder): tenant_id = env.initial_tenant timeline_id = env.initial_timeline + # just make sure this doesn't hit an assertion + client.timeline_detail(tenant_id, timeline_id, force_await_initial_logical_size=True) + # load in some data endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) endpoint.safe_psql_many( diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index dbd0e6428b..7bbe834c8c 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit dbd0e6428b9274d72a10ac29bd3e3162faf109d4 +Subproject commit 7bbe834c8c2dc37802eca8484311599bc47341f6 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 035b73a9c5..9eba7dd382 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 035b73a9c5998f9a0ef35cc8df1bae680bf770fc +Subproject commit 9eba7dd382606ffca43aca865f337ec21bcdac73 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index b39f316137..5377f5ed72 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit b39f316137fdd29e2da15d2af2fdd1cfd18163be +Subproject commit 5377f5ed7290af45b7cb6b0d98d43cbf4a4e77f3 diff --git a/vendor/revisions.json b/vendor/revisions.json index eeebd646f5..570dfc1550 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "v16": ["16.3", "b39f316137fdd29e2da15d2af2fdd1cfd18163be"], - "v15": ["15.7", "035b73a9c5998f9a0ef35cc8df1bae680bf770fc"], - "v14": ["14.12", "dbd0e6428b9274d72a10ac29bd3e3162faf109d4"] + "v16": ["16.3", "5377f5ed7290af45b7cb6b0d98d43cbf4a4e77f3"], + "v15": ["15.7", "9eba7dd382606ffca43aca865f337ec21bcdac73"], + "v14": ["14.12", "7bbe834c8c2dc37802eca8484311599bc47341f6"] } diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 224e9847f3..7d005c7139 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -277,8 +277,12 @@ files: help: 'Bytes between received and replayed LSN' key_labels: values: [replication_delay_bytes] + # We use a GREATEST call here because this calculation can be negative. + # The calculation is not atomic, meaning after we've gotten the receive + # LSN, the replay LSN may have advanced past the receive LSN we + # are using for the calculation. query: | - SELECT pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn()) AS replication_delay_bytes; + SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes; - metric_name: replication_delay_seconds type: gauge @@ -404,7 +408,7 @@ files: x::text as duration_seconds, neon.approximate_working_set_size_seconds(x) as size from - (select generate_series * 60 as x from generate_series(1, 60)); + (select generate_series * 60 as x from generate_series(1, 60)) as t (x); build: | # Build cgroup-tools #