diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 37983798b7..a5282876d0 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -8,6 +8,9 @@ self-hosted-runner: - small-arm64 - us-east-2 config-variables: + - BENCHMARK_PROJECT_ID_PUB + - BENCHMARK_PROJECT_ID_SUB - REMOTE_STORAGE_AZURE_CONTAINER - REMOTE_STORAGE_AZURE_REGION - SLACK_UPCOMING_RELEASE_CHANNEL_ID + - DEV_AWS_OIDC_ROLE_ARN diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index f7ea534fb9..6f80d6e431 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -56,6 +56,10 @@ concurrency: jobs: bench: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} + permissions: + contents: write + statuses: write + id-token: write # Required for OIDC authentication in azure runners strategy: fail-fast: false matrix: @@ -63,9 +67,13 @@ jobs: - DEFAULT_PG_VERSION: 16 PLATFORM: "neon-staging" region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} + RUNNER: [ self-hosted, us-east-2, x64 ] + IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned - DEFAULT_PG_VERSION: 16 PLATFORM: "azure-staging" region_id: 'azure-eastus2' + RUNNER: [ self-hosted, eastus2, x64 ] + IMAGE: neondatabase/build-tools:pinned env: TEST_PG_BENCH_DURATIONS_MATRIX: "300" TEST_PG_BENCH_SCALES_MATRIX: "10,100" @@ -76,14 +84,21 @@ jobs: SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.PLATFORM }} - runs-on: [ self-hosted, us-east-2, x64 ] + runs-on: ${{ matrix.RUNNER }} container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + image: ${{ matrix.IMAGE }} options: --init steps: - uses: actions/checkout@v4 + - name: Configure AWS credentials # necessary on Azure runners + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + 
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours + - name: Download Neon artifact uses: ./.github/actions/download with: @@ -147,7 +162,7 @@ jobs: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 14 + DEFAULT_PG_VERSION: 16 TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} @@ -161,6 +176,7 @@ jobs: steps: - uses: actions/checkout@v4 + - name: Download Neon artifact uses: ./.github/actions/download with: @@ -168,7 +184,7 @@ jobs: path: /tmp/neon/ prefix: latest - - name: Run benchmark + - name: Run Logical Replication benchmarks uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} @@ -176,12 +192,15 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 5400 + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} + BENCHMARK_PROJECT_ID_PUB: ${{ vars.BENCHMARK_PROJECT_ID_PUB }} + BENCHMARK_PROJECT_ID_SUB: ${{ vars.BENCHMARK_PROJECT_ID_SUB }} - - name: Run benchmark + - name: Run Physical Replication benchmarks uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} @@ -234,6 +253,9 @@ jobs: id: pgbench-compare-matrix run: | region_id_default=${{ env.DEFAULT_REGION_ID }} + runner_default='["self-hosted", "us-east-2", "x64"]' + runner_azure='["self-hosted", "eastus2", "x64"]' + image_default="369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned" matrix='{ "pg_version" : [ 16 @@ -247,16 +269,19 @@ jobs: "neonvm-captest-new" ], "db_size": [ "10gb" ], - "include": [{ "pg_version": 16, 
"region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" }, - { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb" }, - { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" }, - { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb" }, - { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb" }, - { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }] + "runner": ['"$runner_default"'], + "image": [ "'"$image_default"'" ], + "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": 
'"$runner_default"', "image": "'"$image_default"'" }] }' if [ "$(date +%A)" = "Saturday" ]; then - matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb"}]') + matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -299,6 +324,10 @@ jobs: pgbench-compare: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} needs: [ generate-matrices ] + permissions: + contents: write + statuses: write + id-token: write # Required for OIDC authentication in azure runners strategy: fail-fast: false @@ -314,9 +343,9 @@ jobs: SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.platform }} - runs-on: [ self-hosted, us-east-2, x64 ] + runs-on: ${{ matrix.runner }} container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + image: ${{ matrix.image }} options: --init # Increase timeout to 8h, default timeout is 6h @@ -325,6 +354,13 @@ jobs: steps: - uses: actions/checkout@v4 + - name: Configure AWS credentials # necessary on Azure runners + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours + - name: Download Neon artifact uses: ./.github/actions/download with: @@ -432,12 +468,20 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} pgbench-pgvector: + permissions: + contents: write + statuses: write + id-token: write # Required for OIDC authentication in azure runners strategy: fail-fast: false matrix: include: - PLATFORM: "neonvm-captest-pgvector" + RUNNER: [ self-hosted, 
us-east-2, x64 ] + IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned - PLATFORM: "azure-captest-pgvector" + RUNNER: [ self-hosted, eastus2, x64 ] + IMAGE: neondatabase/build-tools:pinned env: TEST_PG_BENCH_DURATIONS_MATRIX: "15m" @@ -450,9 +494,9 @@ jobs: SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.PLATFORM }} - runs-on: [ self-hosted, us-east-2, x64 ] + runs-on: ${{ matrix.RUNNER }} container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + image: ${{ matrix.IMAGE }} options: --init steps: @@ -463,12 +507,12 @@ jobs: - name: Install postgresql-16 where pytest expects it run: | cd /home/nonroot - wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.3-1.pgdg110%2B1_amd64.deb - wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.3-1.pgdg110%2B1_amd64.deb - wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.3-1.pgdg110%2B1_amd64.deb - dpkg -x libpq5_16.3-1.pgdg110+1_amd64.deb pg - dpkg -x postgresql-client-16_16.3-1.pgdg110+1_amd64.deb pg - dpkg -x postgresql-16_16.3-1.pgdg110+1_amd64.deb pg + wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.4-1.pgdg110%2B1_amd64.deb + wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.4-1.pgdg110%2B1_amd64.deb + wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.4-1.pgdg110%2B1_amd64.deb + dpkg -x libpq5_16.4-1.pgdg110+1_amd64.deb pg + dpkg -x postgresql-client-16_16.4-1.pgdg110+1_amd64.deb pg + dpkg -x postgresql-16_16.4-1.pgdg110+1_amd64.deb pg mkdir -p /tmp/neon/pg_install/v16/bin ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql @@ 
-493,6 +537,13 @@ jobs: esac echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT + + - name: Configure AWS credentials # necessary on Azure runners to read/write from/to S3 + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours - name: Benchmark pgvector hnsw indexing uses: ./.github/actions/run-python-test-set @@ -521,7 +572,7 @@ jobs: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" - + - name: Create Allure report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index d4870e16ad..2ee66cfdc1 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -149,8 +149,6 @@ jobs: env: BUILD_TYPE: release - # remove the cachepot wrapper and build without crate caches - RUSTC_WRAPPER: "" # build with incremental compilation produce partial results # so do not attempt to cache this build, also disable the incremental compilation CARGO_INCREMENTAL: 0 diff --git a/.github/workflows/pg-clients.yml b/.github/workflows/pg-clients.yml index 55b68ccdb5..23a2e3876c 100644 --- a/.github/workflows/pg-clients.yml +++ b/.github/workflows/pg-clients.yml @@ -66,7 +66,31 @@ jobs: ports: - 9000:9000 - 8123:8123 - + zookeeper: + image: quay.io/debezium/zookeeper:2.7 + ports: + - 2181:2181 + kafka: + image: quay.io/debezium/kafka:2.7 + env: + ZOOKEEPER_CONNECT: "zookeeper:2181" + KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092 + KAFKA_BROKER_ID: 1 + KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 + KAFKA_JMX_PORT: 9991 + ports: + - 9092:9092 + debezium: + image: quay.io/debezium/connect:2.7 + env: + BOOTSTRAP_SERVERS: kafka:9092 + GROUP_ID: 1 + CONFIG_STORAGE_TOPIC: debezium-config + 
OFFSET_STORAGE_TOPIC: debezium-offset + STATUS_STORAGE_TOPIC: debezium-status + DEBEZIUM_CONFIG_CONNECTOR_CLASS: io.debezium.connector.postgresql.PostgresConnector + ports: + - 8083:8083 steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index 024594532f..2e79498fc4 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -7,12 +7,20 @@ on: description: 'Source tag' required: true type: string + force: + description: 'Force the image to be pinned' + default: false + type: boolean workflow_call: inputs: from-tag: description: 'Source tag' required: true type: string + force: + description: 'Force the image to be pinned' + default: false + type: boolean defaults: run: @@ -22,15 +30,18 @@ concurrency: group: pin-build-tools-image-${{ inputs.from-tag }} cancel-in-progress: false +# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. permissions: {} -jobs: - tag-image: - runs-on: ubuntu-22.04 +env: + FROM_TAG: ${{ inputs.from-tag }} + TO_TAG: pinned - env: - FROM_TAG: ${{ inputs.from-tag }} - TO_TAG: pinned +jobs: + check-manifests: + runs-on: ubuntu-22.04 + outputs: + skip: ${{ steps.check-manifests.outputs.skip }} steps: - name: Check if we really need to pin the image @@ -47,27 +58,44 @@ jobs: echo "skip=${skip}" | tee -a $GITHUB_OUTPUT + tag-image: + needs: check-manifests + + # use format(..) 
to catch both inputs.force = true AND inputs.force = 'true' + if: needs.check-manifests.outputs.skip == 'false' || format('{0}', inputs.force) == 'true' + + runs-on: ubuntu-22.04 + + permissions: + id-token: write # for `azure/login` + + steps: - uses: docker/login-action@v3 - if: steps.check-manifests.outputs.skip == 'false' + with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub - if: steps.check-manifests.outputs.skip == 'false' - run: | - docker buildx imagetools create -t neondatabase/build-tools:${TO_TAG} \ - neondatabase/build-tools:${FROM_TAG} - - uses: docker/login-action@v3 - if: steps.check-manifests.outputs.skip == 'false' with: registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com username: ${{ secrets.AWS_ACCESS_KEY_DEV }} password: ${{ secrets.AWS_SECRET_KEY_DEV }} - - name: Tag build-tools with `${{ env.TO_TAG }}` in ECR - if: steps.check-manifests.outputs.skip == 'false' + - name: Azure login + uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1 + with: + client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }} + tenant-id: ${{ secrets.AZURE_TENANT_ID }} + subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }} + + - name: Login to ACR + run: | + az acr login --name=neoneastus2 + + - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub, ECR, and ACR run: | docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \ + -t neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG} \ + -t neondatabase/build-tools:${TO_TAG} \ neondatabase/build-tools:${FROM_TAG} diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index 77928a343e..6fbe785c56 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -13,8 +13,6 @@ defaults: env: # A concurrency group that we use for e2e-tests runs, matches 
`concurrency.group` above with `github.repository` as a prefix E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} jobs: cancel-previous-e2e-tests: @@ -64,19 +62,35 @@ jobs: needs: [ tag ] runs-on: ubuntu-22.04 env: + EVENT_ACTION: ${{ github.event.action }} + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} TAG: ${{ needs.tag.outputs.build-tag }} steps: - - name: check if ecr image are present - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + - name: Wait for `promote-images` job to finish + # It's important to have a timeout here, the script in the step can run infinitely + timeout-minutes: 60 run: | - for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do - OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text) - if [ "$OUTPUT" == "" ]; then - echo "$REPO with image tag $TAG not found" >> $GITHUB_OUTPUT - exit 1 - fi + if [ "${GITHUB_EVENT_NAME}" != "pull_request" ] || [ "${EVENT_ACTION}" != "ready_for_review" ]; then + exit 0 + fi + + # For PRs we use the run id as the tag + BUILD_AND_TEST_RUN_ID=${TAG} + while true; do + conclusion=$(gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '.jobs[] | select(.name == "promote-images") | .conclusion') + case "$conclusion" in + success) + break + ;; + failure | cancelled | skipped) + echo "The 'promote-images' job didn't succeed: '${conclusion}'. Exiting..." + exit 1 + ;; + *) + echo "The 'promote-images' hasn't succeed yet. Waiting..." 
+ sleep 60 + ;; + esac done - name: Set e2e-platforms diff --git a/Cargo.lock b/Cargo.lock index 764c0fbd30..031fae0f37 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3960,7 +3960,7 @@ dependencies = [ [[package]] name = "postgres" version = "0.19.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "bytes", "fallible-iterator", @@ -3973,7 +3973,7 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "base64 0.20.0", "byteorder", @@ -3992,7 +3992,7 @@ dependencies = [ [[package]] name = "postgres-types" version = "0.2.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "bytes", "fallible-iterator", @@ -4324,6 +4324,7 @@ dependencies = [ "tracing-opentelemetry", "tracing-subscriber", "tracing-utils", + "try-lock", "typed-json", "url", "urlencoding", @@ -6186,7 +6187,7 @@ dependencies = [ [[package]] name = "tokio-postgres" version = "0.7.7" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "async-trait", "byteorder", @@ -6563,9 +6564,9 @@ dependencies = [ [[package]] name = "try-lock" -version = "0.2.4" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" [[package]] name = "tungstenite" diff --git a/Cargo.toml b/Cargo.toml index af1c1dfc82..963841e340 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -184,6 +184,7 @@ tracing = "0.1" tracing-error = "0.2.0" tracing-opentelemetry = "0.21.0" tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } +try-lock = "0.2.5" twox-hash = { version = "1.6.3", default-features = false } typed-json = "0.1" url = "2.2" diff --git a/Dockerfile b/Dockerfile index ace112cccf..ceb1c7cb55 100644 --- a/Dockerfile +++ b/Dockerfile @@ -17,7 +17,7 @@ COPY --chown=nonroot pgxn pgxn COPY --chown=nonroot Makefile Makefile COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh -ENV BUILD_TYPE release +ENV BUILD_TYPE=release RUN set -e \ && mold -run make -j $(nproc) -s neon-pg-ext \ && rm -rf pg_install/build \ @@ -29,24 +29,12 @@ WORKDIR /home/nonroot ARG GIT_VERSION=local ARG BUILD_TAG -# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds. -# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations. 
-# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build -ARG RUSTC_WRAPPER=cachepot -ENV AWS_REGION=eu-central-1 -ENV CACHEPOT_S3_KEY_PREFIX=cachepot -ARG CACHEPOT_BUCKET=neon-github-dev -#ARG AWS_ACCESS_KEY_ID -#ARG AWS_SECRET_ACCESS_KEY - COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib COPY --chown=nonroot . . -# Show build caching stats to check if it was used in the end. -# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats. RUN set -e \ && PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \ --bin pg_sni_router \ @@ -58,8 +46,7 @@ RUN set -e \ --bin proxy \ --bin neon_local \ --bin storage_scrubber \ - --locked --release \ - && cachepot -s + --locked --release # Build final image # @@ -104,7 +91,7 @@ RUN mkdir -p /data/.neon/ && \ # When running a binary that links with libpq, default to using our most recent postgres version. Binaries # that want a particular postgres version will select it explicitly: this is just a default. 
-ENV LD_LIBRARY_PATH /usr/local/v16/lib +ENV LD_LIBRARY_PATH=/usr/local/v16/lib VOLUME ["/data"] @@ -112,5 +99,5 @@ USER neon EXPOSE 6400 EXPOSE 9898 -CMD /usr/local/bin/pageserver -D /data/.neon +CMD ["/usr/local/bin/pageserver", "-D", "/data/.neon"] diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index dfaab1cb2e..d6beb61369 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -58,7 +58,7 @@ RUN set -e \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* # protobuf-compiler (protoc) -ENV PROTOC_VERSION 25.1 +ENV PROTOC_VERSION=25.1 RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \ && unzip -q protoc.zip -d protoc \ && mv protoc/bin/protoc /usr/local/bin/protoc \ @@ -99,7 +99,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws && rm awscliv2.zip # Mold: A Modern Linker -ENV MOLD_VERSION v2.31.0 +ENV MOLD_VERSION=v2.33.0 RUN set -e \ && git clone https://github.com/rui314/mold.git \ && mkdir mold/build \ @@ -168,7 +168,7 @@ USER nonroot:nonroot WORKDIR /home/nonroot # Python -ENV PYTHON_VERSION=3.9.18 \ +ENV PYTHON_VERSION=3.9.19 \ PYENV_ROOT=/home/nonroot/.pyenv \ PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH RUN set -e \ @@ -192,9 +192,14 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.80.0 +ENV RUSTC_VERSION=1.80.1 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" +ARG RUSTFILT_VERSION=0.2.1 +ARG CARGO_HAKARI_VERSION=0.9.30 +ARG CARGO_DENY_VERSION=0.16.1 +ARG CARGO_HACK_VERSION=0.6.31 +ARG CARGO_NEXTEST_VERSION=0.9.72 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ chmod +x rustup-init && \ ./rustup-init 
-y --default-toolchain ${RUSTC_VERSION} && \ @@ -203,15 +208,13 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux . "$HOME/.cargo/env" && \ cargo --version && rustup --version && \ rustup component add llvm-tools-preview rustfmt clippy && \ - cargo install --git https://github.com/paritytech/cachepot && \ - cargo install rustfilt && \ - cargo install cargo-hakari && \ - cargo install cargo-deny --locked && \ - cargo install cargo-hack && \ - cargo install cargo-nextest && \ + cargo install rustfilt --version ${RUSTFILT_VERSION} && \ + cargo install cargo-hakari --version ${CARGO_HAKARI_VERSION} && \ + cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \ + cargo install cargo-hack --version ${CARGO_HACK_VERSION} && \ + cargo install cargo-nextest --version ${CARGO_NEXTEST_VERSION} && \ rm -rf /home/nonroot/.cargo/registry && \ rm -rf /home/nonroot/.cargo/git -ENV RUSTC_WRAPPER=cachepot # Show versions RUN whoami \ diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 5e53a55316..7acaf2f2fd 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -94,7 +94,7 @@ RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \ make clean && cp -R /sfcgal/* / -ENV PATH "/usr/local/pgsql/bin:$PATH" +ENV PATH="/usr/local/pgsql/bin:$PATH" RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \ echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \ @@ -411,7 +411,7 @@ FROM build-deps AS timescaledb-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION -ENV PATH "/usr/local/pgsql/bin:$PATH" +ENV PATH="/usr/local/pgsql/bin:$PATH" RUN case "${PG_VERSION}" in \ "v14" | "v15") \ @@ -444,7 +444,7 @@ FROM build-deps AS pg-hint-plan-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG 
PG_VERSION -ENV PATH "/usr/local/pgsql/bin:$PATH" +ENV PATH="/usr/local/pgsql/bin:$PATH" RUN case "${PG_VERSION}" in \ "v14") \ @@ -480,7 +480,7 @@ RUN case "${PG_VERSION}" in \ FROM build-deps AS pg-cron-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \ echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \ mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \ @@ -506,7 +506,7 @@ RUN apt-get update && \ libboost-system1.74-dev \ libeigen3-dev -ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \ echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \ mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \ @@ -546,7 +546,7 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar. FROM build-deps AS pg-uuidv7-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \ echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \ mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . 
&& \ @@ -563,7 +563,7 @@ RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz FROM build-deps AS pg-roaringbitmap-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \ echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \ mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \ @@ -580,7 +580,7 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4 FROM build-deps AS pg-semver-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \ echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \ mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . 
&& \ @@ -598,7 +598,7 @@ FROM build-deps AS pg-embedding-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in \ "v14" | "v15") \ export PG_EMBEDDING_VERSION=0.3.5 \ @@ -622,7 +622,7 @@ RUN case "${PG_VERSION}" in \ FROM build-deps AS pg-anon-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \ mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \ @@ -750,7 +750,7 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz - FROM build-deps AS wal2json-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \ echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \ mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \ @@ -766,7 +766,7 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar. FROM build-deps AS pg-ivm-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \ echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \ mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . 
&& \ @@ -783,7 +783,7 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_iv FROM build-deps AS pg-partman-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \ echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \ mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \ @@ -933,7 +933,8 @@ COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src #COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src -#COPY --from=rum-pg-build /rum.tar.gz /ext-src +COPY --from=rum-pg-build /rum.tar.gz /ext-src +COPY patches/rum.patch /ext-src #COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src COPY --from=prefix-pg-build /prefix.tar.gz /ext-src @@ -945,7 +946,7 @@ COPY patches/pg_hintplan.patch /ext-src COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src COPY patches/pg_cron.patch /ext-src #COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src -COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src +#COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src @@ -960,6 +961,7 @@ RUN cd /ext-src/ && for f in *.tar.gz; \ rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \ || exit 1; rm -f $f; done RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch +RUN cd /ext-src/rum-src && patch -p1 <../rum.patch # cmake is required for the h3 test RUN apt-get update && apt-get 
install -y cmake RUN patch -p1 < /ext-src/pg_hintplan.patch @@ -1032,6 +1034,6 @@ RUN apt update && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 -ENV LANG en_US.utf8 +ENV LANG=en_US.utf8 USER postgres ENTRYPOINT ["/usr/local/bin/compute_ctl"] diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 505d157efd..15bbac702f 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -158,6 +158,8 @@ pub struct NeonStorageControllerConf { /// Threshold for auto-splitting a tenant into shards pub split_threshold: Option, + + pub max_secondary_lag_bytes: Option, } impl NeonStorageControllerConf { @@ -173,6 +175,7 @@ impl Default for NeonStorageControllerConf { max_offline: Self::DEFAULT_MAX_OFFLINE_INTERVAL, max_warming_up: Self::DEFAULT_MAX_WARMING_UP_INTERVAL, split_threshold: None, + max_secondary_lag_bytes: None, } } } diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index e054e9ee57..f180e922e8 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -383,6 +383,10 @@ impl StorageController { args.push(format!("--split-threshold={split_threshold}")) } + if let Some(lag) = self.config.max_secondary_lag_bytes.as_ref() { + args.push(format!("--max-secondary-lag-bytes={lag}")) + } + args.push(format!( "--neon-local-repo-dir={}", self.env.base_data_dir.display() diff --git a/deny.toml b/deny.toml index 469609c496..dc985138e6 100644 --- a/deny.toml +++ b/deny.toml @@ -4,6 +4,7 @@ # to your expectations and requirements. 
# Root options +[graph] targets = [ { triple = "x86_64-unknown-linux-gnu" }, { triple = "aarch64-unknown-linux-gnu" }, @@ -12,6 +13,7 @@ targets = [ ] all-features = false no-default-features = false +[output] feature-depth = 1 # This section is considered when running `cargo deny check advisories` @@ -19,17 +21,13 @@ feature-depth = 1 # https://embarkstudios.github.io/cargo-deny/checks/advisories/cfg.html [advisories] db-urls = ["https://github.com/rustsec/advisory-db"] -vulnerability = "deny" -unmaintained = "warn" yanked = "warn" -notice = "warn" ignore = [] # This section is considered when running `cargo deny check licenses` # More documentation for the licenses section can be found here: # https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html [licenses] -unlicensed = "deny" allow = [ "Apache-2.0", "Artistic-2.0", @@ -42,10 +40,6 @@ allow = [ "OpenSSL", "Unicode-DFS-2016", ] -deny = [] -copyleft = "warn" -allow-osi-fsf-free = "neither" -default = "deny" confidence-threshold = 0.8 exceptions = [ # Zlib license has some restrictions if we decide to change sth diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index a00591afd0..10805a9952 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -78,7 +78,7 @@ for pg_version in 14 15 16; do docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/ rm -rf $TMPDIR # We are running tests now - if docker exec -e SKIP=rum-src,timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \ + if docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \ $TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt then cleanup diff --git a/docker-compose/run-tests.sh b/docker-compose/run-tests.sh index c05fc159aa..58b2581197 100644 --- 
a/docker-compose/run-tests.sh +++ b/docker-compose/run-tests.sh @@ -1,15 +1,15 @@ #!/bin/bash set -x -cd /ext-src +cd /ext-src || exit 2 FAILED= -LIST=$((echo ${SKIP} | sed 's/,/\n/g'; ls -d *-src) | sort | uniq -u) +LIST=$( (echo "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u) for d in ${LIST} do - [ -d ${d} ] || continue + [ -d "${d}" ] || continue psql -c "select 1" >/dev/null || break - make -C ${d} installcheck || FAILED="${d} ${FAILED}" + USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}" done [ -z "${FAILED}" ] && exit 0 -echo ${FAILED} +echo "${FAILED}" exit 1 \ No newline at end of file diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md index b275349168..5fd4080c28 100644 --- a/docs/SUMMARY.md +++ b/docs/SUMMARY.md @@ -1,13 +1,18 @@ # Summary +# Looking for `neon.tech` docs? + +This page linkes to a selection of technical content about the open source code in this repository. + +Please visit https://neon.tech/docs for documentation about using the Neon service, which is based on the code +in this repository. + +# Architecture + [Introduction]() - [Separation of Compute and Storage](./separation-compute-storage.md) -# Architecture - - [Compute]() - - [WAL proposer]() - - [WAL Backpressure]() - [Postgres changes](./core_changes.md) - [Pageserver](./pageserver.md) @@ -16,33 +21,15 @@ - [WAL Redo](./pageserver-walredo.md) - [Page cache](./pageserver-pagecache.md) - [Storage](./pageserver-storage.md) - - [Datadir mapping]() - - [Layer files]() - - [Branching]() - - [Garbage collection]() - - [Cloud Storage]() - [Processing a GetPage request](./pageserver-processing-getpage.md) - [Processing WAL](./pageserver-processing-wal.md) - - [Management API]() - - [Tenant Rebalancing]() - [WAL Service](walservice.md) - [Consensus protocol](safekeeper-protocol.md) - - [Management API]() - - [Rebalancing]() - -- [Control Plane]() - -- [Proxy]() - [Source view](./sourcetree.md) - [docker.md](./docker.md) — Docker images and building pipeline. 
- [Error handling and logging](./error-handling.md) - - [Testing]() - - [Unit testing]() - - [Integration testing]() - - [Benchmarks]() - - [Glossary](./glossary.md) @@ -58,28 +45,6 @@ # RFCs -- [RFCs](./rfcs/README.md) - -- [002-storage](rfcs/002-storage.md) -- [003-laptop-cli](rfcs/003-laptop-cli.md) -- [004-durability](rfcs/004-durability.md) -- [005-zenith_local](rfcs/005-zenith_local.md) -- [006-laptop-cli-v2-CLI](rfcs/006-laptop-cli-v2-CLI.md) -- [006-laptop-cli-v2-repository-structure](rfcs/006-laptop-cli-v2-repository-structure.md) -- [007-serverless-on-laptop](rfcs/007-serverless-on-laptop.md) -- [008-push-pull](rfcs/008-push-pull.md) -- [009-snapshot-first-storage-cli](rfcs/009-snapshot-first-storage-cli.md) -- [009-snapshot-first-storage](rfcs/009-snapshot-first-storage.md) -- [009-snapshot-first-storage-pitr](rfcs/009-snapshot-first-storage-pitr.md) -- [010-storage_details](rfcs/010-storage_details.md) -- [011-retention-policy](rfcs/011-retention-policy.md) -- [012-background-tasks](rfcs/012-background-tasks.md) -- [013-term-history](rfcs/013-term-history.md) -- [014-safekeepers-gossip](rfcs/014-safekeepers-gossip.md) -- [014-storage-lsm](rfcs/014-storage-lsm.md) -- [015-storage-messaging](rfcs/015-storage-messaging.md) -- [016-connection-routing](rfcs/016-connection-routing.md) -- [017-timeline-data-management](rfcs/017-timeline-data-management.md) -- [018-storage-messaging-2](rfcs/018-storage-messaging-2.md) -- [019-tenant-timeline-lifecycles](rfcs/019-tenant-timeline-lifecycles.md) -- [cluster-size-limits](rfcs/cluster-size-limits.md) +Major changes are documented in RFCS: +- See [RFCs](./rfcs/README.md) for more information +- view the RFCs at https://github.com/neondatabase/neon/tree/main/docs/rfcs diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 0acd83753e..3af3f74e9c 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -107,7 +107,10 @@ impl Key { /// As long as Neon does not 
support tablespace (because of lack of access to local file system), /// we can assume that only some predefined namespace OIDs are used which can fit in u16 pub fn to_i128(&self) -> i128 { - assert!(self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222); + assert!( + self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222, + "invalid key: {self}", + ); (((self.field1 & 0x7F) as i128) << 120) | (((self.field2 & 0xFFFF) as i128) << 104) | ((self.field3 as i128) << 72) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 591c45d908..ab4adfbebe 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -637,6 +637,13 @@ pub struct TenantInfo { pub current_physical_size: Option, // physical size is only included in `tenant_status` endpoint pub attachment_status: TenantAttachmentStatus, pub generation: u32, + + /// Opaque explanation if gc is being blocked. + /// + /// Only looked up for the individual tenant detail, not the listing. This is purely for + /// debugging, not included in openapi. + #[serde(skip_serializing_if = "Option::is_none")] + pub gc_blocking: Option, } #[derive(Serialize, Deserialize, Clone)] @@ -940,6 +947,8 @@ pub struct TopTenantShardsResponse { } pub mod virtual_file { + use std::path::PathBuf; + #[derive( Copy, Clone, @@ -958,6 +967,53 @@ pub mod virtual_file { #[cfg(target_os = "linux")] TokioEpollUring, } + + /// Direct IO modes for a pageserver. + #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)] + #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] + pub enum DirectIoMode { + /// Direct IO disabled (uses usual buffered IO). + #[default] + Disabled, + /// Direct IO disabled (performs checks and perf simulations). + Evaluate { + /// Alignment check level + alignment_check: DirectIoAlignmentCheckLevel, + /// Latency padded for performance simulation. 
+ latency_padding: DirectIoLatencyPadding, + }, + /// Direct IO enabled. + Enabled { + /// Actions to perform on alignment error. + on_alignment_error: DirectIoOnAlignmentErrorAction, + }, + } + + #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)] + #[serde(rename_all = "kebab-case")] + pub enum DirectIoAlignmentCheckLevel { + #[default] + Error, + Log, + None, + } + + #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)] + #[serde(rename_all = "kebab-case")] + pub enum DirectIoOnAlignmentErrorAction { + Error, + #[default] + FallbackToBuffered, + } + + #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)] + #[serde(tag = "type", rename_all = "kebab-case")] + pub enum DirectIoLatencyPadding { + /// Pad virtual file operations with IO to a fake file. + FakeFileRW { path: PathBuf }, + #[default] + None, + } } // Wrapped in libpq CopyData @@ -1427,6 +1483,7 @@ mod tests { current_physical_size: Some(42), attachment_status: TenantAttachmentStatus::Attached, generation: 1, + gc_blocking: None, }; let expected_active = json!({ "id": original_active.id.to_string(), @@ -1449,6 +1506,7 @@ mod tests { current_physical_size: Some(42), attachment_status: TenantAttachmentStatus::Attached, generation: 1, + gc_blocking: None, }; let expected_broken = json!({ "id": original_broken.id.to_string(), diff --git a/libs/pageserver_api/src/models/detach_ancestor.rs b/libs/pageserver_api/src/models/detach_ancestor.rs index ae5a21bab9..ad74d343ae 100644 --- a/libs/pageserver_api/src/models/detach_ancestor.rs +++ b/libs/pageserver_api/src/models/detach_ancestor.rs @@ -1,6 +1,8 @@ +use std::collections::HashSet; + use utils::id::TimelineId; #[derive(Debug, Default, PartialEq, serde::Serialize, serde::Deserialize)] pub struct AncestorDetached { - pub reparented_timelines: Vec, + pub reparented_timelines: HashSet, } diff --git a/libs/postgres_connection/src/lib.rs 
b/libs/postgres_connection/src/lib.rs index fdabcbacb2..9f57f3d507 100644 --- a/libs/postgres_connection/src/lib.rs +++ b/libs/postgres_connection/src/lib.rs @@ -144,7 +144,20 @@ impl PgConnectionConfig { // implement and this function is hardly a bottleneck. The function is only called around // establishing a new connection. #[allow(unstable_name_collisions)] - config.options(&encode_options(&self.options)); + config.options( + &self + .options + .iter() + .map(|s| { + if s.contains(['\\', ' ']) { + Cow::Owned(s.replace('\\', "\\\\").replace(' ', "\\ ")) + } else { + Cow::Borrowed(s.as_str()) + } + }) + .intersperse(Cow::Borrowed(" ")) // TODO: use impl from std once it's stabilized + .collect::(), + ); } config } @@ -165,21 +178,6 @@ impl PgConnectionConfig { } } -#[allow(unstable_name_collisions)] -fn encode_options(options: &[String]) -> String { - options - .iter() - .map(|s| { - if s.contains(['\\', ' ']) { - Cow::Owned(s.replace('\\', "\\\\").replace(' ', "\\ ")) - } else { - Cow::Borrowed(s.as_str()) - } - }) - .intersperse(Cow::Borrowed(" ")) // TODO: use impl from std once it's stabilized - .collect::() -} - impl fmt::Display for PgConnectionConfig { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { // The password is intentionally hidden and not part of this display string. 
@@ -208,7 +206,7 @@ impl fmt::Debug for PgConnectionConfig { #[cfg(test)] mod tests_pg_connection_config { - use crate::{encode_options, PgConnectionConfig}; + use crate::PgConnectionConfig; use once_cell::sync::Lazy; use url::Host; @@ -257,12 +255,18 @@ mod tests_pg_connection_config { #[test] fn test_with_options() { - let options = encode_options(&[ - "hello".to_owned(), - "world".to_owned(), - "with space".to_owned(), - "and \\ backslashes".to_owned(), + let cfg = PgConnectionConfig::new_host_port(STUB_HOST.clone(), 123).extend_options([ + "hello", + "world", + "with space", + "and \\ backslashes", ]); - assert_eq!(options, "hello world with\\ space and\\ \\\\\\ backslashes"); + assert_eq!(cfg.host(), &*STUB_HOST); + assert_eq!(cfg.port(), 123); + assert_eq!(cfg.raw_address(), "stub.host.example:123"); + assert_eq!( + cfg.to_tokio_postgres_config().get_options(), + Some("hello world with\\ space and\\ \\\\\\ backslashes") + ); } } diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index a46d68ef33..f4fc0ba57b 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -128,7 +128,7 @@ pub mod circuit_breaker; /// /// ############################################################################################# /// TODO this macro is not the way the library is intended to be used, see for details. -/// We use `cachepot` to reduce our current CI build times: +/// We used `cachepot` to reduce our current CI build times: /// Yet, it seems to ignore the GIT_VERSION env variable, passed to Docker build, even with build.rs that contains /// `println!("cargo:rerun-if-env-changed=GIT_VERSION");` code for cachepot cache invalidation. /// The problem needs further investigation and regular `const` declaration instead of a macro. 
diff --git a/libs/utils/src/sync/gate.rs b/libs/utils/src/sync/gate.rs index 156b99a010..16ec563fa7 100644 --- a/libs/utils/src/sync/gate.rs +++ b/libs/utils/src/sync/gate.rs @@ -78,8 +78,9 @@ impl Drop for GateGuard { } } -#[derive(Debug)] +#[derive(Debug, thiserror::Error)] pub enum GateError { + #[error("gate is closed")] GateClosed, } diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 43976250a4..0e748ee3db 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -108,3 +108,7 @@ harness = false [[bench]] name = "bench_walredo" harness = false + +[[bench]] +name = "bench_ingest" +harness = false diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs new file mode 100644 index 0000000000..9bab02e46c --- /dev/null +++ b/pageserver/benches/bench_ingest.rs @@ -0,0 +1,239 @@ +use std::{env, num::NonZeroUsize}; + +use bytes::Bytes; +use camino::Utf8PathBuf; +use criterion::{criterion_group, criterion_main, Criterion}; +use pageserver::{ + config::PageServerConf, + context::{DownloadBehavior, RequestContext}, + l0_flush::{L0FlushConfig, L0FlushGlobalState}, + page_cache, + repository::Value, + task_mgr::TaskKind, + tenant::storage_layer::InMemoryLayer, + virtual_file, +}; +use pageserver_api::{key::Key, shard::TenantShardId}; +use utils::{ + bin_ser::BeSer, + id::{TenantId, TimelineId}, +}; + +// A very cheap hash for generating non-sequential keys. 
+fn murmurhash32(mut h: u32) -> u32 { + h ^= h >> 16; + h = h.wrapping_mul(0x85ebca6b); + h ^= h >> 13; + h = h.wrapping_mul(0xc2b2ae35); + h ^= h >> 16; + h +} + +enum KeyLayout { + /// Sequential unique keys + Sequential, + /// Random unique keys + Random, + /// Random keys, but only use the bits from the mask of them + RandomReuse(u32), +} + +enum WriteDelta { + Yes, + No, +} + +async fn ingest( + conf: &'static PageServerConf, + put_size: usize, + put_count: usize, + key_layout: KeyLayout, + write_delta: WriteDelta, +) -> anyhow::Result<()> { + let mut lsn = utils::lsn::Lsn(1000); + let mut key = Key::from_i128(0x0); + + let timeline_id = TimelineId::generate(); + let tenant_id = TenantId::generate(); + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + + tokio::fs::create_dir_all(conf.timeline_path(&tenant_shard_id, &timeline_id)).await?; + + let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); + + let gate = utils::sync::gate::Gate::default(); + let entered = gate.enter().unwrap(); + + let layer = + InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?; + + let data = Value::Image(Bytes::from(vec![0u8; put_size])).ser()?; + let ctx = RequestContext::new( + pageserver::task_mgr::TaskKind::WalReceiverConnectionHandler, + pageserver::context::DownloadBehavior::Download, + ); + + for i in 0..put_count { + lsn += put_size as u64; + + // Generate lots of keys within a single relation, which simulates the typical bulk ingest case: people + // usually care the most about write performance when they're blasting a huge batch of data into a huge table. + match key_layout { + KeyLayout::Sequential => { + // Use sequential order to illustrate the experience a user is likely to have + // when ingesting bulk data. + key.field6 = i as u32; + } + KeyLayout::Random => { + // Use random-order keys to avoid giving a false advantage to data structures that are + // faster when inserting on the end. 
+ key.field6 = murmurhash32(i as u32); + } + KeyLayout::RandomReuse(mask) => { + // Use low bits only, to limit cardinality + key.field6 = murmurhash32(i as u32) & mask; + } + } + + layer.put_value(key, lsn, &data, &ctx).await?; + } + layer.freeze(lsn + 1).await; + + if matches!(write_delta, WriteDelta::Yes) { + let l0_flush_state = L0FlushGlobalState::new(L0FlushConfig::Direct { + max_concurrency: NonZeroUsize::new(1).unwrap(), + }); + let (_desc, path) = layer + .write_to_disk(&ctx, None, l0_flush_state.inner()) + .await? + .unwrap(); + tokio::fs::remove_file(path).await?; + } + + Ok(()) +} + +/// Wrapper to instantiate a tokio runtime +fn ingest_main( + conf: &'static PageServerConf, + put_size: usize, + put_count: usize, + key_layout: KeyLayout, + write_delta: WriteDelta, +) { + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + + runtime.block_on(async move { + let r = ingest(conf, put_size, put_count, key_layout, write_delta).await; + if let Err(e) = r { + panic!("{e:?}"); + } + }); +} + +/// Declare a series of benchmarks for the Pageserver's ingest write path. +/// +/// This benchmark does not include WAL decode: it starts at InMemoryLayer::put_value, and ends either +/// at freezing the ephemeral layer, or writing the ephemeral layer out to an L0 (depending on whether WriteDelta is set). +/// +/// Genuine disk I/O is used, so expect results to differ depending on storage. However, when running on +/// a fast disk, CPU is the bottleneck at time of writing. 
+fn criterion_benchmark(c: &mut Criterion) { + let temp_dir_parent: Utf8PathBuf = env::current_dir().unwrap().try_into().unwrap(); + let temp_dir = camino_tempfile::tempdir_in(temp_dir_parent).unwrap(); + eprintln!("Data directory: {}", temp_dir.path()); + + let conf: &'static PageServerConf = Box::leak(Box::new( + pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()), + )); + virtual_file::init(16384, virtual_file::io_engine_for_bench()); + page_cache::init(conf.page_cache_size); + + { + let mut group = c.benchmark_group("ingest-small-values"); + let put_size = 100usize; + let put_count = 128 * 1024 * 1024 / put_size; + group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64)); + group.sample_size(10); + group.bench_function("ingest 128MB/100b seq", |b| { + b.iter(|| { + ingest_main( + conf, + put_size, + put_count, + KeyLayout::Sequential, + WriteDelta::Yes, + ) + }) + }); + group.bench_function("ingest 128MB/100b rand", |b| { + b.iter(|| { + ingest_main( + conf, + put_size, + put_count, + KeyLayout::Random, + WriteDelta::Yes, + ) + }) + }); + group.bench_function("ingest 128MB/100b rand-1024keys", |b| { + b.iter(|| { + ingest_main( + conf, + put_size, + put_count, + KeyLayout::RandomReuse(0x3ff), + WriteDelta::Yes, + ) + }) + }); + group.bench_function("ingest 128MB/100b seq, no delta", |b| { + b.iter(|| { + ingest_main( + conf, + put_size, + put_count, + KeyLayout::Sequential, + WriteDelta::No, + ) + }) + }); + } + + { + let mut group = c.benchmark_group("ingest-big-values"); + let put_size = 8192usize; + let put_count = 128 * 1024 * 1024 / put_size; + group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64)); + group.sample_size(10); + group.bench_function("ingest 128MB/8k seq", |b| { + b.iter(|| { + ingest_main( + conf, + put_size, + put_count, + KeyLayout::Sequential, + WriteDelta::Yes, + ) + }) + }); + group.bench_function("ingest 128MB/8k seq, no delta", |b| { + b.iter(|| { + ingest_main( + 
conf, + put_size, + put_count, + KeyLayout::Sequential, + WriteDelta::No, + ) + }) + }); + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 5ebd6511ac..932918410c 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -123,6 +123,7 @@ fn main() -> anyhow::Result<()> { // after setting up logging, log the effective IO engine choice and read path implementations info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); + info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings"); info!(?conf.get_impl, "starting with get page implementation"); info!(?conf.get_vectored_impl, "starting with vectored get page implementation"); info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access"); diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 41c2fe0af3..f4c367bd4d 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -300,6 +300,9 @@ pub struct PageServerConf { /// This flag is temporary and will be removed after gradual rollout. /// See . 
pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess, + + /// Direct IO settings + pub virtual_file_direct_io: virtual_file::DirectIoMode, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -408,6 +411,8 @@ struct PageServerConfigBuilder { l0_flush: BuilderValue, compact_level0_phase1_value_access: BuilderValue, + + virtual_file_direct_io: BuilderValue, } impl PageServerConfigBuilder { @@ -498,6 +503,7 @@ impl PageServerConfigBuilder { ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), l0_flush: Set(L0FlushConfig::default()), compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()), + virtual_file_direct_io: Set(virtual_file::DirectIoMode::default()), } } } @@ -685,6 +691,10 @@ impl PageServerConfigBuilder { self.compact_level0_phase1_value_access = BuilderValue::Set(value); } + pub fn virtual_file_direct_io(&mut self, value: virtual_file::DirectIoMode) { + self.virtual_file_direct_io = BuilderValue::Set(value); + } + pub fn build(self, id: NodeId) -> anyhow::Result { let default = Self::default_values(); @@ -743,6 +753,7 @@ impl PageServerConfigBuilder { ephemeral_bytes_per_memory_kb, l0_flush, compact_level0_phase1_value_access, + virtual_file_direct_io, } CUSTOM LOGIC { @@ -1018,6 +1029,9 @@ impl PageServerConf { "compact_level0_phase1_value_access" => { builder.compact_level0_phase1_value_access(utils::toml_edit_ext::deserialize_item(item).context("compact_level0_phase1_value_access")?) } + "virtual_file_direct_io" => { + builder.virtual_file_direct_io(utils::toml_edit_ext::deserialize_item(item).context("virtual_file_direct_io")?) 
+ } _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -1103,6 +1117,7 @@ impl PageServerConf { ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, l0_flush: L0FlushConfig::default(), compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), + virtual_file_direct_io: virtual_file::DirectIoMode::default(), } } } @@ -1345,6 +1360,7 @@ background_task_maximum_delay = '334 s' ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, l0_flush: L0FlushConfig::default(), compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), + virtual_file_direct_io: virtual_file::DirectIoMode::default(), }, "Correct defaults should be used when no config values are provided" ); @@ -1420,6 +1436,7 @@ background_task_maximum_delay = '334 s' ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, l0_flush: L0FlushConfig::default(), compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), + virtual_file_direct_io: virtual_file::DirectIoMode::default(), }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 59e646d0ca..42086dc2e6 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -308,6 +308,45 @@ paths: application/json: schema: type: string + + /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/block_gc: + parameters: + - name: tenant_shard_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + format: hex + post: + description: Persistently add a gc blocking at the tenant level because of this timeline + responses: + "200": + description: OK + + /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/unblock_gc: + parameters: + - name: tenant_shard_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: 
path + required: true + schema: + type: string + format: hex + post: + description: Persistently remove a tenant level gc blocking for this timeline + responses: + "200": + description: OK + /v1/tenant/{tenant_shard_id}/location_config: parameters: - name: tenant_shard_id @@ -893,7 +932,7 @@ components: description: Whether to poll remote storage for layers to download. If false, secondary locations don't download anything. ArchivalConfigRequest: type: object - required + required: - state properties: state: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 117f2c5869..a983d8c4c2 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -935,6 +935,7 @@ async fn tenant_list_handler( generation: (*gen) .into() .expect("Tenants are always attached with a generation"), + gc_blocking: None, }) .collect::>(); @@ -986,6 +987,7 @@ async fn tenant_status( .generation() .into() .expect("Tenants are always attached with a generation"), + gc_blocking: tenant.gc_block.summary().map(|x| format!("{x:?}")), }, walredo: tenant.wal_redo_manager_status(), timelines: tenant.list_timeline_ids(), @@ -1160,7 +1162,10 @@ async fn layer_map_info_handler( let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; - let layer_map_info = timeline.layer_map_info(reset).await; + let layer_map_info = timeline + .layer_map_info(reset) + .await + .map_err(|_shutdown| ApiError::ShuttingDown)?; json_response(StatusCode::OK, layer_map_info) } @@ -1226,6 +1231,72 @@ async fn evict_timeline_layer_handler( } } +async fn timeline_gc_blocking_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + block_or_unblock_gc(request, true).await +} + +async fn timeline_gc_unblocking_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + block_or_unblock_gc(request, false).await +} + +/// Adding a block is `POST ../block_gc`, removing a block is 
`POST ../unblock_gc`. +/// +/// Both are technically unsafe because they might fire off index uploads, thus they are POST. +async fn block_or_unblock_gc( + request: Request, + block: bool, +) -> Result, ApiError> { + use crate::tenant::{ + remote_timeline_client::WaitCompletionError, upload_queue::NotInitialized, + }; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let state = get_state(&request); + + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + + let timeline = tenant.get_timeline(timeline_id, true)?; + + let fut = async { + if block { + timeline.block_gc(&tenant).await.map(|_| ()) + } else { + timeline.unblock_gc(&tenant).await + } + }; + + let span = tracing::info_span!( + "block_or_unblock_gc", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug(), + timeline_id = %timeline_id, + block = block, + ); + + let res = fut.instrument(span).await; + + res.map_err(|e| { + if e.is::() || e.is::() { + ApiError::ShuttingDown + } else { + ApiError::InternalServerError(e) + } + })?; + + json_response(StatusCode::OK, ()) +} + /// Get tenant_size SVG graph along with the JSON data. 
fn synthetic_size_html_response( inputs: ModelInputs, @@ -2904,6 +2975,14 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name", |r| api_handler(r, evict_timeline_layer_handler), ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/block_gc", + |r| api_handler(r, timeline_gc_blocking_handler), + ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/unblock_gc", + |r| api_handler(r, timeline_gc_unblocking_handler), + ) .post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| { api_handler(r, secondary_upload_handler) }) diff --git a/pageserver/src/l0_flush.rs b/pageserver/src/l0_flush.rs index 8945e5accd..10187f2ba3 100644 --- a/pageserver/src/l0_flush.rs +++ b/pageserver/src/l0_flush.rs @@ -24,7 +24,7 @@ impl Default for L0FlushConfig { #[derive(Clone)] pub struct L0FlushGlobalState(Arc); -pub(crate) enum Inner { +pub enum Inner { PageCached, Direct { semaphore: tokio::sync::Semaphore }, } @@ -40,7 +40,7 @@ impl L0FlushGlobalState { } } - pub(crate) fn inner(&self) -> &Arc { + pub fn inner(&self) -> &Arc { &self.0 } } diff --git a/pageserver/src/statvfs.rs b/pageserver/src/statvfs.rs index 45a516566f..ede1791afa 100644 --- a/pageserver/src/statvfs.rs +++ b/pageserver/src/statvfs.rs @@ -56,7 +56,6 @@ impl Statvfs { } pub mod mock { - use anyhow::Context; use camino::Utf8Path; use regex::Regex; use tracing::log::info; @@ -135,14 +134,30 @@ pub mod mock { { continue; } - total += entry - .metadata() - .with_context(|| format!("get metadata of {:?}", entry.path()))? 
- .len(); + let m = match entry.metadata() { + Ok(m) => m, + Err(e) if is_not_found(&e) => { + // some temp file which got removed right as we are walking + continue; + } + Err(e) => { + return Err(anyhow::Error::new(e) + .context(format!("get metadata of {:?}", entry.path()))) + } + }; + total += m.len(); } Ok(total) } + fn is_not_found(e: &walkdir::Error) -> bool { + let Some(io_error) = e.io_error() else { + return false; + }; + let kind = io_error.kind(); + matches!(kind, std::io::ErrorKind::NotFound) + } + pub struct Statvfs { pub blocks: u64, pub blocks_available: u64, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 84c5095610..90c0e28bc4 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -148,6 +148,7 @@ pub(crate) mod timeline; pub mod size; +mod gc_block; pub(crate) mod throttle; pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; @@ -303,6 +304,12 @@ pub struct Tenant { /// An ongoing timeline detach must be checked during attempts to GC or compact a timeline. ongoing_timeline_detach: std::sync::Mutex>, + /// `index_part.json` based gc blocking reason tracking. + /// + /// New gc iterations must start a new iteration by acquiring `GcBlock::start` before + /// proceeding. 
+ pub(crate) gc_block: gc_block::GcBlock, + l0_flush_global_state: L0FlushGlobalState, } @@ -594,6 +601,12 @@ impl From for GcError { } } +impl From for GcError { + fn from(_: timeline::layer_manager::Shutdown) -> Self { + GcError::TimelineCancelled + } +} + #[derive(thiserror::Error, Debug)] pub(crate) enum LoadConfigError { #[error("TOML deserialization error: '{0}'")] @@ -703,6 +716,7 @@ impl Tenant { .read() .await .layer_map() + .expect("currently loading, layer manager cannot be shutdown already") .iter_historic_layers() .next() .is_some(), @@ -1036,6 +1050,8 @@ impl Tenant { } } + let mut gc_blocks = HashMap::new(); + // For every timeline, download the metadata file, scan the local directory, // and build a layer map that contains an entry for each remote and local // layer file. @@ -1045,6 +1061,16 @@ impl Tenant { .remove(&timeline_id) .expect("just put it in above"); + if let Some(blocking) = index_part.gc_blocking.as_ref() { + // could just filter these away, but it helps while testing + anyhow::ensure!( + !blocking.reasons.is_empty(), + "index_part for {timeline_id} is malformed: it should not have gc blocking with zero reasons" + ); + let prev = gc_blocks.insert(timeline_id, blocking.reasons); + assert!(prev.is_none()); + } + // TODO again handle early failure self.load_remote_timeline( timeline_id, @@ -1089,6 +1115,8 @@ impl Tenant { // IndexPart is the source of truth. 
self.clean_up_timelines(&existent_timelines)?; + self.gc_block.set_scanned(gc_blocks); + fail::fail_point!("attach-before-activate", |_| { anyhow::bail!("attach-before-activate"); }); @@ -1679,6 +1707,14 @@ impl Tenant { } } + let _guard = match self.gc_block.start().await { + Ok(guard) => guard, + Err(reasons) => { + info!("Skipping GC: {reasons}"); + return Ok(GcResult::default()); + } + }; + self.gc_iteration_internal(target_timeline_id, horizon, pitr, cancel, ctx) .await } @@ -2691,6 +2727,7 @@ impl Tenant { )), tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)), ongoing_timeline_detach: std::sync::Mutex::default(), + gc_block: Default::default(), l0_flush_global_state, } } @@ -2975,54 +3012,6 @@ impl Tenant { // because that will stall branch creation. let gc_cs = self.gc_cs.lock().await; - // Paranoia check: it is critical that GcInfo's list of child timelines is correct, to avoid incorrectly GC'ing data they - // depend on. So although GcInfo is updated continuously by Timeline::new and Timeline::drop, we also calculate it here - // and fail out if it's inaccurate. 
- // (this can be removed later, it's a risk mitigation for https://github.com/neondatabase/neon/pull/8427) - { - let mut all_branchpoints: BTreeMap> = - BTreeMap::new(); - timelines.iter().for_each(|timeline| { - if let Some(ancestor_timeline_id) = &timeline.get_ancestor_timeline_id() { - let ancestor_children = - all_branchpoints.entry(*ancestor_timeline_id).or_default(); - ancestor_children.push((timeline.get_ancestor_lsn(), timeline.timeline_id)); - } - }); - - for timeline in &timelines { - let mut branchpoints: Vec<(Lsn, TimelineId)> = all_branchpoints - .remove(&timeline.timeline_id) - .unwrap_or_default(); - - branchpoints.sort_by_key(|b| b.0); - - let target = timeline.gc_info.read().unwrap(); - - // We require that retain_lsns contains everything in `branchpoints`, but not that - // they are exactly equal: timeline deletions can race with us, so retain_lsns - // may contain some extra stuff. It is safe to have extra timelines in there, because it - // just means that we retain slightly more data than we otherwise might. - let have_branchpoints = target.retain_lsns.iter().copied().collect::>(); - for b in &branchpoints { - if !have_branchpoints.contains(b) { - tracing::error!( - "Bug: `retain_lsns` is set incorrectly. Expected be {:?}, but found {:?}", - branchpoints, - target.retain_lsns - ); - debug_assert!(false); - // Do not GC based on bad information! - // (ab-use an existing GcError type rather than adding a new one, since this is a - // "should never happen" check that will be removed soon). - return Err(GcError::Remote(anyhow::anyhow!( - "retain_lsns failed validation!" - ))); - } - } - } - } - // Ok, we now know all the branch points. // Update the GC information for each timeline. 
let mut gc_timelines = Vec::with_capacity(timelines.len()); @@ -4092,7 +4081,7 @@ pub(crate) mod harness { #[cfg(test)] mod tests { - use std::collections::BTreeMap; + use std::collections::{BTreeMap, BTreeSet}; use super::*; use crate::keyspace::KeySpaceAccum; @@ -4644,10 +4633,10 @@ mod tests { let layer_map = tline.layers.read().await; let level0_deltas = layer_map - .layer_map() - .get_level0_deltas() - .into_iter() - .map(|desc| layer_map.get_from_desc(&desc)) + .layer_map()? + .level0_deltas() + .iter() + .map(|desc| layer_map.get_from_desc(desc)) .collect::>(); assert!(!level0_deltas.is_empty()); @@ -4767,7 +4756,7 @@ mod tests { lsn: Lsn, repeat: usize, key_count: usize, - ) -> anyhow::Result<()> { + ) -> anyhow::Result>> { let compact = true; bulk_insert_maybe_compact_gc(tenant, timeline, ctx, lsn, repeat, key_count, compact).await } @@ -4780,7 +4769,9 @@ mod tests { repeat: usize, key_count: usize, compact: bool, - ) -> anyhow::Result<()> { + ) -> anyhow::Result>> { + let mut inserted: HashMap> = Default::default(); + let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let mut blknum = 0; @@ -4801,6 +4792,7 @@ mod tests { ctx, ) .await?; + inserted.entry(test_key).or_default().insert(lsn); writer.finish_write(lsn); drop(writer); @@ -4825,7 +4817,7 @@ mod tests { assert_eq!(res.layers_removed, 0, "this never removes anything"); } - Ok(()) + Ok(inserted) } // @@ -4872,14 +4864,16 @@ mod tests { .await?; let lsn = Lsn(0x10); - bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?; + let inserted = bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?; let guard = tline.layers.read().await; - guard.layer_map().dump(true, &ctx).await?; + let lm = guard.layer_map()?; + + lm.dump(true, &ctx).await?; let mut reads = Vec::new(); let mut prev = None; - guard.layer_map().iter_historic_layers().for_each(|desc| { + lm.iter_historic_layers().for_each(|desc| { if !desc.is_delta() { prev = Some(desc.clone()); 
return; @@ -4933,9 +4927,39 @@ mod tests { &ctx, ) .await; - tline - .validate_get_vectored_impl(&vectored_res, read, reads_lsn, &ctx) - .await; + + let mut expected_lsns: HashMap = Default::default(); + let mut expect_missing = false; + let mut key = read.start().unwrap(); + while key != read.end().unwrap() { + if let Some(lsns) = inserted.get(&key) { + let expected_lsn = lsns.iter().rfind(|lsn| **lsn <= reads_lsn); + match expected_lsn { + Some(lsn) => { + expected_lsns.insert(key, *lsn); + } + None => { + expect_missing = true; + break; + } + } + } else { + expect_missing = true; + break; + } + + key = key.next(); + } + + if expect_missing { + assert!(matches!(vectored_res, Err(GetVectoredError::MissingKey(_)))); + } else { + for (key, image) in vectored_res? { + let expected_lsn = expected_lsns.get(&key).expect("determined above"); + let expected_image = test_img(&format!("{} at {}", key.field6, expected_lsn)); + assert_eq!(image?, expected_image); + } + } } Ok(()) @@ -4985,10 +5009,6 @@ mod tests { ) .await; - child_timeline - .validate_get_vectored_impl(&vectored_res, aux_keyspace, read_lsn, &ctx) - .await; - let images = vectored_res?; assert!(images.is_empty()); Ok(()) @@ -5859,23 +5879,12 @@ mod tests { tline.freeze_and_flush().await?; // force create a delta layer } - let before_num_l0_delta_files = tline - .layers - .read() - .await - .layer_map() - .get_level0_deltas() - .len(); + let before_num_l0_delta_files = + tline.layers.read().await.layer_map()?.level0_deltas().len(); tline.compact(&cancel, EnumSet::empty(), &ctx).await?; - let after_num_l0_delta_files = tline - .layers - .read() - .await - .layer_map() - .get_level0_deltas() - .len(); + let after_num_l0_delta_files = tline.layers.read().await.layer_map()?.level0_deltas().len(); assert!(after_num_l0_delta_files < before_num_l0_delta_files, "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}"); @@ -6899,7 +6908,10 @@ mod tests { } let cancel 
= CancellationToken::new(); - tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); for (idx, expected) in expected_result.iter().enumerate() { assert_eq!( @@ -6993,7 +7005,10 @@ mod tests { guard.cutoffs.time = Lsn(0x40); guard.cutoffs.space = Lsn(0x40); } - tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); Ok(()) } @@ -7327,7 +7342,10 @@ mod tests { } let cancel = CancellationToken::new(); - tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); for idx in 0..10 { assert_eq!( @@ -7353,7 +7371,10 @@ mod tests { guard.cutoffs.time = Lsn(0x40); guard.cutoffs.space = Lsn(0x40); } - tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); Ok(()) } @@ -7898,11 +7919,28 @@ mod tests { verify_result().await; let cancel = CancellationToken::new(); - tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + let mut dryrun_flags = EnumSet::new(); + dryrun_flags.insert(CompactFlags::DryRun); + + tline + .compact_with_gc(&cancel, dryrun_flags, &ctx) + .await + .unwrap(); + // We expect layer map to be the same b/c the dry run flag, but we don't know whether there will be other background jobs + // cleaning things up, and therefore, we don't do sanity checks on the layer map during unit tests. 
+ verify_result().await; + + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); verify_result().await; // compact again - tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); verify_result().await; // increase GC horizon and compact again @@ -7912,11 +7950,17 @@ mod tests { guard.cutoffs.time = Lsn(0x38); guard.cutoffs.space = Lsn(0x38); } - tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); verify_result().await; // no wals between 0x30 and 0x38, so we should obtain the same result // not increasing the GC horizon and compact again - tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); verify_result().await; Ok(()) @@ -8097,7 +8141,10 @@ mod tests { verify_result().await; let cancel = CancellationToken::new(); - branch_tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + branch_tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); verify_result().await; diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index bb65ae24fc..770f3ca5f0 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -29,6 +29,7 @@ impl EphemeralFile { conf: &PageServerConf, tenant_shard_id: TenantShardId, timeline_id: TimelineId, + gate_guard: utils::sync::gate::GateGuard, ctx: &RequestContext, ) -> Result { static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1); @@ -51,10 +52,12 @@ impl EphemeralFile { ) .await?; + let prewarm = conf.l0_flush.prewarm_on_write(); + Ok(EphemeralFile { _tenant_shard_id: tenant_shard_id, _timeline_id: timeline_id, - rw: page_caching::RW::new(file, conf.l0_flush.prewarm_on_write()), + rw: page_caching::RW::new(file, prewarm, gate_guard), }) } @@ -161,7 +164,11 @@ mod tests { async fn 
test_ephemeral_blobs() -> Result<(), io::Error> { let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?; - let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &ctx).await?; + let gate = utils::sync::gate::Gate::default(); + + let entered = gate.enter().unwrap(); + + let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, entered, &ctx).await?; let pos_foo = file.write_blob(b"foo", &ctx).await?; assert_eq!( @@ -215,4 +222,38 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn ephemeral_file_holds_gate_open() { + const FOREVER: std::time::Duration = std::time::Duration::from_secs(5); + + let (conf, tenant_id, timeline_id, ctx) = + harness("ephemeral_file_holds_gate_open").unwrap(); + + let gate = utils::sync::gate::Gate::default(); + + let file = EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx) + .await + .unwrap(); + + let mut closing = tokio::task::spawn(async move { + gate.close().await; + }); + + // gate is entered until the ephemeral file is dropped + // do not start paused tokio-epoll-uring has a sleep loop + tokio::time::pause(); + tokio::time::timeout(FOREVER, &mut closing) + .await + .expect_err("closing cannot complete before dropping"); + + // this is a requirement of the reset_tenant functionality: we have to be able to restart a + // tenant fast, and for that, we need all tenant_dir operations be guarded by entering a gate + drop(file); + + tokio::time::timeout(FOREVER, &mut closing) + .await + .expect("closing completes right away") + .expect("closing does not panic"); + } } diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs index 43b9fff28d..0a12b64a7c 100644 --- a/pageserver/src/tenant/ephemeral_file/page_caching.rs +++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs @@ -18,6 +18,8 @@ use super::zero_padded_read_write; pub struct RW { page_cache_file_id: page_cache::FileId, rw: 
super::zero_padded_read_write::RW, + /// Gate guard is held on as long as we need to do operations in the path (delete on drop). + _gate_guard: utils::sync::gate::GateGuard, } /// When we flush a block to the underlying [`crate::virtual_file::VirtualFile`], @@ -29,7 +31,11 @@ pub enum PrewarmOnWrite { } impl RW { - pub fn new(file: VirtualFile, prewarm_on_write: PrewarmOnWrite) -> Self { + pub fn new( + file: VirtualFile, + prewarm_on_write: PrewarmOnWrite, + _gate_guard: utils::sync::gate::GateGuard, + ) -> Self { let page_cache_file_id = page_cache::next_file_id(); Self { page_cache_file_id, @@ -38,6 +44,7 @@ impl RW { file, prewarm_on_write, )), + _gate_guard, } } @@ -145,6 +152,7 @@ impl Drop for RW { // We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed. // unlink the file + // we are clear to do this, because we have entered a gate let res = std::fs::remove_file(&self.rw.as_writer().file.path); if let Err(e) = res { if e.kind() != std::io::ErrorKind::NotFound { diff --git a/pageserver/src/tenant/gc_block.rs b/pageserver/src/tenant/gc_block.rs new file mode 100644 index 0000000000..8b41ba1746 --- /dev/null +++ b/pageserver/src/tenant/gc_block.rs @@ -0,0 +1,213 @@ +use std::collections::HashMap; + +use utils::id::TimelineId; + +use super::remote_timeline_client::index::GcBlockingReason; + +type Storage = HashMap>; + +#[derive(Default)] +pub(crate) struct GcBlock { + /// The timelines which have current reasons to block gc. + /// + /// LOCK ORDER: this is held locked while scheduling the next index_part update. This is done + /// to keep the this field up to date with RemoteTimelineClient `upload_queue.dirty`. + reasons: std::sync::Mutex, + blocking: tokio::sync::Mutex<()>, +} + +impl GcBlock { + /// Start another gc iteration. + /// + /// Returns a guard to be held for the duration of gc iteration to allow synchronizing with + /// it's ending, or if not currently possible, a value describing the reasons why not. 
+ /// + /// Cancellation safe. + pub(super) async fn start(&self) -> Result, BlockingReasons> { + let reasons = { + let g = self.reasons.lock().unwrap(); + + // TODO: the assumption is that this method gets called periodically. in prod, we use 1h, in + // tests, we use everything. we should warn if the gc has been consecutively blocked + // for more than 1h (within single tenant session?). + BlockingReasons::clean_and_summarize(g) + }; + + if let Some(reasons) = reasons { + Err(reasons) + } else { + Ok(Guard { + _inner: self.blocking.lock().await, + }) + } + } + + pub(crate) fn summary(&self) -> Option { + let g = self.reasons.lock().unwrap(); + + BlockingReasons::summarize(&g) + } + + /// Start blocking gc for this one timeline for the given reason. + /// + /// This is not a guard based API but instead it mimics set API. The returned future will not + /// resolve until an existing gc round has completed. + /// + /// Returns true if this block was new, false if gc was already blocked for this reason. + /// + /// Cancellation safe: cancelling after first poll will keep the reason to block gc, but will + /// keep the gc blocking reason. + pub(crate) async fn insert( + &self, + timeline: &super::Timeline, + reason: GcBlockingReason, + ) -> anyhow::Result { + let (added, uploaded) = { + let mut g = self.reasons.lock().unwrap(); + let set = g.entry(timeline.timeline_id).or_default(); + let added = set.insert(reason); + + // LOCK ORDER: intentionally hold the lock, see self.reasons. + let uploaded = timeline + .remote_client + .schedule_insert_gc_block_reason(reason)?; + + (added, uploaded) + }; + + uploaded.await?; + + // ensure that any ongoing gc iteration has completed + drop(self.blocking.lock().await); + + Ok(added) + } + + /// Remove blocking gc for this one timeline and the given reason. 
+ pub(crate) async fn remove( + &self, + timeline: &super::Timeline, + reason: GcBlockingReason, + ) -> anyhow::Result<()> { + use std::collections::hash_map::Entry; + + super::span::debug_assert_current_span_has_tenant_and_timeline_id(); + + let (remaining_blocks, uploaded) = { + let mut g = self.reasons.lock().unwrap(); + match g.entry(timeline.timeline_id) { + Entry::Occupied(mut oe) => { + let set = oe.get_mut(); + set.remove(reason); + if set.is_empty() { + oe.remove(); + } + } + Entry::Vacant(_) => { + // we must still do the index_part.json update regardless, in case we had earlier + // been cancelled + } + } + + let remaining_blocks = g.len(); + + // LOCK ORDER: intentionally hold the lock while scheduling; see self.reasons + let uploaded = timeline + .remote_client + .schedule_remove_gc_block_reason(reason)?; + + (remaining_blocks, uploaded) + }; + uploaded.await?; + + // no need to synchronize with gc iteration again + + if remaining_blocks > 0 { + tracing::info!(remaining_blocks, removed=?reason, "gc blocking removed, but gc remains blocked"); + } else { + tracing::info!("gc is now unblocked for the tenant"); + } + + Ok(()) + } + + pub(crate) fn before_delete(&self, timeline: &super::Timeline) { + let unblocked = { + let mut g = self.reasons.lock().unwrap(); + if g.is_empty() { + return; + } + + g.remove(&timeline.timeline_id); + + BlockingReasons::clean_and_summarize(g).is_none() + }; + + if unblocked { + tracing::info!("gc is now unblocked following deletion"); + } + } + + /// Initialize with the non-deleted timelines of this tenant. 
+ pub(crate) fn set_scanned(&self, scanned: Storage) { + let mut g = self.reasons.lock().unwrap(); + assert!(g.is_empty()); + g.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty())); + + if let Some(reasons) = BlockingReasons::clean_and_summarize(g) { + tracing::info!(summary=?reasons, "initialized with gc blocked"); + } + } +} + +pub(super) struct Guard<'a> { + _inner: tokio::sync::MutexGuard<'a, ()>, +} + +#[derive(Debug)] +pub(crate) struct BlockingReasons { + timelines: usize, + reasons: enumset::EnumSet, +} + +impl std::fmt::Display for BlockingReasons { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{} timelines block for {:?}", + self.timelines, self.reasons + ) + } +} + +impl BlockingReasons { + fn clean_and_summarize(mut g: std::sync::MutexGuard<'_, Storage>) -> Option { + let mut reasons = enumset::EnumSet::empty(); + g.retain(|_key, value| { + reasons = reasons.union(*value); + !value.is_empty() + }); + if !g.is_empty() { + Some(BlockingReasons { + timelines: g.len(), + reasons, + }) + } else { + None + } + } + + fn summarize(g: &std::sync::MutexGuard<'_, Storage>) -> Option { + if g.is_empty() { + None + } else { + let reasons = g + .values() + .fold(enumset::EnumSet::empty(), |acc, next| acc.union(*next)); + Some(BlockingReasons { + timelines: g.len(), + reasons, + }) + } + } +} diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index ba9c08f6e7..844f117ea2 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -846,8 +846,8 @@ impl LayerMap { } /// Return all L0 delta layers - pub fn get_level0_deltas(&self) -> Vec> { - self.l0_delta_layers.to_vec() + pub fn level0_deltas(&self) -> &Vec> { + &self.l0_delta_layers } /// debugging function to print out the contents of the layer map diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index b5568d37b5..3316627540 100644 --- a/pageserver/src/tenant/mgr.rs +++ 
b/pageserver/src/tenant/mgr.rs @@ -13,7 +13,7 @@ use pageserver_api::upcall_api::ReAttachResponseTenant; use rand::{distributions::Alphanumeric, Rng}; use std::borrow::Cow; use std::cmp::Ordering; -use std::collections::{BTreeMap, HashMap}; +use std::collections::{BTreeMap, HashMap, HashSet}; use std::ops::Deref; use std::sync::Arc; use std::time::Duration; @@ -224,21 +224,8 @@ async fn safe_rename_tenant_dir(path: impl AsRef) -> std::io::Result>); -enum BackgroundPurgesInner { - Open(tokio::task::JoinSet<()>), - // we use the async mutex for coalescing - ShuttingDown(Arc>>), -} - -impl Default for BackgroundPurges { - fn default() -> Self { - Self(Arc::new(std::sync::Mutex::new( - BackgroundPurgesInner::Open(JoinSet::new()), - ))) - } -} +#[derive(Clone, Default)] +pub struct BackgroundPurges(tokio_util::task::TaskTracker); impl BackgroundPurges { /// When we have moved a tenant's content to a temporary directory, we may delete it lazily in @@ -247,24 +234,32 @@ impl BackgroundPurges { /// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory. /// Thus the [`BackgroundPurges`] type to keep track of these tasks. pub fn spawn(&self, tmp_path: Utf8PathBuf) { - let mut guard = self.0.lock().unwrap(); - let jset = match &mut *guard { - BackgroundPurgesInner::Open(ref mut jset) => jset, - BackgroundPurgesInner::ShuttingDown(_) => { - warn!("trying to spawn background purge during shutdown, ignoring"); - return; + // because on shutdown we close and wait, we are misusing TaskTracker a bit. + // + // so first acquire a token, then check if the tracker has been closed. the tracker might get closed + // right after, but at least the shutdown will wait for what we are spawning next. 
+ let token = self.0.token(); + + if self.0.is_closed() { + warn!( + %tmp_path, + "trying to spawn background purge during shutdown, ignoring" + ); + return; + } + + let span = info_span!(parent: None, "background_purge", %tmp_path); + + let task = move || { + let _token = token; + let _entered = span.entered(); + if let Err(error) = std::fs::remove_dir_all(tmp_path.as_path()) { + // should we fatal_io_error here? + warn!(%error, "failed to purge tenant directory"); } }; - jset.spawn_on( - async move { - if let Err(error) = fs::remove_dir_all(tmp_path.as_path()).await { - // should we fatal_io_error here? - warn!(%error, path=%tmp_path, "failed to purge tenant directory"); - } - } - .instrument(info_span!(parent: None, "background_purge")), - BACKGROUND_RUNTIME.handle(), - ); + + BACKGROUND_RUNTIME.spawn_blocking(task); } /// When this future completes, all background purges have completed. @@ -278,42 +273,9 @@ impl BackgroundPurges { /// instances of this future will continue to be correct. #[instrument(skip_all)] pub async fn shutdown(&self) { - let jset = { - let mut guard = self.0.lock().unwrap(); - match &mut *guard { - BackgroundPurgesInner::Open(jset) => { - *guard = BackgroundPurgesInner::ShuttingDown(Arc::new(tokio::sync::Mutex::new( - std::mem::take(jset), - ))) - } - BackgroundPurgesInner::ShuttingDown(_) => { - // calling shutdown multiple times is most likely a bug in pageserver shutdown code - warn!("already shutting down"); - } - }; - match &mut *guard { - BackgroundPurgesInner::ShuttingDown(ref mut jset) => jset.clone(), - BackgroundPurgesInner::Open(_) => { - unreachable!("above code transitions into shut down state"); - } - } - }; - let mut jset = jset.lock().await; // concurrent callers coalesce here - while let Some(res) = jset.join_next().await { - match res { - Ok(()) => {} - Err(e) if e.is_panic() => { - // If it panicked, the error is already logged by the panic hook. 
- } - Err(e) if e.is_cancelled() => { - unreachable!("we don't cancel the joinset or runtime") - } - Err(e) => { - // No idea when this can happen, but let's log it. - warn!(%e, "background purge task failed or panicked"); - } - } - } + // forbid new tasks (can be called many times) + self.0.close(); + self.0.wait().await; } } @@ -1767,14 +1729,9 @@ impl TenantManager { let parent_timelines = timelines.keys().cloned().collect::>(); for timeline in timelines.values() { tracing::info!(timeline_id=%timeline.timeline_id, "Loading list of layers to hardlink"); - let timeline_layers = timeline - .layers - .read() - .await - .likely_resident_layers() - .collect::>(); + let layers = timeline.layers.read().await; - for layer in timeline_layers { + for layer in layers.likely_resident_layers() { let relative_path = layer .local_path() .strip_prefix(&parent_path) @@ -1971,7 +1928,8 @@ impl TenantManager { timeline_id: TimelineId, prepared: PreparedTimelineDetach, ctx: &RequestContext, - ) -> Result, anyhow::Error> { + ) -> Result, anyhow::Error> { + // FIXME: this is unnecessary, slotguard already has these semantics struct RevertOnDropSlot(Option); impl Drop for RevertOnDropSlot { diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 9e021c7e35..1344fe4192 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -800,6 +800,123 @@ impl RemoteTimelineClient { .context("wait completion") } + /// Adds a gc blocking reason for this timeline if one does not exist already. + /// + /// A retryable step of timeline detach ancestor. + /// + /// Returns a future which waits until the completion of the upload. 
+ pub(crate) fn schedule_insert_gc_block_reason( + self: &Arc, + reason: index::GcBlockingReason, + ) -> Result>, NotInitialized> + { + let maybe_barrier = { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + + if let index::GcBlockingReason::DetachAncestor = reason { + if upload_queue.dirty.metadata.ancestor_timeline().is_none() { + drop(guard); + panic!("cannot start detach ancestor if there is nothing to detach from"); + } + } + + let wanted = |x: Option<&index::GcBlocking>| x.is_some_and(|x| x.blocked_by(reason)); + + let current = upload_queue.dirty.gc_blocking.as_ref(); + let uploaded = upload_queue.clean.0.gc_blocking.as_ref(); + + match (current, uploaded) { + (x, y) if wanted(x) && wanted(y) => None, + (x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)), + // Usual case: !wanted(x) && !wanted(y) + // + // Unusual: !wanted(x) && wanted(y) which means we have two processes waiting to + // turn on and off some reason. + (x, y) => { + if !wanted(x) && wanted(y) { + // this could be avoided by having external in-memory synchronization, like + // timeline detach ancestor + warn!(?reason, op="insert", "unexpected: two racing processes to enable and disable a gc blocking reason"); + } + + // at this point, the metadata must always show that there is a parent + upload_queue.dirty.gc_blocking = current + .map(|x| x.with_reason(reason)) + .or_else(|| Some(index::GcBlocking::started_now_for(reason))); + self.schedule_index_upload(upload_queue)?; + Some(self.schedule_barrier0(upload_queue)) + } + } + }; + + Ok(async move { + if let Some(barrier) = maybe_barrier { + Self::wait_completion0(barrier).await?; + } + Ok(()) + }) + } + + /// Removes a gc blocking reason for this timeline if one exists. + /// + /// A retryable step of timeline detach ancestor. + /// + /// Returns a future which waits until the completion of the upload. 
+ pub(crate) fn schedule_remove_gc_block_reason( + self: &Arc, + reason: index::GcBlockingReason, + ) -> Result>, NotInitialized> + { + let maybe_barrier = { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + + if let index::GcBlockingReason::DetachAncestor = reason { + if !upload_queue + .clean + .0 + .lineage + .is_detached_from_original_ancestor() + { + drop(guard); + panic!("cannot complete timeline_ancestor_detach while not detached"); + } + } + + let wanted = |x: Option<&index::GcBlocking>| { + x.is_none() || x.is_some_and(|b| !b.blocked_by(reason)) + }; + + let current = upload_queue.dirty.gc_blocking.as_ref(); + let uploaded = upload_queue.clean.0.gc_blocking.as_ref(); + + match (current, uploaded) { + (x, y) if wanted(x) && wanted(y) => None, + (x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)), + (x, y) => { + if !wanted(x) && wanted(y) { + warn!(?reason, op="remove", "unexpected: two racing processes to enable and disable a gc blocking reason (remove)"); + } + + upload_queue.dirty.gc_blocking = + current.as_ref().and_then(|x| x.without_reason(reason)); + assert!(wanted(upload_queue.dirty.gc_blocking.as_ref())); + // FIXME: bogus ? + self.schedule_index_upload(upload_queue)?; + Some(self.schedule_barrier0(upload_queue)) + } + } + }; + + Ok(async move { + if let Some(barrier) = maybe_barrier { + Self::wait_completion0(barrier).await?; + } + Ok(()) + }) + } + /// Launch an upload operation in the background; the file is added to be included in next /// `index_part.json` upload. 
pub(crate) fn schedule_layer_file_upload( diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 3075df022e..90453b1922 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -60,6 +60,9 @@ pub struct IndexPart { #[serde(default)] pub(crate) lineage: Lineage, + #[serde(skip_serializing_if = "Option::is_none", default)] + pub(crate) gc_blocking: Option, + /// Describes the kind of aux files stored in the timeline. /// /// The value is modified during file ingestion when the latest wanted value communicated via tenant config is applied if it is acceptable. @@ -85,10 +88,11 @@ impl IndexPart { /// - 6: last_aux_file_policy is added. /// - 7: metadata_bytes is no longer written, but still read /// - 8: added `archived_at` - const LATEST_VERSION: usize = 8; + /// - 9: +gc_blocking + const LATEST_VERSION: usize = 9; // Versions we may see when reading from a bucket. 
- pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8]; + pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9]; pub const FILE_NAME: &'static str = "index_part.json"; @@ -101,6 +105,7 @@ impl IndexPart { deleted_at: None, archived_at: None, lineage: Default::default(), + gc_blocking: None, last_aux_file_policy: None, } } @@ -251,6 +256,64 @@ impl Lineage { } } +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub(crate) struct GcBlocking { + pub(crate) started_at: NaiveDateTime, + pub(crate) reasons: enumset::EnumSet, +} + +#[derive(Debug, enumset::EnumSetType, serde::Serialize, serde::Deserialize)] +#[enumset(serialize_repr = "list")] +pub(crate) enum GcBlockingReason { + Manual, + DetachAncestor, +} + +impl GcBlocking { + pub(super) fn started_now_for(reason: GcBlockingReason) -> Self { + GcBlocking { + started_at: chrono::Utc::now().naive_utc(), + reasons: enumset::EnumSet::only(reason), + } + } + + /// Returns true if the given reason is one of the reasons why the gc is blocked. + pub(crate) fn blocked_by(&self, reason: GcBlockingReason) -> bool { + self.reasons.contains(reason) + } + + /// Returns a version of self with the given reason. + pub(super) fn with_reason(&self, reason: GcBlockingReason) -> Self { + assert!(!self.blocked_by(reason)); + let mut reasons = self.reasons; + reasons.insert(reason); + + Self { + started_at: self.started_at, + reasons, + } + } + + /// Returns a version of self without the given reason. Assumption is that if + /// there are no more reasons, we can unblock the gc by returning `None`. 
+ pub(super) fn without_reason(&self, reason: GcBlockingReason) -> Option { + assert!(self.blocked_by(reason)); + + if self.reasons.len() == 1 { + None + } else { + let mut reasons = self.reasons; + assert!(reasons.remove(reason)); + assert!(!reasons.is_empty()); + + Some(Self { + started_at: self.started_at, + reasons, + }) + } + } +} + #[cfg(test)] mod tests { use super::*; @@ -292,6 +355,7 @@ mod tests { deleted_at: None, archived_at: None, lineage: Lineage::default(), + gc_blocking: None, last_aux_file_policy: None, }; @@ -335,6 +399,7 @@ mod tests { deleted_at: None, archived_at: None, lineage: Lineage::default(), + gc_blocking: None, last_aux_file_policy: None, }; @@ -379,6 +444,7 @@ mod tests { deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), archived_at: None, lineage: Lineage::default(), + gc_blocking: None, last_aux_file_policy: None, }; @@ -426,6 +492,7 @@ mod tests { deleted_at: None, archived_at: None, lineage: Lineage::default(), + gc_blocking: None, last_aux_file_policy: None, }; @@ -468,6 +535,7 @@ mod tests { deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), archived_at: None, lineage: Lineage::default(), + gc_blocking: None, last_aux_file_policy: None, }; @@ -513,6 +581,7 @@ mod tests { reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()], original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))), }, + gc_blocking: None, last_aux_file_policy: None, }; @@ -563,6 +632,7 @@ mod tests { reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()], original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))), }, + gc_blocking: None, last_aux_file_policy: Some(AuxFilePolicy::V2), }; @@ 
-618,6 +688,7 @@ mod tests { deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), archived_at: None, lineage: Default::default(), + gc_blocking: None, last_aux_file_policy: Default::default(), }; @@ -674,6 +745,7 @@ mod tests { deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), archived_at: Some(parse_naive_datetime("2023-04-29T09:00:00.123000000")), lineage: Default::default(), + gc_blocking: None, last_aux_file_policy: Default::default(), }; @@ -681,6 +753,68 @@ mod tests { assert_eq!(part, expected); } + #[test] + fn v9_indexpart_is_parsed() { + let example = r#"{ + "version": 9, + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } + }, + "disk_consistent_lsn":"0/16960E8", + "metadata": { + "disk_consistent_lsn": "0/16960E8", + "prev_record_lsn": "0/1696070", + "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e", + "ancestor_lsn": "0/0", + "latest_gc_cutoff_lsn": "0/1696070", + "initdb_lsn": "0/1696070", + "pg_version": 14 + }, + "gc_blocking": { + "started_at": "2024-07-19T09:00:00.123", + "reasons": ["DetachAncestor"] + } + }"#; + + let expected = IndexPart { + version: 9, + layer_metadata: HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { + file_size: 25600000, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { + file_size: 9007199254741001, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + 
metadata: TimelineMetadata::new( + Lsn::from_str("0/16960E8").unwrap(), + Some(Lsn::from_str("0/1696070").unwrap()), + Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()), + Lsn::INVALID, + Lsn::from_str("0/1696070").unwrap(), + Lsn::from_str("0/1696070").unwrap(), + 14, + ).with_recalculated_checksum().unwrap(), + deleted_at: None, + lineage: Default::default(), + gc_blocking: Some(GcBlocking { + started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"), + reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]), + }), + last_aux_file_policy: Default::default(), + archived_at: None, + }; + + let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + fn parse_naive_datetime(s: &str) -> NaiveDateTime { chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f").unwrap() } diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 27439d4f03..135e73b57f 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -55,7 +55,7 @@ use tokio_util::sync::CancellationToken; use tracing::{info_span, instrument, warn, Instrument}; use utils::{ backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext, - id::TimelineId, serde_system_time, + id::TimelineId, pausable_failpoint, serde_system_time, }; use super::{ @@ -1146,12 +1146,14 @@ impl<'a> TenantDownloader<'a> { layer: HeatMapLayer, ctx: &RequestContext, ) -> Result, UpdateError> { - // Failpoint for simulating slow remote storage + // Failpoints for simulating slow remote storage failpoint_support::sleep_millis_async!( "secondary-layer-download-sleep", &self.secondary_state.cancel ); + pausable_failpoint!("secondary-layer-download-pausable"); + let local_path = local_layer_path( self.conf, tenant_shard_id, diff --git a/pageserver/src/tenant/storage_layer.rs 
b/pageserver/src/tenant/storage_layer.rs index 59d3e1ce09..04f89db401 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -435,21 +435,6 @@ impl ReadableLayer { } } -/// Return value from [`Layer::get_value_reconstruct_data`] -#[derive(Clone, Copy, Debug)] -pub enum ValueReconstructResult { - /// Got all the data needed to reconstruct the requested page - Complete, - /// This layer didn't contain all the required data, the caller should look up - /// the predecessor layer at the returned LSN and collect more data from there. - Continue, - - /// This layer didn't contain data needed to reconstruct the page version at - /// the returned LSN. This is usually considered an error, but might be OK - /// in some circumstances. - Missing, -} - /// Layers contain a hint indicating whether they are likely to be used for reads. This is a hint rather /// than an authoritative value, so that we do not have to update it synchronously when changing the visibility /// of layers (for example when creating a branch that makes some previously covered layers visible). It should @@ -554,19 +539,25 @@ impl LayerAccessStats { self.record_residence_event_at(SystemTime::now()) } - pub(crate) fn record_access_at(&self, now: SystemTime) { + fn record_access_at(&self, now: SystemTime) -> bool { let (mut mask, mut value) = Self::to_low_res_timestamp(Self::ATIME_SHIFT, now); // A layer which is accessed must be visible. 
mask |= 0x1 << Self::VISIBILITY_SHIFT; value |= 0x1 << Self::VISIBILITY_SHIFT; - self.write_bits(mask, value); + let old_bits = self.write_bits(mask, value); + !matches!( + self.decode_visibility(old_bits), + LayerVisibilityHint::Visible + ) } - pub(crate) fn record_access(&self, ctx: &RequestContext) { + /// Returns true if we modified the layer's visibility to set it to Visible implicitly + /// as a result of this access + pub(crate) fn record_access(&self, ctx: &RequestContext) -> bool { if ctx.access_stats_behavior() == AccessStatsBehavior::Skip { - return; + return false; } self.record_access_at(SystemTime::now()) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index f9becf53ff..f4e965b99a 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -36,13 +36,12 @@ use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, Fi use crate::tenant::disk_btree::{ DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection, }; -use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner, }; -use crate::tenant::{PageReconstructError, Timeline}; +use crate::tenant::PageReconstructError; use crate::virtual_file::{self, VirtualFile}; use crate::{walrecord, TEMP_FILE_SUFFIX}; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; @@ -72,10 +71,7 @@ use utils::{ lsn::Lsn, }; -use super::{ - AsLayerDesc, LayerAccessStats, LayerName, PersistentLayerDesc, ResidentLayer, - ValuesReconstructState, -}; +use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState}; /// /// Header stored in the beginning of the file @@ -200,7 +196,6 @@ impl DeltaKey { pub struct DeltaLayer { 
path: Utf8PathBuf, pub desc: PersistentLayerDesc, - access_stats: LayerAccessStats, inner: OnceCell>, } @@ -299,7 +294,6 @@ impl DeltaLayer { /// not loaded already. /// async fn load(&self, ctx: &RequestContext) -> Result<&Arc> { - self.access_stats.record_access(ctx); // Quick exit if already loaded self.inner .get_or_try_init(|| self.load_inner(ctx)) @@ -350,7 +344,6 @@ impl DeltaLayer { summary.lsn_range, metadata.len(), ), - access_stats: Default::default(), inner: OnceCell::new(), }) } @@ -373,7 +366,6 @@ impl DeltaLayer { /// 3. Call `finish`. /// struct DeltaLayerWriterInner { - conf: &'static PageServerConf, pub path: Utf8PathBuf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, @@ -384,6 +376,9 @@ struct DeltaLayerWriterInner { tree: DiskBtreeBuilder, blob_writer: BlobWriter, + + // Number of key-lsns in the layer. + num_keys: usize, } impl DeltaLayerWriterInner { @@ -417,7 +412,6 @@ impl DeltaLayerWriterInner { let tree_builder = DiskBtreeBuilder::new(block_buf); Ok(Self { - conf, path, timeline_id, tenant_shard_id, @@ -425,6 +419,7 @@ impl DeltaLayerWriterInner { lsn_range, tree: tree_builder, blob_writer, + num_keys: 0, }) } @@ -475,6 +470,9 @@ impl DeltaLayerWriterInner { let delta_key = DeltaKey::from_key_lsn(&key, lsn); let res = self.tree.append(&delta_key.0, blob_ref.0); + + self.num_keys += 1; + (val, res.map_err(|e| anyhow::anyhow!(e))) } @@ -488,11 +486,10 @@ impl DeltaLayerWriterInner { async fn finish( self, key_end: Key, - timeline: &Arc, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { let temp_path = self.path.clone(); - let result = self.finish0(key_end, timeline, ctx).await; + let result = self.finish0(key_end, ctx).await; if result.is_err() { tracing::info!(%temp_path, "cleaning up temporary file after error during writing"); if let Err(e) = std::fs::remove_file(&temp_path) { @@ -505,9 +502,8 @@ impl DeltaLayerWriterInner { async fn finish0( self, key_end: Key, - timeline: 
&Arc, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; @@ -572,11 +568,9 @@ impl DeltaLayerWriterInner { // fsync the file file.sync_all().await?; - let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?; + trace!("created delta layer {}", self.path); - trace!("created delta layer {}", layer.local_path()); - - Ok(layer) + Ok((desc, self.path)) } } @@ -677,14 +671,20 @@ impl DeltaLayerWriter { pub(crate) async fn finish( mut self, key_end: Key, - timeline: &Arc, ctx: &RequestContext, - ) -> anyhow::Result { - self.inner - .take() - .unwrap() - .finish(key_end, timeline, ctx) - .await + ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { + self.inner.take().unwrap().finish(key_end, ctx).await + } + + #[cfg(test)] + pub(crate) fn num_keys(&self) -> usize { + self.inner.as_ref().unwrap().num_keys + } + + #[cfg(test)] + pub(crate) fn estimated_size(&self) -> u64 { + let inner = self.inner.as_ref().unwrap(); + inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64 } } @@ -808,95 +808,6 @@ impl DeltaLayerInner { }) } - pub(super) async fn get_value_reconstruct_data( - &self, - key: Key, - lsn_range: Range, - reconstruct_state: &mut ValueReconstructState, - ctx: &RequestContext, - ) -> anyhow::Result { - let mut need_image = true; - // Scan the page versions backwards, starting from `lsn`. 
- let block_reader = FileBlockReader::new(&self.file, self.file_id); - let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( - self.index_start_blk, - self.index_root_blk, - &block_reader, - ); - let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1)); - - let mut offsets: Vec<(Lsn, u64)> = Vec::new(); - - tree_reader - .visit( - &search_key.0, - VisitDirection::Backwards, - |key, value| { - let blob_ref = BlobRef(value); - if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] { - return false; - } - let entry_lsn = DeltaKey::extract_lsn_from_buf(key); - if entry_lsn < lsn_range.start { - return false; - } - offsets.push((entry_lsn, blob_ref.pos())); - - !blob_ref.will_init() - }, - &RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::DeltaLayerBtreeNode) - .build(), - ) - .await?; - - let ctx = &RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::DeltaLayerValue) - .build(); - - // Ok, 'offsets' now contains the offsets of all the entries we need to read - let cursor = block_reader.block_cursor(); - let mut buf = Vec::new(); - for (entry_lsn, pos) in offsets { - cursor - .read_blob_into_buf(pos, &mut buf, ctx) - .await - .with_context(|| { - format!("Failed to read blob from virtual file {}", self.file.path) - })?; - let val = Value::des(&buf).with_context(|| { - format!( - "Failed to deserialize file blob from virtual file {}", - self.file.path - ) - })?; - match val { - Value::Image(img) => { - reconstruct_state.img = Some((entry_lsn, img)); - need_image = false; - break; - } - Value::WalRecord(rec) => { - let will_init = rec.will_init(); - reconstruct_state.records.push((entry_lsn, rec)); - if will_init { - // This WAL record initializes the page, so no need to go further back - need_image = false; - break; - } - } - } - } - - // If an older page image is needed to reconstruct the page, let the - // caller know. 
- if need_image { - Ok(ValueReconstructResult::Continue) - } else { - Ok(ValueReconstructResult::Complete) - } - } - // Look up the keys in the provided keyspace and update // the reconstruct state with whatever is found. // @@ -1669,8 +1580,9 @@ pub(crate) mod test { use super::*; use crate::repository::Value; use crate::tenant::harness::TIMELINE_ID; + use crate::tenant::storage_layer::{Layer, ResidentLayer}; use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; - use crate::tenant::Tenant; + use crate::tenant::{Tenant, Timeline}; use crate::{ context::DownloadBehavior, task_mgr::TaskKind, @@ -1964,9 +1876,8 @@ pub(crate) mod test { res?; } - let resident = writer - .finish(entries_meta.key_range.end, &timeline, &ctx) - .await?; + let (desc, path) = writer.finish(entries_meta.key_range.end, &ctx).await?; + let resident = Layer::finish_creating(harness.conf, &timeline, desc, &path)?; let inner = resident.get_as_delta(&ctx).await?; @@ -2046,6 +1957,7 @@ pub(crate) mod test { .await .likely_resident_layers() .next() + .cloned() .unwrap(); { @@ -2120,7 +2032,8 @@ pub(crate) mod test { .read() .await .likely_resident_layers() - .find(|x| x != &initdb_layer) + .find(|&x| x != &initdb_layer) + .cloned() .unwrap(); // create a copy for the timeline, so we don't overwrite the file @@ -2155,7 +2068,8 @@ pub(crate) mod test { .await .unwrap(); - let copied_layer = writer.finish(Key::MAX, &branch, ctx).await.unwrap(); + let (desc, path) = writer.finish(Key::MAX, ctx).await.unwrap(); + let copied_layer = Layer::finish_creating(tenant.conf, &branch, desc, &path).unwrap(); copied_layer.get_as_delta(ctx).await.unwrap(); @@ -2283,7 +2197,9 @@ pub(crate) mod test { for (key, lsn, value) in deltas { writer.put_value(key, lsn, value, ctx).await?; } - let delta_layer = writer.finish(key_end, tline, ctx).await?; + + let (desc, path) = writer.finish(key_end, ctx).await?; + let delta_layer = Layer::finish_creating(tenant.conf, tline, desc, &path)?; Ok::<_, 
anyhow::Error>(delta_layer) } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index aa308ba3c1..16ba0fda94 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -32,9 +32,6 @@ use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader}; use crate::tenant::disk_btree::{ DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection, }; -use crate::tenant::storage_layer::{ - LayerAccessStats, ValueReconstructResult, ValueReconstructState, -}; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, @@ -137,7 +134,6 @@ pub struct ImageLayer { pub desc: PersistentLayerDesc, // This entry contains an image of all pages as of this LSN, should be the same as desc.lsn pub lsn: Lsn, - access_stats: LayerAccessStats, inner: OnceCell, } @@ -255,7 +251,6 @@ impl ImageLayer { /// not loaded already. /// async fn load(&self, ctx: &RequestContext) -> Result<&ImageLayerInner> { - self.access_stats.record_access(ctx); self.inner .get_or_try_init(|| self.load_inner(ctx)) .await @@ -306,7 +301,6 @@ impl ImageLayer { metadata.len(), ), // Now we assume image layer ALWAYS covers the full range. This may change in the future. 
lsn: summary.lsn, - access_stats: Default::default(), inner: OnceCell::new(), }) } @@ -429,46 +423,6 @@ impl ImageLayerInner { }) } - pub(super) async fn get_value_reconstruct_data( - &self, - key: Key, - reconstruct_state: &mut ValueReconstructState, - ctx: &RequestContext, - ) -> anyhow::Result { - let block_reader = FileBlockReader::new(&self.file, self.file_id); - let tree_reader = - DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader); - - let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; - key.write_to_byte_slice(&mut keybuf); - if let Some(offset) = tree_reader - .get( - &keybuf, - &RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::ImageLayerBtreeNode) - .build(), - ) - .await? - { - let blob = block_reader - .block_cursor() - .read_blob( - offset, - &RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::ImageLayerValue) - .build(), - ) - .await - .with_context(|| format!("failed to read value from offset {}", offset))?; - let value = Bytes::from(blob); - - reconstruct_state.img = Some((self.lsn, value)); - Ok(ValueReconstructResult::Complete) - } else { - Ok(ValueReconstructResult::Missing) - } - } - // Look up the keys in the provided keyspace and update // the reconstruct state with whatever is found. pub(super) async fn get_values_reconstruct_data( @@ -753,6 +707,10 @@ struct ImageLayerWriterInner { } impl ImageLayerWriterInner { + fn size(&self) -> u64 { + self.tree.borrow_writer().size() + self.blob_writer.size() + } + /// /// Start building a new image layer. 
/// @@ -1044,6 +1002,10 @@ impl ImageLayerWriter { .finish(timeline, ctx, Some(end_key)) .await } + + pub(crate) fn size(&self) -> u64 { + self.inner.as_ref().unwrap().size() + } } impl Drop for ImageLayerWriter { diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index f9010ae8a6..57d93feaaf 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -10,11 +10,11 @@ use crate::page_cache::PAGE_SZ; use crate::repository::{Key, Value}; use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef}; use crate::tenant::ephemeral_file::EphemeralFile; -use crate::tenant::storage_layer::ValueReconstructResult; use crate::tenant::timeline::GetVectoredError; -use crate::tenant::{PageReconstructError, Timeline}; +use crate::tenant::PageReconstructError; use crate::{l0_flush, page_cache, walrecord}; -use anyhow::{anyhow, ensure, Result}; +use anyhow::{anyhow, Result}; +use camino::Utf8PathBuf; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; @@ -34,8 +34,7 @@ use std::sync::atomic::{AtomicU64, AtomicUsize}; use tokio::sync::{RwLock, RwLockWriteGuard}; use super::{ - DeltaLayerWriter, ResidentLayer, ValueReconstructSituation, ValueReconstructState, - ValuesReconstructState, + DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState, }; #[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)] @@ -55,9 +54,6 @@ pub struct InMemoryLayer { /// Writes are only allowed when this is `None`. pub(crate) end_lsn: OnceLock, - /// Used for traversal path. Cached representation of the in-memory layer before frozen. - local_path_str: Arc, - /// Used for traversal path. Cached representation of the in-memory layer after frozen. 
frozen_local_path_str: OnceLock>, @@ -248,12 +244,6 @@ impl InMemoryLayer { self.start_lsn..self.end_lsn_or_max() } - pub(crate) fn local_path_str(&self) -> &Arc { - self.frozen_local_path_str - .get() - .unwrap_or(&self.local_path_str) - } - /// debugging function to print out the contents of the layer /// /// this is likely completly unused @@ -303,60 +293,6 @@ impl InMemoryLayer { Ok(()) } - /// Look up given value in the layer. - pub(crate) async fn get_value_reconstruct_data( - &self, - key: Key, - lsn_range: Range, - reconstruct_state: &mut ValueReconstructState, - ctx: &RequestContext, - ) -> anyhow::Result { - ensure!(lsn_range.start >= self.start_lsn); - let mut need_image = true; - - let ctx = RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::InMemoryLayer) - .build(); - - let inner = self.inner.read().await; - - let reader = inner.file.block_cursor(); - - // Scan the page versions backwards, starting from `lsn`. - if let Some(vec_map) = inner.index.get(&key) { - let slice = vec_map.slice_range(lsn_range); - for (entry_lsn, pos) in slice.iter().rev() { - let buf = reader.read_blob(*pos, &ctx).await?; - let value = Value::des(&buf)?; - match value { - Value::Image(img) => { - reconstruct_state.img = Some((*entry_lsn, img)); - return Ok(ValueReconstructResult::Complete); - } - Value::WalRecord(rec) => { - let will_init = rec.will_init(); - reconstruct_state.records.push((*entry_lsn, rec)); - if will_init { - // This WAL record initializes the page, so no need to go further back - need_image = false; - break; - } - } - } - } - } - - // release lock on 'inner' - - // If an older page image is needed to reconstruct the page, let the - // caller know. - if need_image { - Ok(ValueReconstructResult::Continue) - } else { - Ok(ValueReconstructResult::Complete) - } - } - // Look up the keys in the provided keyspace and update // the reconstruct state with whatever is found. 
// @@ -449,20 +385,17 @@ impl InMemoryLayer { timeline_id: TimelineId, tenant_shard_id: TenantShardId, start_lsn: Lsn, + gate_guard: utils::sync::gate::GateGuard, ctx: &RequestContext, ) -> Result { trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); - let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, ctx).await?; + let file = + EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate_guard, ctx).await?; let key = InMemoryLayerFileId(file.page_cache_file_id()); Ok(InMemoryLayer { file_id: key, - local_path_str: { - let mut buf = String::new(); - inmem_layer_log_display(&mut buf, timeline_id, start_lsn, Lsn::MAX).unwrap(); - buf.into() - }, frozen_local_path_str: OnceLock::new(), conf, timeline_id, @@ -482,8 +415,7 @@ impl InMemoryLayer { /// Common subroutine of the public put_wal_record() and put_page_image() functions. /// Adds the page version to the in-memory tree - - pub(crate) async fn put_value( + pub async fn put_value( &self, key: Key, lsn: Lsn, @@ -548,8 +480,6 @@ impl InMemoryLayer { /// Records the end_lsn for non-dropped layers. /// `end_lsn` is exclusive pub async fn freeze(&self, end_lsn: Lsn) { - let inner = self.inner.write().await; - assert!( self.start_lsn < end_lsn, "{} >= {}", @@ -567,9 +497,13 @@ impl InMemoryLayer { }) .expect("frozen_local_path_str set only once"); - for vec_map in inner.index.values() { - for (lsn, _pos) in vec_map.as_slice() { - assert!(*lsn < end_lsn); + #[cfg(debug_assertions)] + { + let inner = self.inner.write().await; + for vec_map in inner.index.values() { + for (lsn, _pos) in vec_map.as_slice() { + assert!(*lsn < end_lsn); + } } } } @@ -579,12 +513,12 @@ impl InMemoryLayer { /// if there are no matching keys. 
/// /// Returns a new delta layer with all the same data as this in-memory layer - pub(crate) async fn write_to_disk( + pub async fn write_to_disk( &self, - timeline: &Arc, ctx: &RequestContext, key_range: Option>, - ) -> Result> { + l0_flush_global_state: &l0_flush::Inner, + ) -> Result> { // Grab the lock in read-mode. We hold it over the I/O, but because this // layer is not writeable anymore, no one should be trying to acquire the // write lock on it, so we shouldn't block anyone. There's one exception @@ -596,9 +530,8 @@ impl InMemoryLayer { // rare though, so we just accept the potential latency hit for now. let inner = self.inner.read().await; - let l0_flush_global_state = timeline.l0_flush_global_state.inner().clone(); use l0_flush::Inner; - let _concurrency_permit = match &*l0_flush_global_state { + let _concurrency_permit = match l0_flush_global_state { Inner::PageCached => None, Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await), }; @@ -628,7 +561,7 @@ impl InMemoryLayer { ) .await?; - match &*l0_flush_global_state { + match l0_flush_global_state { l0_flush::Inner::PageCached => { let ctx = RequestContextBuilder::extend(ctx) .page_content_kind(PageContentKind::InMemoryLayer) @@ -693,7 +626,7 @@ impl InMemoryLayer { } // MAX is used here because we identify L0 layers by full key range - let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, ctx).await?; + let (desc, path) = delta_layer_writer.finish(Key::MAX, ctx).await?; // Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()``. // @@ -705,6 +638,6 @@ impl InMemoryLayer { // we dirtied when writing to the filesystem have been flushed and marked !dirty. 
drop(_concurrency_permit); - Ok(Some(delta_layer)) + Ok(Some((desc, path))) } } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 5732779e44..83450d24bb 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -24,8 +24,7 @@ use super::delta_layer::{self, DeltaEntry}; use super::image_layer::{self}; use super::{ AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName, - LayerVisibilityHint, PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, - ValuesReconstructState, + LayerVisibilityHint, PersistentLayerDesc, ValuesReconstructState, }; use utils::generation::Generation; @@ -301,42 +300,6 @@ impl Layer { self.0.delete_on_drop(); } - /// Return data needed to reconstruct given page at LSN. - /// - /// It is up to the caller to collect more data from the previous layer and - /// perform WAL redo, if necessary. - /// - /// # Cancellation-Safety - /// - /// This method is cancellation-safe. 
- pub(crate) async fn get_value_reconstruct_data( - &self, - key: Key, - lsn_range: Range, - reconstruct_data: &mut ValueReconstructState, - ctx: &RequestContext, - ) -> anyhow::Result { - use anyhow::ensure; - - let layer = self.0.get_or_maybe_download(true, Some(ctx)).await?; - self.0.access_stats.record_access(ctx); - - if self.layer_desc().is_delta { - ensure!(lsn_range.start >= self.layer_desc().lsn_range.start); - ensure!(self.layer_desc().key_range.contains(&key)); - } else { - ensure!(self.layer_desc().key_range.contains(&key)); - ensure!(lsn_range.start >= self.layer_desc().image_layer_lsn()); - ensure!(lsn_range.end >= self.layer_desc().image_layer_lsn()); - } - - layer - .get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx) - .instrument(tracing::debug_span!("get_value_reconstruct_data", layer=%self)) - .await - .with_context(|| format!("get_value_reconstruct_data for layer {self}")) - } - pub(crate) async fn get_values_reconstruct_data( &self, keyspace: KeySpace, @@ -353,7 +316,7 @@ impl Layer { other => GetVectoredError::Other(anyhow::anyhow!(other)), })?; - self.0.access_stats.record_access(ctx); + self.record_access(ctx); layer .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx) @@ -433,18 +396,18 @@ impl Layer { self.0.info(reset) } - pub(crate) fn access_stats(&self) -> &LayerAccessStats { - &self.0.access_stats + pub(crate) fn latest_activity(&self) -> SystemTime { + self.0.access_stats.latest_activity() + } + + pub(crate) fn visibility(&self) -> LayerVisibilityHint { + self.0.access_stats.visibility() } pub(crate) fn local_path(&self) -> &Utf8Path { &self.0.path } - pub(crate) fn debug_str(&self) -> &Arc { - &self.0.debug_str - } - pub(crate) fn metadata(&self) -> LayerFileMetadata { self.0.metadata() } @@ -488,13 +451,31 @@ impl Layer { } } + fn record_access(&self, ctx: &RequestContext) { + if self.0.access_stats.record_access(ctx) { + // Visibility was modified to Visible + tracing::info!( + 
"Layer {} became visible as a result of access", + self.0.desc.key() + ); + if let Some(tl) = self.0.timeline.upgrade() { + tl.metrics + .visible_physical_size_gauge + .add(self.0.desc.file_size) + } + } + } + pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) { - let old_visibility = self.access_stats().set_visibility(visibility.clone()); + let old_visibility = self.0.access_stats.set_visibility(visibility.clone()); use LayerVisibilityHint::*; match (old_visibility, visibility) { (Visible, Covered) => { // Subtract this layer's contribution to the visible size metric if let Some(tl) = self.0.timeline.upgrade() { + debug_assert!( + tl.metrics.visible_physical_size_gauge.get() >= self.0.desc.file_size + ); tl.metrics .visible_physical_size_gauge .sub(self.0.desc.file_size) @@ -519,7 +500,7 @@ impl Layer { /// /// However when we want something evicted, we cannot evict it right away as there might be current /// reads happening on it. For example: it has been searched from [`LayerMap::search`] but not yet -/// read with [`Layer::get_value_reconstruct_data`]. +/// read with [`Layer::get_values_reconstruct_data`]. /// /// [`LayerMap::search`]: crate::tenant::layer_map::LayerMap::search #[derive(Debug)] @@ -600,9 +581,6 @@ struct LayerInner { /// Full path to the file; unclear if this should exist anymore. path: Utf8PathBuf, - /// String representation of the layer, used for traversal id. - debug_str: Arc, - desc: PersistentLayerDesc, /// Timeline access is needed for remote timeline client and metrics. 
@@ -715,6 +693,9 @@ impl Drop for LayerInner { } if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) { + debug_assert!( + timeline.metrics.visible_physical_size_gauge.get() >= self.desc.file_size + ); timeline .metrics .visible_physical_size_gauge @@ -836,9 +817,6 @@ impl LayerInner { LayerInner { conf, - debug_str: { - format!("timelines/{}/{}", timeline.timeline_id, desc.layer_name()).into() - }, path: local_path, desc, timeline: Arc::downgrade(timeline), @@ -1759,28 +1737,6 @@ impl DownloadedLayer { .map_err(|e| anyhow::anyhow!("layer load failed earlier: {e}")) } - async fn get_value_reconstruct_data( - &self, - key: Key, - lsn_range: Range, - reconstruct_data: &mut ValueReconstructState, - owner: &Arc, - ctx: &RequestContext, - ) -> anyhow::Result { - use LayerKind::*; - - match self.get(owner, ctx).await? { - Delta(d) => { - d.get_value_reconstruct_data(key, lsn_range, reconstruct_data, ctx) - .await - } - Image(i) => { - i.get_value_reconstruct_data(key, reconstruct_data, ctx) - .await - } - } - } - async fn get_values_reconstruct_data( &self, keyspace: KeySpace, @@ -1879,7 +1835,7 @@ impl ResidentLayer { // this is valid because the DownloadedLayer::kind is a OnceCell, not a // Mutex, so we cannot go and deinitialize the value with OnceCell::take // while it's being held. 
- owner.access_stats.record_access(ctx); + self.owner.record_access(ctx); delta_layer::DeltaLayerInner::load_keys(d, ctx) .await diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 423cde001c..bffd2db800 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -39,7 +39,7 @@ async fn smoke_test() { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.likely_resident_layers().collect::>() + layers.likely_resident_layers().cloned().collect::>() }; assert_eq!(layers.len(), 1); @@ -50,13 +50,26 @@ async fn smoke_test() { // all layers created at pageserver are like `layer`, initialized with strong // Arc. + let controlfile_keyspace = KeySpace { + ranges: vec![CONTROLFILE_KEY..CONTROLFILE_KEY.next()], + }; + let img_before = { - let mut data = ValueReconstructState::default(); + let mut data = ValuesReconstructState::default(); layer - .get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx) + .get_values_reconstruct_data( + controlfile_keyspace.clone(), + Lsn(0x10)..Lsn(0x11), + &mut data, + &ctx, + ) .await .unwrap(); - data.img + data.keys + .remove(&CONTROLFILE_KEY) + .expect("must be present") + .expect("should not error") + .img .take() .expect("tenant harness writes the control file") }; @@ -74,13 +87,24 @@ async fn smoke_test() { // on accesses when the layer is evicted, it will automatically be downloaded. 
let img_after = { - let mut data = ValueReconstructState::default(); + let mut data = ValuesReconstructState::default(); layer - .get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx) + .get_values_reconstruct_data( + controlfile_keyspace.clone(), + Lsn(0x10)..Lsn(0x11), + &mut data, + &ctx, + ) .instrument(download_span.clone()) .await .unwrap(); - data.img.take().unwrap() + data.keys + .remove(&CONTROLFILE_KEY) + .expect("must be present") + .expect("should not error") + .img + .take() + .expect("tenant harness writes the control file") }; assert_eq!(img_before, img_after); @@ -152,7 +176,7 @@ async fn smoke_test() { { let layers = &[layer]; let mut g = timeline.layers.write().await; - g.finish_gc_timeline(layers); + g.open_mut().unwrap().finish_gc_timeline(layers); // this just updates the remote_physical_size for demonstration purposes rtc.schedule_gc_update(layers).unwrap(); } @@ -192,7 +216,7 @@ async fn evict_and_wait_on_wanted_deleted() { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.likely_resident_layers().collect::>() + layers.likely_resident_layers().cloned().collect::>() }; assert_eq!(layers.len(), 1); @@ -236,7 +260,7 @@ async fn evict_and_wait_on_wanted_deleted() { // the deletion of the layer in remote_storage happens. 
{ let mut layers = timeline.layers.write().await; - layers.finish_gc_timeline(&[layer]); + layers.open_mut().unwrap().finish_gc_timeline(&[layer]); } SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await; @@ -277,7 +301,7 @@ fn read_wins_pending_eviction() { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.likely_resident_layers().collect::>() + layers.likely_resident_layers().cloned().collect::>() }; assert_eq!(layers.len(), 1); @@ -409,7 +433,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.likely_resident_layers().collect::>() + layers.likely_resident_layers().cloned().collect::>() }; assert_eq!(layers.len(), 1); @@ -578,7 +602,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.likely_resident_layers().collect::>() + layers.likely_resident_layers().cloned().collect::>() }; assert_eq!(layers.len(), 1); @@ -658,7 +682,7 @@ async fn evict_and_wait_does_not_wait_for_download() { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.likely_resident_layers().collect::>() + layers.likely_resident_layers().cloned().collect::>() }; assert_eq!(layers.len(), 1); @@ -777,9 +801,9 @@ async fn eviction_cancellation_on_drop() { let (evicted_layer, not_evicted) = { let mut layers = { let mut guard = timeline.layers.write().await; - let layers = guard.likely_resident_layers().collect::>(); + let layers = guard.likely_resident_layers().cloned().collect::>(); // remove the layers from layermap - guard.finish_gc_timeline(&layers); + guard.open_mut().unwrap().finish_gc_timeline(&layers); layers }; @@ -830,7 +854,7 @@ async fn eviction_cancellation_on_drop() { fn layer_size() { assert_eq!(size_of::(), 8); assert_eq!(size_of::(), 104); - assert_eq!(size_of::(), 
312); + assert_eq!(size_of::(), 296); // it also has the utf8 path } diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs index a4091a890c..d7bfe48c60 100644 --- a/pageserver/src/tenant/storage_layer/split_writer.rs +++ b/pageserver/src/tenant/storage_layer/split_writer.rs @@ -1,12 +1,13 @@ -use std::sync::Arc; +use std::{ops::Range, sync::Arc}; use bytes::Bytes; use pageserver_api::key::{Key, KEY_SIZE}; use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId}; -use crate::{config::PageServerConf, context::RequestContext, tenant::Timeline}; +use crate::tenant::storage_layer::Layer; +use crate::{config::PageServerConf, context::RequestContext, repository::Value, tenant::Timeline}; -use super::{ImageLayerWriter, ResidentLayer}; +use super::{DeltaLayerWriter, ImageLayerWriter, ResidentLayer}; /// An image writer that takes images and produces multiple image layers. The interface does not /// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files @@ -98,6 +99,111 @@ impl SplitImageLayerWriter { generated_layers.push(inner.finish_with_end_key(tline, end_key, ctx).await?); Ok(generated_layers) } + + /// When split writer fails, the caller should call this function and handle partially generated layers. + #[allow(dead_code)] + pub(crate) async fn take(self) -> anyhow::Result<(Vec, ImageLayerWriter)> { + Ok((self.generated_layers, self.inner)) + } +} + +/// A delta writer that takes key-lsn-values and produces multiple delta layers. The interface does not +/// guarantee atomicity (i.e., if the delta layer generation fails, there might be leftover files +/// to be cleaned up). 
+#[must_use] +pub struct SplitDeltaLayerWriter { + inner: DeltaLayerWriter, + target_layer_size: u64, + generated_layers: Vec, + conf: &'static PageServerConf, + timeline_id: TimelineId, + tenant_shard_id: TenantShardId, + lsn_range: Range, +} + +impl SplitDeltaLayerWriter { + pub async fn new( + conf: &'static PageServerConf, + timeline_id: TimelineId, + tenant_shard_id: TenantShardId, + start_key: Key, + lsn_range: Range, + target_layer_size: u64, + ctx: &RequestContext, + ) -> anyhow::Result { + Ok(Self { + target_layer_size, + inner: DeltaLayerWriter::new( + conf, + timeline_id, + tenant_shard_id, + start_key, + lsn_range.clone(), + ctx, + ) + .await?, + generated_layers: Vec::new(), + conf, + timeline_id, + tenant_shard_id, + lsn_range, + }) + } + + pub async fn put_value( + &mut self, + key: Key, + lsn: Lsn, + val: Value, + tline: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + // The current estimation is key size plus LSN size plus value size estimation. This is not an accurate + // number, and therefore the final layer size could be a little bit larger or smaller than the target. + let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */; + if self.inner.num_keys() >= 1 + && self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size + { + let next_delta_writer = DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + key, + self.lsn_range.clone(), + ctx, + ) + .await?; + let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer); + let (desc, path) = prev_delta_writer.finish(key, ctx).await?; + let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?; + self.generated_layers.push(delta_layer); + } + self.inner.put_value(key, lsn, val, ctx).await + } + + pub(crate) async fn finish( + self, + tline: &Arc, + ctx: &RequestContext, + end_key: Key, + ) -> anyhow::Result> { + let Self { + mut generated_layers, + inner, + .. 
+ } = self; + + let (desc, path) = inner.finish(end_key, ctx).await?; + let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?; + generated_layers.push(delta_layer); + Ok(generated_layers) + } + + /// When split writer fails, the caller should call this function and handle partially generated layers. + #[allow(dead_code)] + pub(crate) async fn take(self) -> anyhow::Result<(Vec, DeltaLayerWriter)> { + Ok((self.generated_layers, self.inner)) + } } #[cfg(test)] @@ -138,7 +244,7 @@ mod tests { .await .unwrap(); - let mut writer = SplitImageLayerWriter::new( + let mut image_writer = SplitImageLayerWriter::new( tenant.conf, tline.timeline_id, tenant.tenant_shard_id, @@ -150,11 +256,42 @@ mod tests { .await .unwrap(); - writer + let mut delta_writer = SplitDeltaLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + get_key(0), + Lsn(0x18)..Lsn(0x20), + 4 * 1024 * 1024, + &ctx, + ) + .await + .unwrap(); + + image_writer .put_image(get_key(0), get_img(0), &tline, &ctx) .await .unwrap(); - let layers = writer.finish(&tline, &ctx, get_key(10)).await.unwrap(); + let layers = image_writer + .finish(&tline, &ctx, get_key(10)) + .await + .unwrap(); + assert_eq!(layers.len(), 1); + + delta_writer + .put_value( + get_key(0), + Lsn(0x18), + Value::Image(get_img(0)), + &tline, + &ctx, + ) + .await + .unwrap(); + let layers = delta_writer + .finish(&tline, &ctx, get_key(10)) + .await + .unwrap(); assert_eq!(layers.len(), 1); } @@ -170,7 +307,7 @@ mod tests { .await .unwrap(); - let mut writer = SplitImageLayerWriter::new( + let mut image_writer = SplitImageLayerWriter::new( tenant.conf, tline.timeline_id, tenant.tenant_shard_id, @@ -181,26 +318,58 @@ mod tests { ) .await .unwrap(); + let mut delta_writer = SplitDeltaLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + get_key(0), + Lsn(0x18)..Lsn(0x20), + 4 * 1024 * 1024, + &ctx, + ) + .await + .unwrap(); const N: usize = 2000; for i in 0..N { let i = i as u32; 
- writer + image_writer .put_image(get_key(i), get_large_img(), &tline, &ctx) .await .unwrap(); + delta_writer + .put_value( + get_key(i), + Lsn(0x20), + Value::Image(get_large_img()), + &tline, + &ctx, + ) + .await + .unwrap(); } - let layers = writer + let image_layers = image_writer .finish(&tline, &ctx, get_key(N as u32)) .await .unwrap(); - assert_eq!(layers.len(), N / 512 + 1); - for idx in 0..layers.len() { - assert_ne!(layers[idx].layer_desc().key_range.start, Key::MIN); - assert_ne!(layers[idx].layer_desc().key_range.end, Key::MAX); + let delta_layers = delta_writer + .finish(&tline, &ctx, get_key(N as u32)) + .await + .unwrap(); + assert_eq!(image_layers.len(), N / 512 + 1); + assert_eq!(delta_layers.len(), N / 512 + 1); + for idx in 0..image_layers.len() { + assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN); + assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX); + assert_ne!(delta_layers[idx].layer_desc().key_range.start, Key::MIN); + assert_ne!(delta_layers[idx].layer_desc().key_range.end, Key::MAX); if idx > 0 { assert_eq!( - layers[idx - 1].layer_desc().key_range.end, - layers[idx].layer_desc().key_range.start + image_layers[idx - 1].layer_desc().key_range.end, + image_layers[idx].layer_desc().key_range.start + ); + assert_eq!( + delta_layers[idx - 1].layer_desc().key_range.end, + delta_layers[idx].layer_desc().key_range.start ); } } @@ -218,7 +387,7 @@ mod tests { .await .unwrap(); - let mut writer = SplitImageLayerWriter::new( + let mut image_writer = SplitImageLayerWriter::new( tenant.conf, tline.timeline_id, tenant.tenant_shard_id, @@ -230,15 +399,56 @@ mod tests { .await .unwrap(); - writer + let mut delta_writer = SplitDeltaLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + get_key(0), + Lsn(0x18)..Lsn(0x20), + 4 * 1024, + &ctx, + ) + .await + .unwrap(); + + image_writer .put_image(get_key(0), get_img(0), &tline, &ctx) .await .unwrap(); - writer + image_writer 
.put_image(get_key(1), get_large_img(), &tline, &ctx) .await .unwrap(); - let layers = writer.finish(&tline, &ctx, get_key(10)).await.unwrap(); + let layers = image_writer + .finish(&tline, &ctx, get_key(10)) + .await + .unwrap(); + assert_eq!(layers.len(), 2); + + delta_writer + .put_value( + get_key(0), + Lsn(0x18), + Value::Image(get_img(0)), + &tline, + &ctx, + ) + .await + .unwrap(); + delta_writer + .put_value( + get_key(1), + Lsn(0x1A), + Value::Image(get_large_img()), + &tline, + &ctx, + ) + .await + .unwrap(); + let layers = delta_writer + .finish(&tline, &ctx, get_key(10)) + .await + .unwrap(); assert_eq!(layers.len(), 2); } } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 230362d81a..b4706ea59d 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -407,9 +407,16 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { error_run_count += 1; let wait_duration = Duration::from_secs_f64(wait_duration); - error!( - "Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}", - ); + if matches!(e, crate::tenant::GcError::TimelineCancelled) { + // Timeline was cancelled during gc. We might either be in an event + // that affects the entire tenant (tenant deletion, pageserver shutdown), + // or in one that affects the timeline only (timeline deletion). + // Therefore, don't exit the loop. 
+ info!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}"); + } else { + error!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}"); + } + wait_duration } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index be72e15c19..f810df5a56 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -22,8 +22,8 @@ use handle::ShardTimelineId; use once_cell::sync::Lazy; use pageserver_api::{ key::{ - AUX_FILES_KEY, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, - NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE, + KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, + NON_INHERITED_SPARSE_RANGE, }, keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}, models::{ @@ -59,10 +59,7 @@ use std::{ collections::{BTreeMap, HashMap, HashSet}, sync::atomic::AtomicU64, }; -use std::{ - cmp::{max, min}, - ops::ControlFlow, -}; +use std::{cmp::min, ops::ControlFlow}; use std::{ collections::btree_map::Entry, ops::{Deref, Range}, @@ -87,8 +84,8 @@ use crate::{ disk_usage_eviction_task::finite_f32, tenant::storage_layer::{ AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer, - LayerAccessStatsReset, LayerName, ResidentLayer, ValueReconstructResult, - ValueReconstructState, ValuesReconstructState, + LayerAccessStatsReset, LayerName, ResidentLayer, ValueReconstructState, + ValuesReconstructState, }, }; use crate::{ @@ -140,7 +137,7 @@ use self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; -use super::{config::TenantConf, upload_queue::NotInitialized}; +use super::{config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized}; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; use super::{remote_timeline_client::index::IndexPart, 
storage_layer::LayerFringe}; use super::{ @@ -530,6 +527,12 @@ pub(crate) enum PageReconstructError { MissingKey(MissingKeyError), } +impl From for PageReconstructError { + fn from(_: layer_manager::Shutdown) -> Self { + PageReconstructError::Cancelled + } +} + impl GetVectoredError { #[cfg(test)] pub(crate) fn is_missing_key_error(&self) -> bool { @@ -537,13 +540,18 @@ impl GetVectoredError { } } +impl From for GetVectoredError { + fn from(_: layer_manager::Shutdown) -> Self { + GetVectoredError::Cancelled + } +} + pub struct MissingKeyError { key: Key, shard: ShardNumber, cont_lsn: Lsn, request_lsn: Lsn, ancestor_lsn: Option, - traversal_path: Vec, backtrace: Option, } @@ -564,18 +572,6 @@ impl std::fmt::Display for MissingKeyError { write!(f, ", ancestor {}", ancestor_lsn)?; } - if !self.traversal_path.is_empty() { - writeln!(f)?; - } - - for (r, c, l) in &self.traversal_path { - writeln!( - f, - "layer traversal: result {:?}, cont_lsn {}, layer: {}", - r, c, l, - )?; - } - if let Some(ref backtrace) = self.backtrace { write!(f, "\n{}", backtrace)?; } @@ -613,6 +609,12 @@ pub(crate) enum CreateImageLayersError { Other(#[from] anyhow::Error), } +impl From for CreateImageLayersError { + fn from(_: layer_manager::Shutdown) -> Self { + CreateImageLayersError::Cancelled + } +} + #[derive(thiserror::Error, Debug, Clone)] pub(crate) enum FlushLayerError { /// Timeline cancellation token was cancelled @@ -650,6 +652,12 @@ impl FlushLayerError { } } +impl From for FlushLayerError { + fn from(_: layer_manager::Shutdown) -> Self { + FlushLayerError::Cancelled + } +} + #[derive(thiserror::Error, Debug)] pub(crate) enum GetVectoredError { #[error("timeline shutting down")] @@ -704,6 +712,7 @@ pub(crate) enum CompactFlags { ForceRepartition, ForceImageLayerCreation, EnhancedGcBottomMostCompaction, + DryRun, } impl std::fmt::Debug for Timeline { @@ -917,119 +926,44 @@ impl Timeline { self.timeline_get_throttle.throttle(ctx, 1).await; - match self.conf.get_impl { - 
GetImpl::Legacy => { - let reconstruct_state = ValueReconstructState { - records: Vec::new(), - img: None, - }; + let keyspace = KeySpace { + ranges: vec![key..key.next()], + }; - self.get_impl(key, lsn, reconstruct_state, ctx).await - } - GetImpl::Vectored => { - let keyspace = KeySpace { - ranges: vec![key..key.next()], - }; + // Initialise the reconstruct state for the key with the cache + // entry returned above. + let mut reconstruct_state = ValuesReconstructState::new(); - // Initialise the reconstruct state for the key with the cache - // entry returned above. - let mut reconstruct_state = ValuesReconstructState::new(); + let vectored_res = self + .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx) + .await; - let vectored_res = self - .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx) - .await; - - if self.conf.validate_vectored_get { - self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx) - .await; - } - - let key_value = vectored_res?.pop_first(); - match key_value { - Some((got_key, value)) => { - if got_key != key { - error!( - "Expected {}, but singular vectored get returned {}", - key, got_key - ); - Err(PageReconstructError::Other(anyhow!( - "Singular vectored get returned wrong key" - ))) - } else { - value - } - } - None => Err(PageReconstructError::MissingKey(MissingKeyError { - key, - shard: self.shard_identity.get_shard_number(&key), - cont_lsn: Lsn(0), - request_lsn: lsn, - ancestor_lsn: None, - traversal_path: Vec::new(), - backtrace: None, - })), + let key_value = vectored_res?.pop_first(); + match key_value { + Some((got_key, value)) => { + if got_key != key { + error!( + "Expected {}, but singular vectored get returned {}", + key, got_key + ); + Err(PageReconstructError::Other(anyhow!( + "Singular vectored get returned wrong key" + ))) + } else { + value } } + None => Err(PageReconstructError::MissingKey(MissingKeyError { + key, + shard: self.shard_identity.get_shard_number(&key), + 
cont_lsn: Lsn(0), + request_lsn: lsn, + ancestor_lsn: None, + backtrace: None, + })), } } - /// Not subject to [`Self::timeline_get_throttle`]. - async fn get_impl( - &self, - key: Key, - lsn: Lsn, - mut reconstruct_state: ValueReconstructState, - ctx: &RequestContext, - ) -> Result { - // XXX: structured stats collection for layer eviction here. - trace!( - "get page request for {}@{} from task kind {:?}", - key, - lsn, - ctx.task_kind() - ); - - let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME - .for_get_kind(GetKind::Singular) - .start_timer(); - let path = self - .get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx) - .await?; - timer.stop_and_record(); - - let start = Instant::now(); - let res = self.reconstruct_value(key, lsn, reconstruct_state).await; - let elapsed = start.elapsed(); - crate::metrics::RECONSTRUCT_TIME - .for_get_kind(GetKind::Singular) - .observe(elapsed.as_secs_f64()); - - if cfg!(feature = "testing") - && res.is_err() - && !matches!(res, Err(PageReconstructError::Cancelled)) - { - // it can only be walredo issue - use std::fmt::Write; - - let mut msg = String::new(); - - path.into_iter().for_each(|(res, cont_lsn, layer)| { - writeln!( - msg, - "- layer traversal: result {res:?}, cont_lsn {cont_lsn}, layer: {}", - layer, - ) - .expect("string grows") - }); - - // this is to rule out or provide evidence that we could in some cases read a duplicate - // walrecord - tracing::info!("walredo failed, path:\n{msg}"); - } - - res - } - pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32; pub(crate) const VEC_GET_LAYERS_VISITED_WARN_THRESH: f64 = 512.0; @@ -1079,28 +1013,14 @@ impl Timeline { .throttle(ctx, key_count as usize) .await; - let res = match self.conf.get_vectored_impl { - GetVectoredImpl::Sequential => { - self.get_vectored_sequential_impl(keyspace, lsn, ctx).await - } - GetVectoredImpl::Vectored => { - let vectored_res = self - .get_vectored_impl( - keyspace.clone(), - lsn, - &mut ValuesReconstructState::new(), - ctx, - ) - 
.await; - - if self.conf.validate_vectored_get { - self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx) - .await; - } - - vectored_res - } - }; + let res = self + .get_vectored_impl( + keyspace.clone(), + lsn, + &mut ValuesReconstructState::new(), + ctx, + ) + .await; if let Some((metric, start)) = start { let elapsed = start.elapsed(); @@ -1189,65 +1109,6 @@ impl Timeline { vectored_res } - /// Not subject to [`Self::timeline_get_throttle`]. - pub(super) async fn get_vectored_sequential_impl( - &self, - keyspace: KeySpace, - lsn: Lsn, - ctx: &RequestContext, - ) -> Result>, GetVectoredError> { - let mut values = BTreeMap::new(); - - for range in keyspace.ranges { - let mut key = range.start; - while key != range.end { - let block = self - .get_impl(key, lsn, ValueReconstructState::default(), ctx) - .await; - - use PageReconstructError::*; - match block { - Err(Cancelled) => return Err(GetVectoredError::Cancelled), - Err(MissingKey(_)) - if NON_INHERITED_RANGE.contains(&key) - || NON_INHERITED_SPARSE_RANGE.contains(&key) => - { - // Ignore missing key error for aux key range. TODO: currently, we assume non_inherited_range == aux_key_range. - // When we add more types of keys into the page server, we should revisit this part of code and throw errors - // accordingly. - key = key.next(); - } - Err(MissingKey(err)) => { - return Err(GetVectoredError::MissingKey(err)); - } - Err(Other(err)) - if err - .to_string() - .contains("downloading evicted layer file failed") => - { - return Err(GetVectoredError::Other(err)) - } - Err(Other(err)) - if err - .chain() - .any(|cause| cause.to_string().contains("layer loading failed")) => - { - // The intent here is to achieve error parity with the vectored read path. - // When vectored read fails to load a layer it fails the whole read, hence - // we mimic this behaviour here to keep the validation happy. 
- return Err(GetVectoredError::Other(err)); - } - _ => { - values.insert(key, block); - key = key.next(); - } - } - } - } - - Ok(values) - } - pub(super) async fn get_vectored_impl( &self, keyspace: KeySpace, @@ -1318,113 +1179,6 @@ impl Timeline { Ok(results) } - /// Not subject to [`Self::timeline_get_throttle`]. - pub(super) async fn validate_get_vectored_impl( - &self, - vectored_res: &Result>, GetVectoredError>, - keyspace: KeySpace, - lsn: Lsn, - ctx: &RequestContext, - ) { - if keyspace.overlaps(&Key::metadata_key_range()) { - // skip validation for metadata key range - return; - } - - let sequential_res = self - .get_vectored_sequential_impl(keyspace.clone(), lsn, ctx) - .await; - - fn errors_match(lhs: &GetVectoredError, rhs: &GetVectoredError) -> bool { - use GetVectoredError::*; - match (lhs, rhs) { - (Oversized(l), Oversized(r)) => l == r, - (InvalidLsn(l), InvalidLsn(r)) => l == r, - (MissingKey(l), MissingKey(r)) => l.key == r.key, - (GetReadyAncestorError(_), GetReadyAncestorError(_)) => true, - (Other(_), Other(_)) => true, - _ => false, - } - } - - match (&sequential_res, vectored_res) { - (Err(GetVectoredError::Cancelled), _) => {}, - (_, Err(GetVectoredError::Cancelled)) => {}, - (Err(seq_err), Ok(_)) => { - panic!(concat!("Sequential get failed with {}, but vectored get did not", - " - keyspace={:?} lsn={}"), - seq_err, keyspace, lsn) }, - (Ok(_), Err(GetVectoredError::GetReadyAncestorError(GetReadyAncestorError::AncestorLsnTimeout(_)))) => { - // Sequential get runs after vectored get, so it is possible for the later - // to time out while waiting for its ancestor's Lsn to become ready and for the - // former to succeed (it essentially has a doubled wait time). 
- }, - (Ok(_), Err(vec_err)) => { - panic!(concat!("Vectored get failed with {}, but sequential get did not", - " - keyspace={:?} lsn={}"), - vec_err, keyspace, lsn) }, - (Err(seq_err), Err(vec_err)) => { - assert!(errors_match(seq_err, vec_err), - "Mismatched errors: {seq_err} != {vec_err} - keyspace={keyspace:?} lsn={lsn}")}, - (Ok(seq_values), Ok(vec_values)) => { - seq_values.iter().zip(vec_values.iter()).for_each(|((seq_key, seq_res), (vec_key, vec_res))| { - assert_eq!(seq_key, vec_key); - match (seq_res, vec_res) { - (Ok(seq_blob), Ok(vec_blob)) => { - Self::validate_key_equivalence(seq_key, &keyspace, lsn, seq_blob, vec_blob); - }, - (Err(err), Ok(_)) => { - panic!( - concat!("Sequential get failed with {} for key {}, but vectored get did not", - " - keyspace={:?} lsn={}"), - err, seq_key, keyspace, lsn) }, - (Ok(_), Err(err)) => { - panic!( - concat!("Vectored get failed with {} for key {}, but sequential get did not", - " - keyspace={:?} lsn={}"), - err, seq_key, keyspace, lsn) }, - (Err(_), Err(_)) => {} - } - }) - } - } - } - - fn validate_key_equivalence( - key: &Key, - keyspace: &KeySpace, - lsn: Lsn, - seq: &Bytes, - vec: &Bytes, - ) { - if *key == AUX_FILES_KEY { - // The value reconstruct of AUX_FILES_KEY from records is not deterministic - // since it uses a hash map under the hood. Hence, deserialise both results - // before comparing. - let seq_aux_dir_res = AuxFilesDirectory::des(seq); - let vec_aux_dir_res = AuxFilesDirectory::des(vec); - match (&seq_aux_dir_res, &vec_aux_dir_res) { - (Ok(seq_aux_dir), Ok(vec_aux_dir)) => { - assert_eq!( - seq_aux_dir, vec_aux_dir, - "Mismatch for key {} - keyspace={:?} lsn={}", - key, keyspace, lsn - ); - } - (Err(_), Err(_)) => {} - _ => { - panic!("Mismatch for {key}: {seq_aux_dir_res:?} != {vec_aux_dir_res:?}"); - } - } - } else { - // All other keys should reconstruct deterministically, so we simply compare the blobs. 
- assert_eq!( - seq, vec, - "Image mismatch for key {key} - keyspace={keyspace:?} lsn={lsn}" - ); - } - } - /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev. pub(crate) fn get_last_record_lsn(&self) -> Lsn { self.last_record_lsn.load().last @@ -1468,12 +1222,7 @@ impl Timeline { /// Hence, the result **does not represent local filesystem usage**. pub(crate) async fn layer_size_sum(&self) -> u64 { let guard = self.layers.read().await; - let layer_map = guard.layer_map(); - let mut size = 0; - for l in layer_map.iter_historic_layers() { - size += l.file_size; - } - size + guard.layer_size_sum() } pub(crate) fn resident_physical_size(&self) -> u64 { @@ -1640,16 +1389,15 @@ impl Timeline { // This exists to provide a non-span creating version of `freeze_and_flush` we can call without // polluting the span hierarchy. pub(crate) async fn freeze_and_flush0(&self) -> Result<(), FlushLayerError> { - let to_lsn = { + let token = { // Freeze the current open in-memory layer. It will be written to disk on next // iteration. let mut g = self.write_lock.lock().await; let to_lsn = self.get_last_record_lsn(); - self.freeze_inmem_layer_at(to_lsn, &mut g).await; - to_lsn + self.freeze_inmem_layer_at(to_lsn, &mut g).await? }; - self.flush_frozen_layers_and_wait(to_lsn).await + self.wait_flush_completion(token).await } // Check if an open ephemeral layer should be closed: this provides @@ -1663,12 +1411,20 @@ impl Timeline { return; }; + // FIXME: why not early exit? because before #7927 the state would had been cleared every + // time, and this was missed. + // if write_guard.is_none() { return; } + let Ok(layers_guard) = self.layers.try_read() else { // Don't block if the layer lock is busy return; }; - let Some(open_layer) = &layers_guard.layer_map().open_layer else { + let Ok(lm) = layers_guard.layer_map() else { + return; + }; + + let Some(open_layer) = &lm.open_layer else { // If there is no open layer, we have no layer freezing to do. 
However, we might need to generate // some updates to disk_consistent_lsn and remote_consistent_lsn, in case we ingested some WAL regions // that didn't result in writes to this shard. @@ -1694,9 +1450,16 @@ impl Timeline { ); // The flush loop will update remote consistent LSN as well as disk consistent LSN. - self.flush_frozen_layers_and_wait(last_record_lsn) - .await - .ok(); + // We know there is no open layer, so we can request freezing without actually + // freezing anything. This is true even if we have dropped the layers_guard, we + // still hold the write_guard. + let _ = async { + let token = self + .freeze_inmem_layer_at(last_record_lsn, &mut write_guard) + .await?; + self.wait_flush_completion(token).await + } + .await; } } @@ -1734,33 +1497,26 @@ impl Timeline { self.last_freeze_at.load(), open_layer.get_opened_at(), ) { - let at_lsn = match open_layer.info() { + match open_layer.info() { InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => { // We may reach this point if the layer was already frozen by not yet flushed: flushing // happens asynchronously in the background. tracing::debug!( "Not freezing open layer, it's already frozen ({lsn_start}..{lsn_end})" ); - None } InMemoryLayerInfo::Open { .. 
} => { // Upgrade to a write lock and freeze the layer drop(layers_guard); - let mut layers_guard = self.layers.write().await; - let froze = layers_guard - .try_freeze_in_memory_layer( - current_lsn, - &self.last_freeze_at, - &mut write_guard, - ) + let res = self + .freeze_inmem_layer_at(current_lsn, &mut write_guard) .await; - Some(current_lsn).filter(|_| froze) - } - }; - if let Some(lsn) = at_lsn { - let res: Result = self.flush_frozen_layers(lsn); - if let Err(e) = res { - tracing::info!("failed to flush frozen layer after background freeze: {e:#}"); + + if let Err(e) = res { + tracing::info!( + "failed to flush frozen layer after background freeze: {e:#}" + ); + } } } } @@ -1914,6 +1670,11 @@ impl Timeline { // about corner cases like s3 suddenly hanging up? self.remote_client.shutdown().await; } + Err(FlushLayerError::Cancelled) => { + // this is likely the second shutdown, ignore silently. + // TODO: this can be removed once https://github.com/neondatabase/neon/issues/5080 + debug_assert!(self.cancel.is_cancelled()); + } Err(e) => { // Non-fatal. Shutdown is infallible. Failures to flush just mean that // we have some extra WAL replay to do next time the timeline starts. @@ -1932,6 +1693,7 @@ impl Timeline { // Transition the remote_client into a state where it's only useful for timeline deletion. // (The deletion use case is why we can't just hook up remote_client to Self::cancel).) self.remote_client.stop(); + // As documented in remote_client.stop()'s doc comment, it's our responsibility // to shut down the upload queue tasks. // TODO: fix that, task management should be encapsulated inside remote_client. @@ -1942,10 +1704,17 @@ impl Timeline { ) .await; - // TODO: work toward making this a no-op. See this funciton's doc comment for more context. + // TODO: work toward making this a no-op. See this function's doc comment for more context. 
tracing::debug!("Waiting for tasks..."); task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), Some(self.timeline_id)).await; + { + // Allow any remaining in-memory layers to do cleanup -- until that, they hold the gate + // open. + let mut write_guard = self.write_lock.lock().await; + self.layers.write().await.shutdown(&mut write_guard); + } + // Finally wait until any gate-holders are complete. // // TODO: once above shutdown_tasks is a no-op, we can close the gate before calling shutdown_tasks @@ -2039,9 +1808,12 @@ impl Timeline { } } - pub(crate) async fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo { + pub(crate) async fn layer_map_info( + &self, + reset: LayerAccessStatsReset, + ) -> Result { let guard = self.layers.read().await; - let layer_map = guard.layer_map(); + let layer_map = guard.layer_map()?; let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1); if let Some(open_layer) = &layer_map.open_layer { in_memory_layers.push(open_layer.info()); @@ -2050,16 +1822,15 @@ impl Timeline { in_memory_layers.push(frozen_layer.info()); } - let mut historic_layers = Vec::new(); - for historic_layer in layer_map.iter_historic_layers() { - let historic_layer = guard.get_from_desc(&historic_layer); - historic_layers.push(historic_layer.info(reset)); - } + let historic_layers = layer_map + .iter_historic_layers() + .map(|desc| guard.get_from_desc(&desc).info(reset)) + .collect(); - LayerMapInfo { + Ok(LayerMapInfo { in_memory_layers, historic_layers, - } + }) } #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))] @@ -2067,7 +1838,7 @@ impl Timeline { &self, layer_file_name: &LayerName, ) -> anyhow::Result> { - let Some(layer) = self.find_layer(layer_file_name).await else { + let Some(layer) = self.find_layer(layer_file_name).await? 
else { return Ok(None); }; @@ -2088,7 +1859,7 @@ impl Timeline { .enter() .map_err(|_| anyhow::anyhow!("Shutting down"))?; - let Some(local_layer) = self.find_layer(layer_file_name).await else { + let Some(local_layer) = self.find_layer(layer_file_name).await? else { return Ok(None); }; @@ -2574,7 +2345,10 @@ impl Timeline { let mut layers = self.layers.try_write().expect( "in the context where we call this function, no other task has access to the object", ); - layers.initialize_empty(Lsn(start_lsn.0)); + layers + .open_mut() + .expect("in this context the LayerManager must still be open") + .initialize_empty(Lsn(start_lsn.0)); } /// Scan the timeline directory, cleanup, populate the layer map, and schedule uploads for local-only @@ -2706,7 +2480,10 @@ impl Timeline { let num_layers = loaded_layers.len(); - guard.initialize_local_layers(loaded_layers, disk_consistent_lsn + 1); + guard + .open_mut() + .expect("layermanager must be open during init") + .initialize_local_layers(loaded_layers, disk_consistent_lsn + 1); self.remote_client .schedule_layer_file_deletion(&needs_cleanup)?; @@ -2741,7 +2518,7 @@ impl Timeline { // Now that we have the full layer map, we may calculate the visibility of layers within it (a global scan) drop(guard); // drop write lock, update_layer_visibility will take a read lock. - self.update_layer_visibility().await; + self.update_layer_visibility().await?; info!( "loaded layer map with {} layers at {}, total physical size: {}", @@ -3163,16 +2940,17 @@ impl Timeline { } } - async fn find_layer(&self, layer_name: &LayerName) -> Option { + async fn find_layer( + &self, + layer_name: &LayerName, + ) -> Result, layer_manager::Shutdown> { let guard = self.layers.read().await; - for historic_layer in guard.layer_map().iter_historic_layers() { - let historic_layer_name = historic_layer.layer_name(); - if layer_name == &historic_layer_name { - return Some(guard.get_from_desc(&historic_layer)); - } - } - - None + let layer = guard + .layer_map()? 
+ .iter_historic_layers() + .find(|l| &l.layer_name() == layer_name) + .map(|found| guard.get_from_desc(&found)); + Ok(layer) } /// The timeline heatmap is a hint to secondary locations from the primary location, @@ -3189,14 +2967,22 @@ impl Timeline { let guard = self.layers.read().await; - let resident = guard.likely_resident_layers().map(|layer| { - let last_activity_ts = layer.access_stats().latest_activity(); - - HeatMapLayer::new( - layer.layer_desc().layer_name(), - layer.metadata(), - last_activity_ts, - ) + let resident = guard.likely_resident_layers().filter_map(|layer| { + match layer.visibility() { + LayerVisibilityHint::Visible => { + // Layer is visible to one or more read LSNs: elegible for inclusion in layer map + let last_activity_ts = layer.latest_activity(); + Some(HeatMapLayer::new( + layer.layer_desc().layer_name(), + layer.metadata(), + last_activity_ts, + )) + } + LayerVisibilityHint::Covered => { + // Layer is resident but unlikely to be read: not elegible for inclusion in heatmap. + None + } + } }); let layers = resident.collect(); @@ -3214,228 +3000,8 @@ impl Timeline { } } -type TraversalId = Arc; - -trait TraversalLayerExt { - fn traversal_id(&self) -> TraversalId; -} - -impl TraversalLayerExt for Layer { - fn traversal_id(&self) -> TraversalId { - Arc::clone(self.debug_str()) - } -} - -impl TraversalLayerExt for Arc { - fn traversal_id(&self) -> TraversalId { - Arc::clone(self.local_path_str()) - } -} - impl Timeline { - /// - /// Get a handle to a Layer for reading. - /// - /// The returned Layer might be from an ancestor timeline, if the - /// segment hasn't been updated on this timeline yet. - /// - /// This function takes the current timeline's locked LayerMap as an argument, - /// so callers can avoid potential race conditions. - /// - /// # Cancel-Safety - /// - /// This method is cancellation-safe. 
- async fn get_reconstruct_data( - &self, - key: Key, - request_lsn: Lsn, - reconstruct_state: &mut ValueReconstructState, - ctx: &RequestContext, - ) -> Result, PageReconstructError> { - // Start from the current timeline. - let mut timeline_owned; - let mut timeline = self; - - let mut read_count = scopeguard::guard(0, |cnt| { - crate::metrics::READ_NUM_LAYERS_VISITED.observe(cnt as f64) - }); - - // For debugging purposes, collect the path of layers that we traversed - // through. It's included in the error message if we fail to find the key. - let mut traversal_path = Vec::::new(); - - let cached_lsn = if let Some((cached_lsn, _)) = &reconstruct_state.img { - *cached_lsn - } else { - Lsn(0) - }; - - // 'prev_lsn' tracks the last LSN that we were at in our search. It's used - // to check that each iteration make some progress, to break infinite - // looping if something goes wrong. - let mut prev_lsn = None; - - let mut result = ValueReconstructResult::Continue; - let mut cont_lsn = Lsn(request_lsn.0 + 1); - - 'outer: loop { - if self.cancel.is_cancelled() { - return Err(PageReconstructError::Cancelled); - } - - // The function should have updated 'state' - //info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn); - match result { - ValueReconstructResult::Complete => return Ok(traversal_path), - ValueReconstructResult::Continue => { - // If we reached an earlier cached page image, we're done. - if cont_lsn == cached_lsn + 1 { - return Ok(traversal_path); - } - if let Some(prev) = prev_lsn { - if prev <= cont_lsn { - // Didn't make any progress in last iteration. Error out to avoid - // getting stuck in the loop. 
- return Err(PageReconstructError::MissingKey(MissingKeyError { - key, - shard: self.shard_identity.get_shard_number(&key), - cont_lsn: Lsn(cont_lsn.0 - 1), - request_lsn, - ancestor_lsn: Some(timeline.ancestor_lsn), - traversal_path, - backtrace: None, - })); - } - } - prev_lsn = Some(cont_lsn); - } - ValueReconstructResult::Missing => { - return Err(PageReconstructError::MissingKey(MissingKeyError { - key, - shard: self.shard_identity.get_shard_number(&key), - cont_lsn, - request_lsn, - ancestor_lsn: None, - traversal_path, - backtrace: if cfg!(test) { - Some(std::backtrace::Backtrace::force_capture()) - } else { - None - }, - })); - } - } - - // Recurse into ancestor if needed - if let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() { - if key.is_inherited_key() && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn { - trace!( - "going into ancestor {}, cont_lsn is {}", - timeline.ancestor_lsn, - cont_lsn - ); - - timeline_owned = timeline - .get_ready_ancestor_timeline(ancestor_timeline, ctx) - .await?; - timeline = &*timeline_owned; - prev_lsn = None; - continue 'outer; - } - } - - let guard = timeline.layers.read().await; - let layers = guard.layer_map(); - - // Check the open and frozen in-memory layers first, in order from newest - // to oldest. - if let Some(open_layer) = &layers.open_layer { - let start_lsn = open_layer.get_lsn_range().start; - if cont_lsn > start_lsn { - //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.layer_name().display()); - // Get all the data needed to reconstruct the page version from this layer. - // But if we have an older cached page image, no need to go past that. 
- let lsn_floor = max(cached_lsn + 1, start_lsn); - - let open_layer = open_layer.clone(); - drop(guard); - - result = match open_layer - .get_value_reconstruct_data( - key, - lsn_floor..cont_lsn, - reconstruct_state, - ctx, - ) - .await - { - Ok(result) => result, - Err(e) => return Err(PageReconstructError::from(e)), - }; - cont_lsn = lsn_floor; - *read_count += 1; - traversal_path.push((result, cont_lsn, open_layer.traversal_id())); - continue 'outer; - } - } - for frozen_layer in layers.frozen_layers.iter().rev() { - let start_lsn = frozen_layer.get_lsn_range().start; - if cont_lsn > start_lsn { - //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.layer_name().display()); - let lsn_floor = max(cached_lsn + 1, start_lsn); - - let frozen_layer = frozen_layer.clone(); - drop(guard); - - result = match frozen_layer - .get_value_reconstruct_data( - key, - lsn_floor..cont_lsn, - reconstruct_state, - ctx, - ) - .await - { - Ok(result) => result, - Err(e) => return Err(PageReconstructError::from(e)), - }; - cont_lsn = lsn_floor; - *read_count += 1; - traversal_path.push((result, cont_lsn, frozen_layer.traversal_id())); - continue 'outer; - } - } - - if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) { - let layer = guard.get_from_desc(&layer); - drop(guard); - // Get all the data needed to reconstruct the page version from this layer. - // But if we have an older cached page image, no need to go past that. - let lsn_floor = max(cached_lsn + 1, lsn_floor); - result = match layer - .get_value_reconstruct_data(key, lsn_floor..cont_lsn, reconstruct_state, ctx) - .await - { - Ok(result) => result, - Err(e) => return Err(PageReconstructError::from(e)), - }; - cont_lsn = lsn_floor; - *read_count += 1; - traversal_path.push((result, cont_lsn, layer.traversal_id())); - continue 'outer; - } else if timeline.ancestor_timeline.is_some() { - // Nothing on this timeline. 
Traverse to parent - result = ValueReconstructResult::Continue; - cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1); - continue 'outer; - } else { - // Nothing found - result = ValueReconstructResult::Missing; - continue 'outer; - } - } - } - + #[allow(unknown_lints)] // doc_lazy_continuation is still a new lint #[allow(clippy::doc_lazy_continuation)] /// Get the data needed to reconstruct all keys in the provided keyspace /// @@ -3529,7 +3095,6 @@ impl Timeline { cont_lsn, request_lsn, ancestor_lsn: Some(timeline.ancestor_lsn), - traversal_path: vec![], backtrace: None, })); } @@ -3588,7 +3153,7 @@ impl Timeline { // which turns out to be a perf bottleneck in some cases. if !unmapped_keyspace.is_empty() { let guard = timeline.layers.read().await; - let layers = guard.layer_map(); + let layers = guard.layer_map()?; let in_memory_layer = layers.find_in_memory_layer(|l| { let start_lsn = l.get_lsn_range().start; @@ -3721,10 +3286,6 @@ impl Timeline { Ok(ancestor.clone()) } - pub(crate) fn get_ancestor_timeline(&self) -> Option> { - self.ancestor_timeline.clone() - } - pub(crate) fn get_shard_identity(&self) -> &ShardIdentity { &self.shard_identity } @@ -3740,22 +3301,35 @@ impl Timeline { } } + /// Returns a non-frozen open in-memory layer for ingestion. /// - /// Get a handle to the latest layer for appending. - /// + /// Takes a witness of timeline writer state lock being held, because it makes no sense to call + /// this function without holding the mutex. async fn get_layer_for_write( &self, lsn: Lsn, + _guard: &tokio::sync::MutexGuard<'_, Option>, ctx: &RequestContext, ) -> anyhow::Result> { let mut guard = self.layers.write().await; + let gate_guard = self.gate.enter().context("enter gate for inmem layer")?; + + let last_record_lsn = self.get_last_record_lsn(); + ensure!( + lsn > last_record_lsn, + "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})", + lsn, + last_record_lsn, + ); + let layer = guard + .open_mut()? 
.get_layer_for_write( lsn, - self.get_last_record_lsn(), self.conf, self.timeline_id, self.tenant_shard_id, + gate_guard, ctx, ) .await?; @@ -3769,21 +3343,48 @@ impl Timeline { self.last_record_lsn.advance(new_lsn); } + /// Freeze any existing open in-memory layer and unconditionally notify the flush loop. + /// + /// Unconditional flush loop notification is given because in sharded cases we will want to + /// leave an Lsn gap. Unsharded tenants do not have Lsn gaps. async fn freeze_inmem_layer_at( &self, at: Lsn, write_lock: &mut tokio::sync::MutexGuard<'_, Option>, - ) { + ) -> Result { let frozen = { let mut guard = self.layers.write().await; guard + .open_mut()? .try_freeze_in_memory_layer(at, &self.last_freeze_at, write_lock) .await }; + if frozen { let now = Instant::now(); *(self.last_freeze_ts.write().unwrap()) = now; } + + // Increment the flush cycle counter and wake up the flush task. + // Remember the new value, so that when we listen for the flush + // to finish, we know when the flush that we initiated has + // finished, instead of some other flush that was started earlier. + let mut my_flush_request = 0; + + let flush_loop_state = { *self.flush_loop_state.lock().unwrap() }; + if !matches!(flush_loop_state, FlushLoopState::Running { .. }) { + return Err(FlushLayerError::NotRunning(flush_loop_state)); + } + + self.layer_flush_start_tx.send_modify(|(counter, lsn)| { + my_flush_request = *counter + 1; + *counter = my_flush_request; + *lsn = std::cmp::max(at, *lsn); + }); + + assert_ne!(my_flush_request, 0); + + Ok(my_flush_request) } /// Layer flusher task's main loop. 
@@ -3820,7 +3421,11 @@ impl Timeline { let layer_to_flush = { let guard = self.layers.read().await; - guard.layer_map().frozen_layers.front().cloned() + let Ok(lm) = guard.layer_map() else { + info!("dropping out of flush loop for timeline shutdown"); + return; + }; + lm.frozen_layers.front().cloned() // drop 'layers' lock to allow concurrent reads and writes }; let Some(layer_to_flush) = layer_to_flush else { @@ -3877,34 +3482,7 @@ impl Timeline { } } - /// Request the flush loop to write out all frozen layers up to `at_lsn` as Delta L0 files to disk. - /// The caller is responsible for the freezing, e.g., [`Self::freeze_inmem_layer_at`]. - /// - /// `at_lsn` may be higher than the highest LSN of a frozen layer: if this is the - /// case, it means no data will be written between the top of the highest frozen layer and - /// to_lsn, e.g. because this tenant shard has ingested up to to_lsn and not written any data - /// locally for that part of the WAL. - fn flush_frozen_layers(&self, at_lsn: Lsn) -> Result { - // Increment the flush cycle counter and wake up the flush task. - // Remember the new value, so that when we listen for the flush - // to finish, we know when the flush that we initiated has - // finished, instead of some other flush that was started earlier. - let mut my_flush_request = 0; - - let flush_loop_state = { *self.flush_loop_state.lock().unwrap() }; - if !matches!(flush_loop_state, FlushLoopState::Running { .. }) { - return Err(FlushLayerError::NotRunning(flush_loop_state)); - } - - self.layer_flush_start_tx.send_modify(|(counter, lsn)| { - my_flush_request = *counter + 1; - *counter = my_flush_request; - *lsn = std::cmp::max(at_lsn, *lsn); - }); - - Ok(my_flush_request) - } - + /// Waits any flush request created by [`Self::freeze_inmem_layer_at`] to complete. 
async fn wait_flush_completion(&self, request: u64) -> Result<(), FlushLayerError> { let mut rx = self.layer_flush_done_tx.subscribe(); loop { @@ -3937,11 +3515,6 @@ impl Timeline { } } - async fn flush_frozen_layers_and_wait(&self, at_lsn: Lsn) -> Result<(), FlushLayerError> { - let token = self.flush_frozen_layers(at_lsn)?; - self.wait_flush_completion(token).await - } - /// Flush one frozen in-memory layer to disk, as a new delta layer. /// /// Return value is the last lsn (inclusive) of the layer that was frozen. @@ -4078,11 +3651,11 @@ impl Timeline { { let mut guard = self.layers.write().await; - if self.cancel.is_cancelled() { - return Err(FlushLayerError::Cancelled); - } - - guard.finish_flush_l0_layer(delta_layer_to_add.as_ref(), &frozen_layer, &self.metrics); + guard.open_mut()?.finish_flush_l0_layer( + delta_layer_to_add.as_ref(), + &frozen_layer, + &self.metrics, + ); if self.set_disk_consistent_lsn(disk_consistent_lsn) { // Schedule remote uploads that will reflect our new disk_consistent_lsn @@ -4122,17 +3695,11 @@ impl Timeline { /// Return true if the value changed /// - /// This function must only be used from the layer flush task, and may not be called concurrently. + /// This function must only be used from the layer flush task. fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool { - // We do a simple load/store cycle: that's why this function isn't safe for concurrent use. 
- let old_value = self.disk_consistent_lsn.load(); - if new_value != old_value { - assert!(new_value >= old_value); - self.disk_consistent_lsn.store(new_value); - true - } else { - false - } + let old_value = self.disk_consistent_lsn.fetch_max(new_value); + assert!(new_value >= old_value, "disk_consistent_lsn must be growing monotonously at runtime; current {old_value}, offered {new_value}"); + new_value != old_value } /// Update metadata file @@ -4199,12 +3766,14 @@ impl Timeline { let frozen_layer = Arc::clone(frozen_layer); let ctx = ctx.attached_child(); let work = async move { - let Some(new_delta) = frozen_layer - .write_to_disk(&self_clone, &ctx, key_range) + let Some((desc, path)) = frozen_layer + .write_to_disk(&ctx, key_range, self_clone.l0_flush_global_state.inner()) .await? else { return Ok(None); }; + let new_delta = Layer::finish_creating(self_clone.conf, &self_clone, desc, &path)?; + // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes. // We just need to fsync the directory in which these inodes are linked, // which we know to be the timeline directory. @@ -4294,7 +3863,9 @@ impl Timeline { let threshold = self.get_image_creation_threshold(); let guard = self.layers.read().await; - let layers = guard.layer_map(); + let Ok(layers) = guard.layer_map() else { + return false; + }; let mut max_deltas = 0; for part_range in &partition.ranges { @@ -4702,13 +4273,16 @@ impl Timeline { let mut guard = self.layers.write().await; // FIXME: we could add the images to be uploaded *before* returning from here, but right - // now they are being scheduled outside of write lock - guard.track_new_image_layers(&image_layers, &self.metrics); + // now they are being scheduled outside of write lock; current way is inconsistent with + // compaction lock order. + guard + .open_mut()? 
+ .track_new_image_layers(&image_layers, &self.metrics); drop_wlock(guard); timer.stop_and_record(); // Creating image layers may have caused some previously visible layers to be covered - self.update_layer_visibility().await; + self.update_layer_visibility().await?; Ok(image_layers) } @@ -4727,6 +4301,12 @@ impl Timeline { return; } + if self.current_logical_size.current_size().is_exact() { + // root timelines are initialized with exact count, but never start the background + // calculation + return; + } + if let Some(await_bg_cancel) = self .current_logical_size .cancel_wait_for_background_loop_concurrency_limit_semaphore @@ -4782,7 +4362,7 @@ impl Timeline { tenant: &crate::tenant::Tenant, prepared: detach_ancestor::PreparedTimelineDetach, ctx: &RequestContext, - ) -> Result, anyhow::Error> { + ) -> Result, anyhow::Error> { detach_ancestor::complete(self, tenant, prepared, ctx).await } @@ -4832,11 +4412,11 @@ impl From for CompactionError { impl From for CompactionError { fn from(value: super::upload_queue::NotInitialized) -> Self { match value { - super::upload_queue::NotInitialized::Uninitialized - | super::upload_queue::NotInitialized::Stopped => { + super::upload_queue::NotInitialized::Uninitialized => { CompactionError::Other(anyhow::anyhow!(value)) } - super::upload_queue::NotInitialized::ShuttingDown => CompactionError::ShuttingDown, + super::upload_queue::NotInitialized::ShuttingDown + | super::upload_queue::NotInitialized::Stopped => CompactionError::ShuttingDown, } } } @@ -4861,6 +4441,12 @@ impl CompactionError { } } +impl From for CompactionError { + fn from(_: layer_manager::Shutdown) -> Self { + CompactionError::ShuttingDown + } +} + #[serde_as] #[derive(serde::Serialize)] struct RecordedDuration(#[serde_as(as = "serde_with::DurationMicroSeconds")] Duration); @@ -4966,11 +4552,14 @@ impl Timeline { .collect(); if !new_images.is_empty() { - guard.track_new_image_layers(new_images, &self.metrics); + guard + .open_mut()? 
+ .track_new_image_layers(new_images, &self.metrics); } - // deletion will happen later, the layer file manager calls garbage_collect_on_drop - guard.finish_compact_l0(&remove_layers, &insert_layers, &self.metrics); + guard + .open_mut()? + .finish_compact_l0(&remove_layers, &insert_layers, &self.metrics); self.remote_client .schedule_compaction_update(&remove_layers, new_deltas)?; @@ -4984,7 +4573,7 @@ impl Timeline { self: &Arc, mut replace_layers: Vec<(Layer, ResidentLayer)>, mut drop_layers: Vec, - ) -> Result<(), super::upload_queue::NotInitialized> { + ) -> Result<(), CompactionError> { let mut guard = self.layers.write().await; // Trim our lists in case our caller (compaction) raced with someone else (GC) removing layers: we want @@ -4992,7 +4581,9 @@ impl Timeline { replace_layers.retain(|(l, _)| guard.contains(l)); drop_layers.retain(|l| guard.contains(l)); - guard.rewrite_layers(&replace_layers, &drop_layers, &self.metrics); + guard + .open_mut()? + .rewrite_layers(&replace_layers, &drop_layers, &self.metrics); let upload_layers: Vec<_> = replace_layers.into_iter().map(|r| r.1).collect(); @@ -5281,7 +4872,7 @@ impl Timeline { // // TODO holding a write lock is too agressive and avoidable let mut guard = self.layers.write().await; - let layers = guard.layer_map(); + let layers = guard.layer_map()?; 'outer: for l in layers.iter_historic_layers() { result.layers_total += 1; @@ -5409,7 +5000,7 @@ impl Timeline { } })?; - guard.finish_gc_timeline(&gc_layers); + guard.open_mut()?.finish_gc_timeline(&gc_layers); #[cfg(feature = "testing")] { @@ -5565,9 +5156,13 @@ impl Timeline { let remaining = { let guard = self.layers.read().await; - guard - .layer_map() - .iter_historic_layers() + let Ok(lm) = guard.layer_map() else { + // technically here we could look into iterating accessible layers, but downloading + // all layers of a shutdown timeline makes no sense regardless. 
+ tracing::info!("attempted to download all layers of shutdown timeline"); + return; + }; + lm.iter_historic_layers() .map(|desc| guard.get_from_desc(&desc)) .collect::>() }; @@ -5674,10 +5269,10 @@ impl Timeline { let file_size = layer.layer_desc().file_size; max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size))); - let last_activity_ts = layer.access_stats().latest_activity(); + let last_activity_ts = layer.latest_activity(); EvictionCandidate { - layer: layer.into(), + layer: layer.to_owned().into(), last_activity_ts, relative_last_activity: finite_f32::FiniteF32::ZERO, } @@ -5697,6 +5292,22 @@ impl Timeline { } } + /// Persistently blocks gc for `Manual` reason. + /// + /// Returns true if no such block existed before, false otherwise. + pub(crate) async fn block_gc(&self, tenant: &super::Tenant) -> anyhow::Result { + use crate::tenant::remote_timeline_client::index::GcBlockingReason; + assert_eq!(self.tenant_shard_id, tenant.tenant_shard_id); + tenant.gc_block.insert(self, GcBlockingReason::Manual).await + } + + /// Persistently unblocks gc for `Manual` reason. 
+ pub(crate) async fn unblock_gc(&self, tenant: &super::Tenant) -> anyhow::Result<()> { + use crate::tenant::remote_timeline_client::index::GcBlockingReason; + assert_eq!(self.tenant_shard_id, tenant.tenant_shard_id); + tenant.gc_block.remove(self, GcBlockingReason::Manual).await + } + #[cfg(test)] pub(super) fn force_advance_lsn(self: &Arc, new_lsn: Lsn) { self.last_record_lsn.advance(new_lsn); @@ -5746,7 +5357,7 @@ impl Timeline { { let mut guard = self.layers.write().await; - guard.force_insert_layer(image_layer); + guard.open_mut().unwrap().force_insert_layer(image_layer); } Ok(()) @@ -5790,7 +5401,7 @@ impl Timeline { } let guard = self.layers.read().await; - for layer in guard.layer_map().iter_historic_layers() { + for layer in guard.layer_map()?.iter_historic_layers() { if layer.is_delta() && overlaps_with(&layer.lsn_range, &deltas.lsn_range) && layer.lsn_range != deltas.lsn_range @@ -5815,13 +5426,12 @@ impl Timeline { for (key, lsn, val) in deltas.data { delta_layer_writer.put_value(key, lsn, val, ctx).await?; } - let delta_layer = delta_layer_writer - .finish(deltas.key_range.end, self, ctx) - .await?; + let (desc, path) = delta_layer_writer.finish(deltas.key_range.end, ctx).await?; + let delta_layer = Layer::finish_creating(self.conf, self, desc, &path)?; { let mut guard = self.layers.write().await; - guard.force_insert_layer(delta_layer); + guard.open_mut().unwrap().force_insert_layer(delta_layer); } Ok(()) @@ -5836,7 +5446,7 @@ impl Timeline { ) -> anyhow::Result> { let mut all_data = Vec::new(); let guard = self.layers.read().await; - for layer in guard.layer_map().iter_historic_layers() { + for layer in guard.layer_map()?.iter_historic_layers() { if !layer.is_delta() && layer.image_layer_lsn() == lsn { let layer = guard.get_from_desc(&layer); let mut reconstruct_data = ValuesReconstructState::default(); @@ -5864,7 +5474,7 @@ impl Timeline { ) -> anyhow::Result> { let mut layers = Vec::new(); let guard = self.layers.read().await; - for layer in 
guard.layer_map().iter_historic_layers() { + for layer in guard.layer_map()?.iter_historic_layers() { layers.push(layer.key()); } Ok(layers) @@ -5878,12 +5488,10 @@ impl Timeline { } } -type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId); - /// Tracking writes ingestion does to a particular in-memory layer. /// /// Cleared upon freezing a layer. -struct TimelineWriterState { +pub(crate) struct TimelineWriterState { open_layer: Arc, current_size: u64, // Previous Lsn which passed through @@ -5991,7 +5599,10 @@ impl<'a> TimelineWriter<'a> { } async fn open_layer(&mut self, at: Lsn, ctx: &RequestContext) -> anyhow::Result<()> { - let layer = self.tl.get_layer_for_write(at, ctx).await?; + let layer = self + .tl + .get_layer_for_write(at, &self.write_guard, ctx) + .await?; let initial_size = layer.size().await?; let last_freeze_at = self.last_freeze_at.load(); @@ -6004,15 +5615,15 @@ impl<'a> TimelineWriter<'a> { Ok(()) } - async fn roll_layer(&mut self, freeze_at: Lsn) -> anyhow::Result<()> { + async fn roll_layer(&mut self, freeze_at: Lsn) -> Result<(), FlushLayerError> { let current_size = self.write_guard.as_ref().unwrap().current_size; // self.write_guard will be taken by the freezing self.tl .freeze_inmem_layer_at(freeze_at, &mut self.write_guard) - .await; + .await?; - self.tl.flush_frozen_layers(freeze_at)?; + assert!(self.write_guard.is_none()); if current_size >= self.get_checkpoint_distance() * 2 { warn!("Flushed oversized open layer with size {}", current_size) @@ -6177,6 +5788,7 @@ mod tests { let layers = timeline.layers.read().await; let desc = layers .layer_map() + .unwrap() .iter_historic_layers() .next() .expect("must find one layer to evict"); diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 421f718ad6..87ec46c0b5 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -19,8 +19,10 @@ use bytes::Bytes; use 
enumset::EnumSet; use fail::fail_point; use itertools::Itertools; +use pageserver_api::key::KEY_SIZE; use pageserver_api::keyspace::ShardedRange; use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; +use serde::Serialize; use tokio_util::sync::CancellationToken; use tracing::{debug, info, info_span, trace, warn, Instrument}; use utils::id::TimelineId; @@ -41,6 +43,7 @@ use crate::virtual_file::{MaybeFatalIo, VirtualFile}; use crate::keyspace::KeySpace; use crate::repository::{Key, Value}; +use crate::walrecord::NeonWalRecord; use utils::lsn::Lsn; @@ -73,6 +76,7 @@ impl KeyHistoryRetention { key: Key, delta_writer: &mut Vec<(Key, Lsn, Value)>, mut image_writer: Option<&mut ImageLayerWriter>, + stat: &mut CompactionStatistics, ctx: &RequestContext, ) -> anyhow::Result<()> { let mut first_batch = true; @@ -82,6 +86,7 @@ impl KeyHistoryRetention { let Value::Image(img) = &logs[0].1 else { unreachable!() }; + stat.produce_image_key(img); if let Some(image_writer) = image_writer.as_mut() { image_writer.put_image(key, img.clone(), ctx).await?; } else { @@ -89,24 +94,111 @@ impl KeyHistoryRetention { } } else { for (lsn, val) in logs { + stat.produce_key(&val); delta_writer.push((key, lsn, val)); } } first_batch = false; } else { for (lsn, val) in logs { + stat.produce_key(&val); delta_writer.push((key, lsn, val)); } } } let KeyLogAtLsn(above_horizon_logs) = self.above_horizon; for (lsn, val) in above_horizon_logs { + stat.produce_key(&val); delta_writer.push((key, lsn, val)); } Ok(()) } } +#[derive(Debug, Serialize, Default)] +struct CompactionStatisticsNumSize { + num: u64, + size: u64, +} + +#[derive(Debug, Serialize, Default)] +pub struct CompactionStatistics { + delta_layer_visited: CompactionStatisticsNumSize, + image_layer_visited: CompactionStatisticsNumSize, + delta_layer_produced: CompactionStatisticsNumSize, + image_layer_produced: CompactionStatisticsNumSize, + num_delta_layer_discarded: usize, + num_image_layer_discarded: usize, + 
num_unique_keys_visited: usize, + wal_keys_visited: CompactionStatisticsNumSize, + image_keys_visited: CompactionStatisticsNumSize, + wal_produced: CompactionStatisticsNumSize, + image_produced: CompactionStatisticsNumSize, +} + +impl CompactionStatistics { + fn estimated_size_of_value(val: &Value) -> usize { + match val { + Value::Image(img) => img.len(), + Value::WalRecord(NeonWalRecord::Postgres { rec, .. }) => rec.len(), + _ => std::mem::size_of::(), + } + } + fn estimated_size_of_key() -> usize { + KEY_SIZE // TODO: distinguish image layer and delta layer (count LSN in delta layer) + } + fn visit_delta_layer(&mut self, size: u64) { + self.delta_layer_visited.num += 1; + self.delta_layer_visited.size += size; + } + fn visit_image_layer(&mut self, size: u64) { + self.image_layer_visited.num += 1; + self.image_layer_visited.size += size; + } + fn on_unique_key_visited(&mut self) { + self.num_unique_keys_visited += 1; + } + fn visit_wal_key(&mut self, val: &Value) { + self.wal_keys_visited.num += 1; + self.wal_keys_visited.size += + Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64; + } + fn visit_image_key(&mut self, val: &Value) { + self.image_keys_visited.num += 1; + self.image_keys_visited.size += + Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64; + } + fn produce_key(&mut self, val: &Value) { + match val { + Value::Image(img) => self.produce_image_key(img), + Value::WalRecord(_) => self.produce_wal_key(val), + } + } + fn produce_wal_key(&mut self, val: &Value) { + self.wal_produced.num += 1; + self.wal_produced.size += + Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64; + } + fn produce_image_key(&mut self, val: &Bytes) { + self.image_produced.num += 1; + self.image_produced.size += val.len() as u64 + Self::estimated_size_of_key() as u64; + } + fn discard_delta_layer(&mut self) { + self.num_delta_layer_discarded += 1; + } + fn discard_image_layer(&mut self) { + 
self.num_image_layer_discarded += 1; + } + fn produce_delta_layer(&mut self, size: u64) { + self.delta_layer_produced.num += 1; + self.delta_layer_produced.size += size; + } + fn produce_image_layer(&mut self, size: u64) { + self.image_layer_produced.num += 1; + self.image_layer_produced.size += size; + } +} + impl Timeline { /// TODO: cancellation /// @@ -118,12 +210,18 @@ impl Timeline { ctx: &RequestContext, ) -> Result { if flags.contains(CompactFlags::EnhancedGcBottomMostCompaction) { - self.compact_with_gc(cancel, ctx) + self.compact_with_gc(cancel, flags, ctx) .await .map_err(CompactionError::Other)?; return Ok(false); } + if flags.contains(CompactFlags::DryRun) { + return Err(CompactionError::Other(anyhow!( + "dry-run mode is not supported for legacy compaction for now" + ))); + } + // High level strategy for compaction / image creation: // // 1. First, calculate the desired "partitioning" of the @@ -273,7 +371,7 @@ impl Timeline { ); let layers = self.layers.read().await; - for layer_desc in layers.layer_map().iter_historic_layers() { + for layer_desc in layers.layer_map()?.iter_historic_layers() { let layer = layers.get_from_desc(&layer_desc); if layer.metadata().shard.shard_count == self.shard_identity.count { // This layer does not belong to a historic ancestor, no need to re-image it. @@ -451,7 +549,9 @@ impl Timeline { /// /// The result may be used as an input to eviction and secondary downloads to de-prioritize layers /// that we know won't be needed for reads. - pub(super) async fn update_layer_visibility(&self) { + pub(super) async fn update_layer_visibility( + &self, + ) -> Result<(), super::layer_manager::Shutdown> { let head_lsn = self.get_last_record_lsn(); // We will sweep through layers in reverse-LSN order. We only do historic layers. 
L0 deltas @@ -459,7 +559,7 @@ impl Timeline { // Note that L0 deltas _can_ be covered by image layers, but we consider them 'visible' because we anticipate that // they will be subject to L0->L1 compaction in the near future. let layer_manager = self.layers.read().await; - let layer_map = layer_manager.layer_map(); + let layer_map = layer_manager.layer_map()?; let readable_points = { let children = self.gc_info.read().unwrap().retain_lsns.clone(); @@ -482,6 +582,7 @@ impl Timeline { // TODO: publish our covered KeySpace to our parent, so that when they update their visibility, they can // avoid assuming that everything at a branch point is visible. drop(covered); + Ok(()) } /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as @@ -535,12 +636,8 @@ impl Timeline { ) -> Result { stats.read_lock_held_spawn_blocking_startup_micros = stats.read_lock_acquisition_micros.till_now(); // set by caller - let layers = guard.layer_map(); - let level0_deltas = layers.get_level0_deltas(); - let mut level0_deltas = level0_deltas - .into_iter() - .map(|x| guard.get_from_desc(&x)) - .collect_vec(); + let layers = guard.layer_map()?; + let level0_deltas = layers.level0_deltas(); stats.level0_deltas_count = Some(level0_deltas.len()); // Only compact if enough layers have accumulated. @@ -553,6 +650,11 @@ impl Timeline { return Ok(CompactLevel0Phase1Result::default()); } + let mut level0_deltas = level0_deltas + .iter() + .map(|x| guard.get_from_desc(x)) + .collect::>(); + // Gather the files to compact in this iteration. // // Start with the oldest Level 0 delta file, and collect any other @@ -1006,14 +1108,16 @@ impl Timeline { || contains_hole { // ... 
if so, flush previous layer and prepare to write new one - new_layers.push( - writer - .take() - .unwrap() - .finish(prev_key.unwrap().next(), self, ctx) - .await - .map_err(CompactionError::Other)?, - ); + let (desc, path) = writer + .take() + .unwrap() + .finish(prev_key.unwrap().next(), ctx) + .await + .map_err(CompactionError::Other)?; + let new_delta = Layer::finish_creating(self.conf, self, desc, &path) + .map_err(CompactionError::Other)?; + + new_layers.push(new_delta); writer = None; if contains_hole { @@ -1076,12 +1180,13 @@ impl Timeline { prev_key = Some(key); } if let Some(writer) = writer { - new_layers.push( - writer - .finish(prev_key.unwrap().next(), self, ctx) - .await - .map_err(CompactionError::Other)?, - ); + let (desc, path) = writer + .finish(prev_key.unwrap().next(), ctx) + .await + .map_err(CompactionError::Other)?; + let new_delta = Layer::finish_creating(self.conf, self, desc, &path) + .map_err(CompactionError::Other)?; + new_layers.push(new_delta); } // Sync layers @@ -1306,10 +1411,9 @@ impl Timeline { // Find the top of the historical layers let end_lsn = { let guard = self.layers.read().await; - let layers = guard.layer_map(); + let layers = guard.layer_map()?; - let l0_deltas = layers.get_level0_deltas(); - drop(guard); + let l0_deltas = layers.level0_deltas(); // As an optimization, if we find that there are too few L0 layers, // bail out early. We know that the compaction algorithm would do @@ -1641,6 +1745,7 @@ impl Timeline { pub(crate) async fn compact_with_gc( self: &Arc, cancel: &CancellationToken, + flags: EnumSet, ctx: &RequestContext, ) -> anyhow::Result<()> { use std::collections::BTreeSet; @@ -1664,19 +1769,23 @@ impl Timeline { ) .await?; - info!("running enhanced gc bottom-most compaction"); + let dry_run = flags.contains(CompactFlags::DryRun); + + info!("running enhanced gc bottom-most compaction, dry_run={dry_run}"); scopeguard::defer! 
{ info!("done enhanced gc bottom-most compaction"); }; + let mut stat = CompactionStatistics::default(); + // Step 0: pick all delta layers + image layers below/intersect with the GC horizon. // The layer selection has the following properties: // 1. If a layer is in the selection, all layers below it are in the selection. // 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection. let (layer_selection, gc_cutoff, retain_lsns_below_horizon) = { let guard = self.layers.read().await; - let layers = guard.layer_map(); + let layers = guard.layer_map()?; let gc_info = self.gc_info.read().unwrap(); let mut retain_lsns_below_horizon = Vec::new(); let gc_cutoff = gc_info.cutoffs.select_min(); @@ -1740,6 +1849,9 @@ impl Timeline { let key_range = desc.get_key_range(); delta_split_points.insert(key_range.start); delta_split_points.insert(key_range.end); + stat.visit_delta_layer(desc.file_size()); + } else { + stat.visit_image_layer(desc.file_size()); } } let mut delta_layers = Vec::new(); @@ -1775,6 +1887,8 @@ impl Timeline { tline: &Arc, lowest_retain_lsn: Lsn, ctx: &RequestContext, + stats: &mut CompactionStatistics, + dry_run: bool, last_batch: bool, ) -> anyhow::Result> { // Check if we need to split the delta layer. 
We split at the original delta layer boundary to avoid @@ -1831,6 +1945,7 @@ impl Timeline { let layer_generation = guard.get_from_key(&delta_key).metadata().generation; drop(guard); if layer_generation == tline.generation { + stats.discard_delta_layer(); // TODO: depending on whether we design this compaction process to run along with // other compactions, there could be layer map modifications after we drop the // layer guard, and in case it creates duplicated layer key, we will still error @@ -1857,9 +1972,16 @@ impl Timeline { for (key, lsn, val) in deltas { delta_layer_writer.put_value(key, lsn, val, ctx).await?; } - let delta_layer = delta_layer_writer - .finish(delta_key.key_range.end, tline, ctx) + + stats.produce_delta_layer(delta_layer_writer.size()); + if dry_run { + return Ok(None); + } + + let (desc, path) = delta_layer_writer + .finish(delta_key.key_range.end, ctx) .await?; + let delta_layer = Layer::finish_creating(tline.conf, tline, desc, &path)?; Ok(Some(FlushDeltaResult::CreateResidentLayer(delta_layer))) } @@ -1951,6 +2073,13 @@ impl Timeline { let mut current_delta_split_point = 0; let mut delta_layers = Vec::new(); while let Some((key, lsn, val)) = merge_iter.next().await? 
{ + if cancel.is_cancelled() { + return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error + } + match val { + Value::Image(_) => stat.visit_image_key(&val), + Value::WalRecord(_) => stat.visit_wal_key(&val), + } if last_key.is_none() || last_key.as_ref() == Some(&key) { if last_key.is_none() { last_key = Some(key); @@ -1958,6 +2087,7 @@ impl Timeline { accumulated_values.push((key, lsn, val)); } else { let last_key = last_key.as_mut().unwrap(); + stat.on_unique_key_visited(); let retention = self .generate_key_retention( *last_key, @@ -1974,6 +2104,7 @@ impl Timeline { *last_key, &mut delta_values, image_layer_writer.as_mut(), + &mut stat, ctx, ) .await?; @@ -1986,6 +2117,8 @@ impl Timeline { self, lowest_retain_lsn, ctx, + &mut stat, + dry_run, false, ) .await?, @@ -1998,6 +2131,7 @@ impl Timeline { let last_key = last_key.expect("no keys produced during compaction"); // TODO: move this part to the loop body + stat.on_unique_key_visited(); let retention = self .generate_key_retention( last_key, @@ -2014,6 +2148,7 @@ impl Timeline { last_key, &mut delta_values, image_layer_writer.as_mut(), + &mut stat, ctx, ) .await?; @@ -2026,6 +2161,8 @@ impl Timeline { self, lowest_retain_lsn, ctx, + &mut stat, + dry_run, true, ) .await?, @@ -2033,12 +2170,28 @@ impl Timeline { assert!(delta_values.is_empty(), "unprocessed keys"); let image_layer = if discard_image_layer { + stat.discard_image_layer(); None } else if let Some(writer) = image_layer_writer { - Some(writer.finish(self, ctx).await?) + stat.produce_image_layer(writer.size()); + if !dry_run { + Some(writer.finish(self, ctx).await?) + } else { + None + } } else { None }; + + info!( + "gc-compaction statistics: {}", + serde_json::to_string(&stat)? 
+ ); + + if dry_run { + return Ok(()); + } + info!( "produced {} delta layers and {} image layers", delta_layers.len(), @@ -2062,10 +2215,13 @@ impl Timeline { let mut layer_selection = layer_selection; layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key())); compact_to.extend(image_layer); + // Step 3: Place back to the layer map. { let mut guard = self.layers.write().await; - guard.finish_gc_compaction(&layer_selection, &compact_to, &self.metrics) + guard + .open_mut()? + .finish_gc_compaction(&layer_selection, &compact_to, &self.metrics) }; self.remote_client .schedule_compaction_update(&layer_selection, &compact_to)?; @@ -2145,7 +2301,7 @@ impl CompactionJobExecutor for TimelineAdaptor { self.flush_updates().await?; let guard = self.timeline.layers.read().await; - let layer_map = guard.layer_map(); + let layer_map = guard.layer_map()?; let result = layer_map .iter_historic_layers() @@ -2268,9 +2424,9 @@ impl CompactionJobExecutor for TimelineAdaptor { )) }); - let new_delta_layer = writer - .finish(prev.unwrap().0.next(), &self.timeline, ctx) - .await?; + let (desc, path) = writer.finish(prev.unwrap().0.next(), ctx).await?; + let new_delta_layer = + Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?; self.new_deltas.push(new_delta_layer); Ok(()) diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 05178c38b4..b03dbb092e 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -230,6 +230,8 @@ impl DeleteTimelineFlow { // Now that the Timeline is in Stopping state, request all the related tasks to shut down. 
timeline.shutdown(super::ShutdownMode::Hard).await; + tenant.gc_block.before_delete(&timeline); + fail::fail_point!("timeline-delete-before-index-deleted-at", |_| { Err(anyhow::anyhow!( "failpoint: timeline-delete-before-index-deleted-at" diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index ee5f8cd52a..3b52adc77b 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -1,4 +1,4 @@ -use std::sync::Arc; +use std::{collections::HashSet, sync::Arc}; use super::{layer_manager::LayerManager, FlushLayerError, Timeline}; use crate::{ @@ -74,6 +74,11 @@ impl From for Error { Error::ShuttingDown } } +impl From for Error { + fn from(_: super::layer_manager::Shutdown) -> Self { + Error::ShuttingDown + } +} impl From for Error { fn from(value: FlushLayerError) -> Self { @@ -141,50 +146,9 @@ pub(super) async fn prepare( } } - // detached has previously been detached; let's inspect each of the current timelines and - // report back the timelines which have been reparented by our detach - let mut all_direct_children = tenant - .timelines - .lock() - .unwrap() - .values() - .filter(|tl| matches!(tl.ancestor_timeline.as_ref(), Some(ancestor) if Arc::ptr_eq(ancestor, detached))) - .map(|tl| (tl.ancestor_lsn, tl.clone())) - .collect::>(); - - let mut any_shutdown = false; - - all_direct_children.retain( - |(_, tl)| match tl.remote_client.initialized_upload_queue() { - Ok(accessor) => accessor - .latest_uploaded_index_part() - .lineage - .is_reparented(), - Err(_shutdownalike) => { - // not 100% a shutdown, but let's bail early not to give inconsistent results in - // sharded enviroment. - any_shutdown = true; - true - } - }, - ); - - if any_shutdown { - // it could be one or many being deleted; have client retry - return Err(Error::ShuttingDown); - } - - let mut reparented = all_direct_children; - // why this instead of hashset? 
there is a reason, but I've forgotten it many times. - // - // maybe if this was a hashset we would not be able to distinguish some race condition. - reparented.sort_unstable_by_key(|(lsn, tl)| (*lsn, tl.timeline_id)); - + let reparented_timelines = reparented_direct_children(detached, tenant)?; return Ok(Progress::Done(AncestorDetached { - reparented_timelines: reparented - .into_iter() - .map(|(_, tl)| tl.timeline_id) - .collect(), + reparented_timelines, })); }; @@ -277,7 +241,7 @@ pub(super) async fn prepare( // between retries, these can change if compaction or gc ran in between. this will mean // we have to redo work. - partition_work(ancestor_lsn, &layers) + partition_work(ancestor_lsn, &layers)? }; // TODO: layers are already sorted by something: use that to determine how much of remote @@ -381,16 +345,67 @@ pub(super) async fn prepare( Ok(Progress::Prepared(guard, prepared)) } +fn reparented_direct_children( + detached: &Arc, + tenant: &Tenant, +) -> Result, Error> { + let mut all_direct_children = tenant + .timelines + .lock() + .unwrap() + .values() + .filter_map(|tl| { + let is_direct_child = matches!(tl.ancestor_timeline.as_ref(), Some(ancestor) if Arc::ptr_eq(ancestor, detached)); + + if is_direct_child { + Some(tl.clone()) + } else { + if let Some(timeline) = tl.ancestor_timeline.as_ref() { + assert_ne!(timeline.timeline_id, detached.timeline_id, "we cannot have two timelines with the same timeline_id live"); + } + None + } + }) + // Collect to avoid lock taking order problem with Tenant::timelines and + // Timeline::remote_client + .collect::>(); + + let mut any_shutdown = false; + + all_direct_children.retain(|tl| match tl.remote_client.initialized_upload_queue() { + Ok(accessor) => accessor + .latest_uploaded_index_part() + .lineage + .is_reparented(), + Err(_shutdownalike) => { + // not 100% a shutdown, but let's bail early not to give inconsistent results in + // sharded enviroment. 
+ any_shutdown = true; + true + } + }); + + if any_shutdown { + // it could be one or many being deleted; have client retry + return Err(Error::ShuttingDown); + } + + Ok(all_direct_children + .into_iter() + .map(|tl| tl.timeline_id) + .collect()) +} + fn partition_work( ancestor_lsn: Lsn, - source_layermap: &LayerManager, -) -> (usize, Vec, Vec) { + source: &LayerManager, +) -> Result<(usize, Vec, Vec), Error> { let mut straddling_branchpoint = vec![]; let mut rest_of_historic = vec![]; let mut later_by_lsn = 0; - for desc in source_layermap.layer_map().iter_historic_layers() { + for desc in source.layer_map()?.iter_historic_layers() { // off by one chances here: // - start is inclusive // - end is exclusive @@ -409,10 +424,10 @@ fn partition_work( &mut rest_of_historic }; - target.push(source_layermap.get_from_desc(&desc)); + target.push(source.get_from_desc(&desc)); } - (later_by_lsn, straddling_branchpoint, rest_of_historic) + Ok((later_by_lsn, straddling_branchpoint, rest_of_historic)) } async fn upload_rewritten_layer( @@ -488,10 +503,12 @@ async fn copy_lsn_prefix( // reuse the key instead of adding more holes between layers by using the real // highest key in the layer. 
let reused_highest_key = layer.layer_desc().key_range.end; - let copied = writer - .finish(reused_highest_key, target_timeline, ctx) + let (desc, path) = writer + .finish(reused_highest_key, ctx) .await .map_err(CopyDeltaPrefix)?; + let copied = Layer::finish_creating(target_timeline.conf, target_timeline, desc, &path) + .map_err(CopyDeltaPrefix)?; tracing::debug!(%layer, %copied, "new layer produced"); @@ -537,11 +554,12 @@ pub(super) async fn complete( tenant: &Tenant, prepared: PreparedTimelineDetach, _ctx: &RequestContext, -) -> Result, anyhow::Error> { +) -> Result, anyhow::Error> { let PreparedTimelineDetach { layers } = prepared; let ancestor = detached - .get_ancestor_timeline() + .ancestor_timeline + .as_ref() .expect("must still have a ancestor"); let ancestor_lsn = detached.get_ancestor_lsn(); @@ -581,7 +599,7 @@ pub(super) async fn complete( } let tl_ancestor = tl.ancestor_timeline.as_ref()?; - let is_same = Arc::ptr_eq(&ancestor, tl_ancestor); + let is_same = Arc::ptr_eq(ancestor, tl_ancestor); let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn; let is_deleting = tl @@ -622,13 +640,18 @@ pub(super) async fn complete( }); let reparenting_candidates = tasks.len(); - let mut reparented = Vec::with_capacity(tasks.len()); + let mut reparented = HashSet::with_capacity(tasks.len()); while let Some(res) = tasks.join_next().await { match res { Ok(Some(timeline)) => { tracing::info!(reparented=%timeline.timeline_id, "reparenting done"); - reparented.push((timeline.ancestor_lsn, timeline.timeline_id)); + + assert!( + reparented.insert(timeline.timeline_id), + "duplicate reparenting? timeline_id={}", + timeline.timeline_id + ); } Ok(None) => { // lets just ignore this for now. 
one or all reparented timelines could had @@ -650,12 +673,5 @@ pub(super) async fn complete( tracing::info!("failed to reparent some candidates"); } - reparented.sort_unstable(); - - let reparented = reparented - .into_iter() - .map(|(_, timeline_id)| timeline_id) - .collect(); - Ok(reparented) } diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index fec66aabc1..07d860eb80 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -213,51 +213,45 @@ impl Timeline { let mut js = tokio::task::JoinSet::new(); { let guard = self.layers.read().await; - let layers = guard.layer_map(); - for layer in layers.iter_historic_layers() { - let layer = guard.get_from_desc(&layer); - // guard against eviction while we inspect it; it might be that eviction_task and - // disk_usage_eviction_task both select the same layers to be evicted, and - // seemingly free up double the space. both succeeding is of no consequence. + guard + .likely_resident_layers() + .filter(|layer| { + let last_activity_ts = layer.latest_activity(); - if !layer.is_likely_resident() { - continue; - } + let no_activity_for = match now.duration_since(last_activity_ts) { + Ok(d) => d, + Err(_e) => { + // We reach here if `now` < `last_activity_ts`, which can legitimately + // happen if there is an access between us getting `now`, and us getting + // the access stats from the layer. + // + // The other reason why it can happen is system clock skew because + // SystemTime::now() is not monotonic, so, even if there is no access + // to the layer after we get `now` at the beginning of this function, + // it could be that `now` < `last_activity_ts`. 
+ // + // To distinguish the cases, we would need to record `Instant`s in the + // access stats (i.e., monotonic timestamps), but then, the timestamps + // values in the access stats would need to be `Instant`'s, and hence + // they would be meaningless outside of the pageserver process. + // At the time of writing, the trade-off is that access stats are more + // valuable than detecting clock skew. + return false; + } + }; - let last_activity_ts = layer.access_stats().latest_activity(); - - let no_activity_for = match now.duration_since(last_activity_ts) { - Ok(d) => d, - Err(_e) => { - // We reach here if `now` < `last_activity_ts`, which can legitimately - // happen if there is an access between us getting `now`, and us getting - // the access stats from the layer. - // - // The other reason why it can happen is system clock skew because - // SystemTime::now() is not monotonic, so, even if there is no access - // to the layer after we get `now` at the beginning of this function, - // it could be that `now` < `last_activity_ts`. - // - // To distinguish the cases, we would need to record `Instant`s in the - // access stats (i.e., monotonic timestamps), but then, the timestamps - // values in the access stats would need to be `Instant`'s, and hence - // they would be meaningless outside of the pageserver process. - // At the time of writing, the trade-off is that access stats are more - // valuable than detecting clock skew. 
- continue; - } - }; - - if no_activity_for > p.threshold { + no_activity_for > p.threshold + }) + .cloned() + .for_each(|layer| { js.spawn(async move { layer .evict_and_wait(std::time::Duration::from_secs(5)) .await }); stats.candidates += 1; - } - } + }); }; let join_all = async move { diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 1bc2acbd34..8f20d84401 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -1,4 +1,4 @@ -use anyhow::{bail, ensure, Context, Result}; +use anyhow::{bail, ensure, Context}; use itertools::Itertools; use pageserver_api::shard::TenantShardId; use std::{collections::HashMap, sync::Arc}; @@ -24,39 +24,142 @@ use crate::{ use super::TimelineWriterState; /// Provides semantic APIs to manipulate the layer map. -#[derive(Default)] -pub(crate) struct LayerManager { - layer_map: LayerMap, - layer_fmgr: LayerFileManager, +pub(crate) enum LayerManager { + /// Open as in not shutdown layer manager; we still have in-memory layers and we can manipulate + /// the layers. + Open(OpenLayerManager), + /// Shutdown layer manager where there are no more in-memory layers and persistent layers are + /// read-only. + Closed { + layers: HashMap, + }, +} + +impl Default for LayerManager { + fn default() -> Self { + LayerManager::Open(OpenLayerManager::default()) + } } impl LayerManager { - pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer { - self.layer_fmgr.get_from_desc(desc) + pub(crate) fn get_from_key(&self, key: &PersistentLayerKey) -> Layer { + // The assumption for the `expect()` is that all code maintains the following invariant: + // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor. 
+ self.layers() + .get(key) + .with_context(|| format!("get layer from key: {key}")) + .expect("not found") + .clone() } - pub(crate) fn get_from_key(&self, desc: &PersistentLayerKey) -> Layer { - self.layer_fmgr.get_from_key(desc) + pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer { + self.get_from_key(&desc.key()) } /// Get an immutable reference to the layer map. /// /// We expect users only to be able to get an immutable layer map. If users want to make modifications, /// they should use the below semantic APIs. This design makes us step closer to immutable storage state. - pub(crate) fn layer_map(&self) -> &LayerMap { - &self.layer_map + pub(crate) fn layer_map(&self) -> Result<&LayerMap, Shutdown> { + use LayerManager::*; + match self { + Open(OpenLayerManager { layer_map, .. }) => Ok(layer_map), + Closed { .. } => Err(Shutdown), + } } + pub(crate) fn open_mut(&mut self) -> Result<&mut OpenLayerManager, Shutdown> { + use LayerManager::*; + + match self { + Open(open) => Ok(open), + Closed { .. } => Err(Shutdown), + } + } + + /// LayerManager shutdown. The in-memory layers do cleanup on drop, so we must drop them in + /// order to allow shutdown to complete. + /// + /// If there was a want to flush in-memory layers, it must have happened earlier. + pub(crate) fn shutdown(&mut self, writer_state: &mut Option) { + use LayerManager::*; + match self { + Open(OpenLayerManager { + layer_map, + layer_fmgr: LayerFileManager(hashmap), + }) => { + let open = layer_map.open_layer.take(); + let frozen = layer_map.frozen_layers.len(); + let taken_writer_state = writer_state.take(); + tracing::info!(open = open.is_some(), frozen, "dropped inmemory layers"); + let layers = std::mem::take(hashmap); + *self = Closed { layers }; + assert_eq!(open.is_some(), taken_writer_state.is_some()); + } + Closed { .. 
} => { + tracing::debug!("ignoring multiple shutdowns on layer manager") + } + } + } + + /// Sum up the historic layer sizes + pub(crate) fn layer_size_sum(&self) -> u64 { + self.layers() + .values() + .map(|l| l.layer_desc().file_size) + .sum() + } + + pub(crate) fn likely_resident_layers(&self) -> impl Iterator + '_ { + self.layers().values().filter(|l| l.is_likely_resident()) + } + + pub(crate) fn contains(&self, layer: &Layer) -> bool { + self.contains_key(&layer.layer_desc().key()) + } + + pub(crate) fn contains_key(&self, key: &PersistentLayerKey) -> bool { + self.layers().contains_key(key) + } + + pub(crate) fn all_persistent_layers(&self) -> Vec { + self.layers().keys().cloned().collect_vec() + } + + fn layers(&self) -> &HashMap { + use LayerManager::*; + match self { + Open(OpenLayerManager { layer_fmgr, .. }) => &layer_fmgr.0, + Closed { layers } => layers, + } + } +} + +#[derive(Default)] +pub(crate) struct OpenLayerManager { + layer_map: LayerMap, + layer_fmgr: LayerFileManager, +} + +impl std::fmt::Debug for OpenLayerManager { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("OpenLayerManager") + .field("layer_count", &self.layer_fmgr.0.len()) + .finish() + } +} + +#[derive(Debug, thiserror::Error)] +#[error("layer manager has been shutdown")] +pub(crate) struct Shutdown; + +impl OpenLayerManager { /// Called from `load_layer_map`. Initialize the layer manager with: /// 1. all on-disk layers /// 2. 
next open layer (with disk disk_consistent_lsn LSN) - pub(crate) fn initialize_local_layers( - &mut self, - on_disk_layers: Vec, - next_open_layer_at: Lsn, - ) { + pub(crate) fn initialize_local_layers(&mut self, layers: Vec, next_open_layer_at: Lsn) { let mut updates = self.layer_map.batch_update(); - for layer in on_disk_layers { + for layer in layers { Self::insert_historic_layer(layer, &mut updates, &mut self.layer_fmgr); } updates.flush(); @@ -68,26 +171,19 @@ impl LayerManager { self.layer_map.next_open_layer_at = Some(next_open_layer_at); } - /// Open a new writable layer to append data if there is no open layer, otherwise return the current open layer, - /// called within `get_layer_for_write`. + /// Open a new writable layer to append data if there is no open layer, otherwise return the + /// current open layer, called within `get_layer_for_write`. pub(crate) async fn get_layer_for_write( &mut self, lsn: Lsn, - last_record_lsn: Lsn, conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, + gate_guard: utils::sync::gate::GateGuard, ctx: &RequestContext, - ) -> Result> { + ) -> anyhow::Result> { ensure!(lsn.is_aligned()); - ensure!( - lsn > last_record_lsn, - "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})", - lsn, - last_record_lsn, - ); - // Do we have a layer open for writing already? let layer = if let Some(open_layer) = &self.layer_map.open_layer { if open_layer.get_lsn_range().start > lsn { @@ -113,8 +209,15 @@ impl LayerManager { lsn ); - let new_layer = - InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn, ctx).await?; + let new_layer = InMemoryLayer::create( + conf, + timeline_id, + tenant_shard_id, + start_lsn, + gate_guard, + ctx, + ) + .await?; let layer = Arc::new(new_layer); self.layer_map.open_layer = Some(layer.clone()); @@ -168,7 +271,7 @@ impl LayerManager { froze } - /// Add image layers to the layer map, called from `create_image_layers`. 
+ /// Add image layers to the layer map, called from [`super::Timeline::create_image_layers`]. pub(crate) fn track_new_image_layers( &mut self, image_layers: &[ResidentLayer], @@ -241,7 +344,7 @@ impl LayerManager { self.finish_compact_l0(compact_from, compact_to, metrics) } - /// Called when compaction is completed. + /// Called post-compaction when some previous generation image layers were trimmed. pub(crate) fn rewrite_layers( &mut self, rewrite_layers: &[(Layer, ResidentLayer)], @@ -259,13 +362,10 @@ impl LayerManager { new_layer.layer_desc().lsn_range ); - // Transfer visibilty hint from old to new layer, since the new layer covers the same key space. This is not guaranteed to + // Transfer visibility hint from old to new layer, since the new layer covers the same key space. This is not guaranteed to // be accurate (as the new layer may cover a different subset of the key range), but is a sensible default, and prevents // always marking rewritten layers as visible. - new_layer - .as_ref() - .access_stats() - .set_visibility(old_layer.access_stats().visibility()); + new_layer.as_ref().set_visibility(old_layer.visibility()); // Safety: we may never rewrite the same file in-place. Callers are responsible // for ensuring that they only rewrite layers after something changes the path, @@ -333,31 +433,6 @@ impl LayerManager { mapping.remove(layer); layer.delete_on_drop(); } - - pub(crate) fn likely_resident_layers(&self) -> impl Iterator + '_ { - // for small layer maps, we most likely have all resident, but for larger more are likely - // to be evicted assuming lots of layers correlated with longer lifespan. 
- - self.layer_map().iter_historic_layers().filter_map(|desc| { - self.layer_fmgr - .0 - .get(&desc.key()) - .filter(|l| l.is_likely_resident()) - .cloned() - }) - } - - pub(crate) fn contains(&self, layer: &Layer) -> bool { - self.layer_fmgr.contains(layer) - } - - pub(crate) fn contains_key(&self, key: &PersistentLayerKey) -> bool { - self.layer_fmgr.contains_key(key) - } - - pub(crate) fn all_persistent_layers(&self) -> Vec { - self.layer_fmgr.0.keys().cloned().collect_vec() - } } pub(crate) struct LayerFileManager(HashMap); @@ -369,24 +444,6 @@ impl Default for LayerFileManager { } impl LayerFileManager { - fn get_from_key(&self, key: &PersistentLayerKey) -> T { - // The assumption for the `expect()` is that all code maintains the following invariant: - // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor. - self.0 - .get(key) - .with_context(|| format!("get layer from key: {}", key)) - .expect("not found") - .clone() - } - - fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T { - self.get_from_key(&desc.key()) - } - - fn contains_key(&self, key: &PersistentLayerKey) -> bool { - self.0.contains_key(key) - } - pub(crate) fn insert(&mut self, layer: T) { let present = self.0.insert(layer.layer_desc().key(), layer.clone()); if present.is_some() && cfg!(debug_assertions) { @@ -394,10 +451,6 @@ impl LayerFileManager { } } - pub(crate) fn contains(&self, layer: &T) -> bool { - self.0.contains_key(&layer.layer_desc().key()) - } - pub(crate) fn remove(&mut self, layer: &T) { let present = self.0.remove(&layer.layer_desc().key()); if present.is_none() && cfg!(debug_assertions) { diff --git a/pageserver/src/tenant/timeline/logical_size.rs b/pageserver/src/tenant/timeline/logical_size.rs index b0d6c4a27a..f4a4eea54a 100644 --- a/pageserver/src/tenant/timeline/logical_size.rs +++ b/pageserver/src/tenant/timeline/logical_size.rs @@ -122,6 +122,10 @@ impl CurrentLogicalSize { Self::Exact(_) => Accuracy::Exact, } 
} + + pub(crate) fn is_exact(&self) -> bool { + matches!(self, Self::Exact(_)) + } } impl LogicalSize { diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 51b0c420c3..27f6fe90a4 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -30,10 +30,12 @@ use tokio::time::Instant; pub use pageserver_api::models::virtual_file as api; pub(crate) mod io_engine; pub use io_engine::feature_test as io_engine_feature_test; +pub use io_engine::io_engine_for_bench; pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult; mod metadata; mod open_options; use self::owned_buffers_io::write::OwnedAsyncWriter; +pub(crate) use api::DirectIoMode; pub(crate) use io_engine::IoEngineKind; pub(crate) use metadata::Metadata; pub(crate) use open_options::*; diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index 2820cea097..0ffcd9fa05 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -328,3 +328,29 @@ pub fn feature_test() -> anyhow::Result { .join() .unwrap() } + +/// For use in benchmark binaries only. +/// +/// Benchmarks which initialize `virtual_file` need to know what engine to use, but we also +/// don't want to silently fall back to slower I/O engines in a benchmark: this could waste +/// developer time trying to figure out why it's slow. +/// +/// In practice, this method will either return IoEngineKind::TokioEpollUring, or panic. 
+pub fn io_engine_for_bench() -> IoEngineKind { + #[cfg(not(target_os = "linux"))] + { + panic!("This benchmark does I/O and can only give a representative result on Linux"); + } + #[cfg(target_os = "linux")] + { + match feature_test().unwrap() { + FeatureTestResult::PlatformPreferred(engine) => engine, + FeatureTestResult::Worse { + engine: _engine, + remark, + } => { + panic!("This benchmark does I/O can requires the preferred I/O engine: {remark}"); + } + } + } +} diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c index 93252e6b29..de023da5c4 100644 --- a/pgxn/neon/control_plane_connector.c +++ b/pgxn/neon/control_plane_connector.c @@ -45,6 +45,7 @@ static const char *jwt_token = NULL; /* GUCs */ static char *ConsoleURL = NULL; static bool ForwardDDL = true; +static bool RegressTestMode = false; /* * CURL docs say that this buffer must exist until we call curl_easy_cleanup @@ -802,6 +803,14 @@ NeonProcessUtility( case T_DropRoleStmt: HandleDropRole(castNode(DropRoleStmt, parseTree)); break; + case T_CreateTableSpaceStmt: + if (!RegressTestMode) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("CREATE TABLESPACE is not supported on Neon"))); + } + break; default: break; } @@ -864,6 +873,18 @@ InitControlPlaneConnector() NULL, NULL); + DefineCustomBoolVariable( + "neon.regress_test_mode", + "Controls whether we are running in the regression test mode", + NULL, + &RegressTestMode, + false, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + jwt_token = getenv("NEON_CONTROL_PLANE_TOKEN"); if (!jwt_token) { diff --git a/poetry.lock b/poetry.lock index d7a3dde65b..7db91e51f7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,91 +1,103 @@ # This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. 
+[[package]] +name = "aiohappyeyeballs" +version = "2.3.5" +description = "Happy Eyeballs for asyncio" +optional = false +python-versions = ">=3.8" +files = [ + {file = "aiohappyeyeballs-2.3.5-py3-none-any.whl", hash = "sha256:4d6dea59215537dbc746e93e779caea8178c866856a721c9c660d7a5a7b8be03"}, + {file = "aiohappyeyeballs-2.3.5.tar.gz", hash = "sha256:6fa48b9f1317254f122a07a131a86b71ca6946ca989ce6326fff54a99a920105"}, +] + [[package]] name = "aiohttp" -version = "3.9.4" +version = "3.10.2" description = "Async http client/server framework (asyncio)" optional = false python-versions = ">=3.8" files = [ - {file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:76d32588ef7e4a3f3adff1956a0ba96faabbdee58f2407c122dd45aa6e34f372"}, - {file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:56181093c10dbc6ceb8a29dfeea1e815e1dfdc020169203d87fd8d37616f73f9"}, - {file = "aiohttp-3.9.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7a5b676d3c65e88b3aca41816bf72831898fcd73f0cbb2680e9d88e819d1e4d"}, - {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1df528a85fb404899d4207a8d9934cfd6be626e30e5d3a5544a83dbae6d8a7e"}, - {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f595db1bceabd71c82e92df212dd9525a8a2c6947d39e3c994c4f27d2fe15b11"}, - {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c0b09d76e5a4caac3d27752027fbd43dc987b95f3748fad2b924a03fe8632ad"}, - {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:689eb4356649ec9535b3686200b231876fb4cab4aca54e3bece71d37f50c1d13"}, - {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a3666cf4182efdb44d73602379a66f5fdfd5da0db5e4520f0ac0dcca644a3497"}, - {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = 
"sha256:b65b0f8747b013570eea2f75726046fa54fa8e0c5db60f3b98dd5d161052004a"}, - {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a1885d2470955f70dfdd33a02e1749613c5a9c5ab855f6db38e0b9389453dce7"}, - {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0593822dcdb9483d41f12041ff7c90d4d1033ec0e880bcfaf102919b715f47f1"}, - {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:47f6eb74e1ecb5e19a78f4a4228aa24df7fbab3b62d4a625d3f41194a08bd54f"}, - {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c8b04a3dbd54de6ccb7604242fe3ad67f2f3ca558f2d33fe19d4b08d90701a89"}, - {file = "aiohttp-3.9.4-cp310-cp310-win32.whl", hash = "sha256:8a78dfb198a328bfb38e4308ca8167028920fb747ddcf086ce706fbdd23b2926"}, - {file = "aiohttp-3.9.4-cp310-cp310-win_amd64.whl", hash = "sha256:e78da6b55275987cbc89141a1d8e75f5070e577c482dd48bd9123a76a96f0bbb"}, - {file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c111b3c69060d2bafc446917534150fd049e7aedd6cbf21ba526a5a97b4402a5"}, - {file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:efbdd51872cf170093998c87ccdf3cb5993add3559341a8e5708bcb311934c94"}, - {file = "aiohttp-3.9.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7bfdb41dc6e85d8535b00d73947548a748e9534e8e4fddd2638109ff3fb081df"}, - {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2bd9d334412961125e9f68d5b73c1d0ab9ea3f74a58a475e6b119f5293eee7ba"}, - {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:35d78076736f4a668d57ade00c65d30a8ce28719d8a42471b2a06ccd1a2e3063"}, - {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:824dff4f9f4d0f59d0fa3577932ee9a20e09edec8a2f813e1d6b9f89ced8293f"}, - {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:52b8b4e06fc15519019e128abedaeb56412b106ab88b3c452188ca47a25c4093"}, - {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eae569fb1e7559d4f3919965617bb39f9e753967fae55ce13454bec2d1c54f09"}, - {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:69b97aa5792428f321f72aeb2f118e56893371f27e0b7d05750bcad06fc42ca1"}, - {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4d79aad0ad4b980663316f26d9a492e8fab2af77c69c0f33780a56843ad2f89e"}, - {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:d6577140cd7db19e430661e4b2653680194ea8c22c994bc65b7a19d8ec834403"}, - {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:9860d455847cd98eb67897f5957b7cd69fbcb436dd3f06099230f16a66e66f79"}, - {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:69ff36d3f8f5652994e08bd22f093e11cfd0444cea310f92e01b45a4e46b624e"}, - {file = "aiohttp-3.9.4-cp311-cp311-win32.whl", hash = "sha256:e27d3b5ed2c2013bce66ad67ee57cbf614288bda8cdf426c8d8fe548316f1b5f"}, - {file = "aiohttp-3.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:d6a67e26daa686a6fbdb600a9af8619c80a332556245fa8e86c747d226ab1a1e"}, - {file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:c5ff8ff44825736a4065d8544b43b43ee4c6dd1530f3a08e6c0578a813b0aa35"}, - {file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d12a244627eba4e9dc52cbf924edef905ddd6cafc6513849b4876076a6f38b0e"}, - {file = "aiohttp-3.9.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dcad56c8d8348e7e468899d2fb3b309b9bc59d94e6db08710555f7436156097f"}, - {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f7e69a7fd4b5ce419238388e55abd220336bd32212c673ceabc57ccf3d05b55"}, - {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:c4870cb049f10d7680c239b55428916d84158798eb8f353e74fa2c98980dcc0b"}, - {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b2feaf1b7031ede1bc0880cec4b0776fd347259a723d625357bb4b82f62687b"}, - {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:939393e8c3f0a5bcd33ef7ace67680c318dc2ae406f15e381c0054dd658397de"}, - {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d2334e387b2adcc944680bebcf412743f2caf4eeebd550f67249c1c3696be04"}, - {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e0198ea897680e480845ec0ffc5a14e8b694e25b3f104f63676d55bf76a82f1a"}, - {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e40d2cd22914d67c84824045861a5bb0fb46586b15dfe4f046c7495bf08306b2"}, - {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:aba80e77c227f4234aa34a5ff2b6ff30c5d6a827a91d22ff6b999de9175d71bd"}, - {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:fb68dc73bc8ac322d2e392a59a9e396c4f35cb6fdbdd749e139d1d6c985f2527"}, - {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:f3460a92638dce7e47062cf088d6e7663adb135e936cb117be88d5e6c48c9d53"}, - {file = "aiohttp-3.9.4-cp312-cp312-win32.whl", hash = "sha256:32dc814ddbb254f6170bca198fe307920f6c1308a5492f049f7f63554b88ef36"}, - {file = "aiohttp-3.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:63f41a909d182d2b78fe3abef557fcc14da50c7852f70ae3be60e83ff64edba5"}, - {file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:c3770365675f6be220032f6609a8fbad994d6dcf3ef7dbcf295c7ee70884c9af"}, - {file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:305edae1dea368ce09bcb858cf5a63a064f3bff4767dec6fa60a0cc0e805a1d3"}, - {file = "aiohttp-3.9.4-cp38-cp38-macosx_11_0_arm64.whl", hash = 
"sha256:6f121900131d116e4a93b55ab0d12ad72573f967b100e49086e496a9b24523ea"}, - {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b71e614c1ae35c3d62a293b19eface83d5e4d194e3eb2fabb10059d33e6e8cbf"}, - {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:419f009fa4cfde4d16a7fc070d64f36d70a8d35a90d71aa27670bba2be4fd039"}, - {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7b39476ee69cfe64061fd77a73bf692c40021f8547cda617a3466530ef63f947"}, - {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b33f34c9c7decdb2ab99c74be6443942b730b56d9c5ee48fb7df2c86492f293c"}, - {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c78700130ce2dcebb1a8103202ae795be2fa8c9351d0dd22338fe3dac74847d9"}, - {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:268ba22d917655d1259af2d5659072b7dc11b4e1dc2cb9662fdd867d75afc6a4"}, - {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:17e7c051f53a0d2ebf33013a9cbf020bb4e098c4bc5bce6f7b0c962108d97eab"}, - {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:7be99f4abb008cb38e144f85f515598f4c2c8932bf11b65add0ff59c9c876d99"}, - {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:d58a54d6ff08d2547656356eea8572b224e6f9bbc0cf55fa9966bcaac4ddfb10"}, - {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7673a76772bda15d0d10d1aa881b7911d0580c980dbd16e59d7ba1422b2d83cd"}, - {file = "aiohttp-3.9.4-cp38-cp38-win32.whl", hash = "sha256:e4370dda04dc8951012f30e1ce7956a0a226ac0714a7b6c389fb2f43f22a250e"}, - {file = "aiohttp-3.9.4-cp38-cp38-win_amd64.whl", hash = "sha256:eb30c4510a691bb87081192a394fb661860e75ca3896c01c6d186febe7c88530"}, - {file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_universal2.whl", 
hash = "sha256:84e90494db7df3be5e056f91412f9fa9e611fbe8ce4aaef70647297f5943b276"}, - {file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7d4845f8501ab28ebfdbeab980a50a273b415cf69e96e4e674d43d86a464df9d"}, - {file = "aiohttp-3.9.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:69046cd9a2a17245c4ce3c1f1a4ff8c70c7701ef222fce3d1d8435f09042bba1"}, - {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b73a06bafc8dcc508420db43b4dd5850e41e69de99009d0351c4f3007960019"}, - {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:418bb0038dfafeac923823c2e63226179976c76f981a2aaad0ad5d51f2229bca"}, - {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:71a8f241456b6c2668374d5d28398f8e8cdae4cce568aaea54e0f39359cd928d"}, - {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:935c369bf8acc2dc26f6eeb5222768aa7c62917c3554f7215f2ead7386b33748"}, - {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74e4e48c8752d14ecfb36d2ebb3d76d614320570e14de0a3aa7a726ff150a03c"}, - {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:916b0417aeddf2c8c61291238ce25286f391a6acb6f28005dd9ce282bd6311b6"}, - {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9b6787b6d0b3518b2ee4cbeadd24a507756ee703adbac1ab6dc7c4434b8c572a"}, - {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:221204dbda5ef350e8db6287937621cf75e85778b296c9c52260b522231940ed"}, - {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:10afd99b8251022ddf81eaed1d90f5a988e349ee7d779eb429fb07b670751e8c"}, - {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2506d9f7a9b91033201be9ffe7d89c6a54150b0578803cce5cb84a943d075bc3"}, - {file = 
"aiohttp-3.9.4-cp39-cp39-win32.whl", hash = "sha256:e571fdd9efd65e86c6af2f332e0e95dad259bfe6beb5d15b3c3eca3a6eb5d87b"}, - {file = "aiohttp-3.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:7d29dd5319d20aa3b7749719ac9685fbd926f71ac8c77b2477272725f882072d"}, - {file = "aiohttp-3.9.4.tar.gz", hash = "sha256:6ff71ede6d9a5a58cfb7b6fffc83ab5d4a63138276c771ac91ceaaddf5459644"}, + {file = "aiohttp-3.10.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:95213b3d79c7e387144e9cb7b9d2809092d6ff2c044cb59033aedc612f38fb6d"}, + {file = "aiohttp-3.10.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1aa005f060aff7124cfadaa2493f00a4e28ed41b232add5869e129a2e395935a"}, + {file = "aiohttp-3.10.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:eabe6bf4c199687592f5de4ccd383945f485779c7ffb62a9b9f1f8a3f9756df8"}, + {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96e010736fc16d21125c7e2dc5c350cd43c528b85085c04bf73a77be328fe944"}, + {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:99f81f9c1529fd8e03be4a7bd7df32d14b4f856e90ef6e9cbad3415dbfa9166c"}, + {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d611d1a01c25277bcdea06879afbc11472e33ce842322496b211319aa95441bb"}, + {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e00191d38156e09e8c81ef3d75c0d70d4f209b8381e71622165f22ef7da6f101"}, + {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74c091a5ded6cb81785de2d7a8ab703731f26de910dbe0f3934eabef4ae417cc"}, + {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:18186a80ec5a701816adbf1d779926e1069392cf18504528d6e52e14b5920525"}, + {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:5a7ceb2a0d2280f23a02c64cd0afdc922079bb950400c3dd13a1ab2988428aac"}, + {file = 
"aiohttp-3.10.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:8bd7be6ff6c162a60cb8fce65ee879a684fbb63d5466aba3fa5b9288eb04aefa"}, + {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:fae962b62944eaebff4f4fddcf1a69de919e7b967136a318533d82d93c3c6bd1"}, + {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:a0fde16d284efcacbe15fb0c1013f0967b6c3e379649239d783868230bf1db42"}, + {file = "aiohttp-3.10.2-cp310-cp310-win32.whl", hash = "sha256:f81cd85a0e76ec7b8e2b6636fe02952d35befda4196b8c88f3cec5b4fb512839"}, + {file = "aiohttp-3.10.2-cp310-cp310-win_amd64.whl", hash = "sha256:54ba10eb5a3481c28282eb6afb5f709aedf53cf9c3a31875ffbdc9fc719ffd67"}, + {file = "aiohttp-3.10.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:87fab7f948e407444c2f57088286e00e2ed0003ceaf3d8f8cc0f60544ba61d91"}, + {file = "aiohttp-3.10.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ec6ad66ed660d46503243cbec7b2b3d8ddfa020f984209b3b8ef7d98ce69c3f2"}, + {file = "aiohttp-3.10.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a4be88807283bd96ae7b8e401abde4ca0bab597ba73b5e9a2d98f36d451e9aac"}, + {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:01c98041f90927c2cbd72c22a164bb816fa3010a047d264969cf82e1d4bcf8d1"}, + {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54e36c67e1a9273ecafab18d6693da0fb5ac48fd48417e4548ac24a918c20998"}, + {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7de3ddb6f424af54535424082a1b5d1ae8caf8256ebd445be68c31c662354720"}, + {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7dd9c7db94b4692b827ce51dcee597d61a0e4f4661162424faf65106775b40e7"}, + {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:e57e21e1167705f8482ca29cc5d02702208d8bf4aff58f766d94bcd6ead838cd"}, + {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:a1a50e59b720060c29e2951fd9f13c01e1ea9492e5a527b92cfe04dd64453c16"}, + {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:686c87782481fda5ee6ba572d912a5c26d9f98cc5c243ebd03f95222af3f1b0f"}, + {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:dafb4abb257c0ed56dc36f4e928a7341b34b1379bd87e5a15ce5d883c2c90574"}, + {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:494a6f77560e02bd7d1ab579fdf8192390567fc96a603f21370f6e63690b7f3d"}, + {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6fe8503b1b917508cc68bf44dae28823ac05e9f091021e0c41f806ebbb23f92f"}, + {file = "aiohttp-3.10.2-cp311-cp311-win32.whl", hash = "sha256:4ddb43d06ce786221c0dfd3c91b4892c318eaa36b903f7c4278e7e2fa0dd5102"}, + {file = "aiohttp-3.10.2-cp311-cp311-win_amd64.whl", hash = "sha256:ca2f5abcb0a9a47e56bac173c01e9f6c6e7f27534d91451c5f22e6a35a5a2093"}, + {file = "aiohttp-3.10.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:14eb6b17f6246959fb0b035d4f4ae52caa870c4edfb6170aad14c0de5bfbf478"}, + {file = "aiohttp-3.10.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:465e445ec348d4e4bd349edd8b22db75f025da9d7b6dc1369c48e7935b85581e"}, + {file = "aiohttp-3.10.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:341f8ece0276a828d95b70cd265d20e257f5132b46bf77d759d7f4e0443f2906"}, + {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c01fbb87b5426381cd9418b3ddcf4fc107e296fa2d3446c18ce6c76642f340a3"}, + {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2c474af073e1a6763e1c5522bbb2d85ff8318197e4c6c919b8d7886e16213345"}, + {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:d9076810a5621236e29b2204e67a68e1fe317c8727ee4c9abbfbb1083b442c38"}, + {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8f515d6859e673940e08de3922b9c4a2249653b0ac181169313bd6e4b1978ac"}, + {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:655e583afc639bef06f3b2446972c1726007a21003cd0ef57116a123e44601bc"}, + {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8da9449a575133828cc99985536552ea2dcd690e848f9d41b48d8853a149a959"}, + {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:19073d57d0feb1865d12361e2a1f5a49cb764bf81a4024a3b608ab521568093a"}, + {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c8e98e1845805f184d91fda6f9ab93d7c7b0dddf1c07e0255924bfdb151a8d05"}, + {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:377220a5efde6f9497c5b74649b8c261d3cce8a84cb661be2ed8099a2196400a"}, + {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:92f7f4a4dc9cdb5980973a74d43cdbb16286dacf8d1896b6c3023b8ba8436f8e"}, + {file = "aiohttp-3.10.2-cp312-cp312-win32.whl", hash = "sha256:9bb2834a6f11d65374ce97d366d6311a9155ef92c4f0cee543b2155d06dc921f"}, + {file = "aiohttp-3.10.2-cp312-cp312-win_amd64.whl", hash = "sha256:518dc3cb37365255708283d1c1c54485bbacccd84f0a0fb87ed8917ba45eda5b"}, + {file = "aiohttp-3.10.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:7f98e70bbbf693086efe4b86d381efad8edac040b8ad02821453083d15ec315f"}, + {file = "aiohttp-3.10.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9f6f0b252a009e98fe84028a4ec48396a948e7a65b8be06ccfc6ef68cf1f614d"}, + {file = "aiohttp-3.10.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9360e3ffc7b23565600e729e8c639c3c50d5520e05fdf94aa2bd859eef12c407"}, + {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:3988044d1635c7821dd44f0edfbe47e9875427464e59d548aece447f8c22800a"}, + {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:30a9d59da1543a6f1478c3436fd49ec59be3868bca561a33778b4391005e499d"}, + {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f9f49bdb94809ac56e09a310a62f33e5f22973d6fd351aac72a39cd551e98194"}, + {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddfd2dca3f11c365d6857a07e7d12985afc59798458a2fdb2ffa4a0332a3fd43"}, + {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:685c1508ec97b2cd3e120bfe309a4ff8e852e8a7460f1ef1de00c2c0ed01e33c"}, + {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:49904f38667c44c041a0b44c474b3ae36948d16a0398a8f8cd84e2bb3c42a069"}, + {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:352f3a4e5f11f3241a49b6a48bc5b935fabc35d1165fa0d87f3ca99c1fcca98b"}, + {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:fc61f39b534c5d5903490478a0dd349df397d2284a939aa3cbaa2fb7a19b8397"}, + {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:ad2274e707be37420d0b6c3d26a8115295fe9d8e6e530fa6a42487a8ca3ad052"}, + {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:c836bf3c7512100219fe1123743fd8dd9a2b50dd7cfb0c3bb10d041309acab4b"}, + {file = "aiohttp-3.10.2-cp38-cp38-win32.whl", hash = "sha256:53e8898adda402be03ff164b0878abe2d884e3ea03a4701e6ad55399d84b92dc"}, + {file = "aiohttp-3.10.2-cp38-cp38-win_amd64.whl", hash = "sha256:7cc8f65f5b22304693de05a245b6736b14cb5bc9c8a03da6e2ae9ef15f8b458f"}, + {file = "aiohttp-3.10.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:9dfc906d656e14004c5bc672399c1cccc10db38df2b62a13fb2b6e165a81c316"}, + {file = "aiohttp-3.10.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:91b10208b222ddf655c3a3d5b727879d7163db12b634492df41a9182a76edaae"}, + {file = "aiohttp-3.10.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9fd16b5e1a7bdd14668cd6bde60a2a29b49147a535c74f50d8177d11b38433a7"}, + {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2bfdda4971bd79201f59adbad24ec2728875237e1c83bba5221284dbbf57bda"}, + {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:69d73f869cf29e8a373127fc378014e2b17bcfbe8d89134bc6fb06a2f67f3cb3"}, + {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:df59f8486507c421c0620a2c3dce81fbf1d54018dc20ff4fecdb2c106d6e6abc"}, + {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0df930015db36b460aa9badbf35eccbc383f00d52d4b6f3de2ccb57d064a6ade"}, + {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:562b1153ab7f766ee6b8b357ec777a302770ad017cf18505d34f1c088fccc448"}, + {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:d984db6d855de58e0fde1ef908d48fe9a634cadb3cf715962722b4da1c40619d"}, + {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:14dc3fcb0d877911d775d511eb617a486a8c48afca0a887276e63db04d3ee920"}, + {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:b52a27a5c97275e254704e1049f4b96a81e67d6205f52fa37a4777d55b0e98ef"}, + {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:cd33d9de8cfd006a0d0fe85f49b4183c57e91d18ffb7e9004ce855e81928f704"}, + {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1238fc979160bc03a92fff9ad021375ff1c8799c6aacb0d8ea1b357ea40932bb"}, + {file = "aiohttp-3.10.2-cp39-cp39-win32.whl", hash = "sha256:e2f43d238eae4f0b04f58d4c0df4615697d4ca3e9f9b1963d49555a94f0f5a04"}, + {file = 
"aiohttp-3.10.2-cp39-cp39-win_amd64.whl", hash = "sha256:947847f07a8f81d7b39b2d0202fd73e61962ebe17ac2d8566f260679e467da7b"}, + {file = "aiohttp-3.10.2.tar.gz", hash = "sha256:4d1f694b5d6e459352e5e925a42e05bac66655bfde44d81c59992463d2897014"}, ] [package.dependencies] +aiohappyeyeballs = ">=2.3.0" aiosignal = ">=1.1.2" async-timeout = {version = ">=4.0,<5.0", markers = "python_version < \"3.11\""} attrs = ">=17.3.0" @@ -94,7 +106,7 @@ multidict = ">=4.5,<7.0" yarl = ">=1.0,<2.0" [package.extras] -speedups = ["Brotli", "aiodns", "brotlicffi"] +speedups = ["Brotli", "aiodns (>=3.2.0)", "brotlicffi"] [[package]] name = "aiopg" @@ -1514,6 +1526,20 @@ files = [ [package.dependencies] six = "*" +[[package]] +name = "kafka-python" +version = "2.0.2" +description = "Pure Python client for Apache Kafka" +optional = false +python-versions = "*" +files = [ + {file = "kafka-python-2.0.2.tar.gz", hash = "sha256:04dfe7fea2b63726cd6f3e79a2d86e709d608d74406638c5da33a01d45a9d7e3"}, + {file = "kafka_python-2.0.2-py2.py3-none-any.whl", hash = "sha256:2d92418c7cb1c298fa6c7f0fb3519b520d0d7526ac6cb7ae2a4fc65a51a94b6e"}, +] + +[package.extras] +crc32c = ["crc32c"] + [[package]] name = "lazy-object-proxy" version = "1.10.0" @@ -3357,4 +3383,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "7cee6a8c30bc7f4bfb0a87c6bad3952dfb4da127fad853d2710a93ac3eab8a00" +content-hash = "c09bcb333ab550958b33dbf4fec968c500d8e701fd4c96402cddbd9bb8048055" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 2f18b5fbc6..b316c53034 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -92,6 +92,7 @@ tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true tracing.workspace = true +try-lock.workspace = true typed-json.workspace = true url.workspace = true urlencoding.workspace = true diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 67c4dd019e..90dea01bf3 100644 --- 
a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -218,7 +218,7 @@ impl RateBucketInfo { impl AuthenticationConfig { pub fn check_rate_limit( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, config: &AuthenticationConfig, secret: AuthSecret, endpoint: &EndpointId, @@ -243,7 +243,7 @@ impl AuthenticationConfig { let limit_not_exceeded = self.rate_limiter.check( ( endpoint_int, - MaskedIp::new(ctx.peer_addr, config.rate_limit_ip_subnet), + MaskedIp::new(ctx.peer_addr(), config.rate_limit_ip_subnet), ), password_weight, ); @@ -274,7 +274,7 @@ impl AuthenticationConfig { /// /// All authentication flows will emit an AuthenticationOk message if successful. async fn auth_quirks( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, api: &impl console::Api, user_info: ComputeUserInfoMaybeEndpoint, client: &mut stream::PqStream>, @@ -303,8 +303,8 @@ async fn auth_quirks( let (allowed_ips, maybe_secret) = api.get_allowed_ips_and_secret(ctx, &info).await?; // check allowed list - if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) { - return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr)); + if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) { + return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr())); } if !endpoint_rate_limiter.check(info.endpoint.clone().into(), 1) { @@ -356,7 +356,7 @@ async fn auth_quirks( } async fn authenticate_with_secret( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, secret: AuthSecret, info: ComputeUserInfo, client: &mut stream::PqStream>, @@ -421,7 +421,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)] pub async fn authenticate( self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, @@ -467,7 +467,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { 
impl BackendType<'_, ComputeUserInfo, &()> { pub async fn get_role_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, ) -> Result { use BackendType::*; match self { @@ -478,7 +478,7 @@ impl BackendType<'_, ComputeUserInfo, &()> { pub async fn get_allowed_ips_and_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { use BackendType::*; match self { @@ -492,7 +492,7 @@ impl BackendType<'_, ComputeUserInfo, &()> { impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> { async fn wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, ) -> Result { use BackendType::*; @@ -514,7 +514,7 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> { impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> { async fn wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, ) -> Result { use BackendType::*; @@ -571,7 +571,7 @@ mod tests { impl console::Api for Auth { async fn get_role_secret( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, _user_info: &super::ComputeUserInfo, ) -> Result { Ok(CachedRoleSecret::new_uncached(Some(self.secret.clone()))) @@ -579,7 +579,7 @@ mod tests { async fn get_allowed_ips_and_secret( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, _user_info: &super::ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), console::errors::GetAuthInfoError> { @@ -591,7 +591,7 @@ mod tests { async fn wake_compute( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, _user_info: &super::ComputeUserInfo, ) -> Result { unimplemented!() @@ -665,7 +665,7 @@ mod tests { let (mut client, server) = tokio::io::duplex(1024); let mut stream = PqStream::new(Stream::from_raw(server)); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let api = Auth { ips: vec![], secret: 
AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), @@ -723,7 +723,7 @@ mod tests { )); let _creds = auth_quirks( - &mut ctx, + &ctx, &api, user_info, &mut stream, @@ -742,7 +742,7 @@ mod tests { let (mut client, server) = tokio::io::duplex(1024); let mut stream = PqStream::new(Stream::from_raw(server)); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let api = Auth { ips: vec![], secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), @@ -775,7 +775,7 @@ mod tests { )); let _creds = auth_quirks( - &mut ctx, + &ctx, &api, user_info, &mut stream, @@ -794,7 +794,7 @@ mod tests { let (mut client, server) = tokio::io::duplex(1024); let mut stream = PqStream::new(Stream::from_raw(server)); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let api = Auth { ips: vec![], secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), @@ -828,7 +828,7 @@ mod tests { )); let creds = auth_quirks( - &mut ctx, + &ctx, &api, user_info, &mut stream, diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index b98fa63120..285fa29428 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -12,7 +12,7 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; pub(super) async fn authenticate( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, creds: ComputeUserInfo, client: &mut PqStream>, config: &'static AuthenticationConfig, @@ -27,7 +27,7 @@ pub(super) async fn authenticate( } AuthSecret::Scram(secret) => { info!("auth endpoint chooses SCRAM"); - let scram = auth::Scram(&secret, &mut *ctx); + let scram = auth::Scram(&secret, ctx); let auth_outcome = tokio::time::timeout( config.scram_protocol_timeout, diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index 6b0f5e1726..56921dd949 100644 --- a/proxy/src/auth/backend/hacks.rs +++ 
b/proxy/src/auth/backend/hacks.rs @@ -18,7 +18,7 @@ use tracing::{info, warn}; /// These properties are benefical for serverless JS workers, so we /// use this mechanism for websocket connections. pub async fn authenticate_cleartext( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, info: ComputeUserInfo, client: &mut stream::PqStream>, secret: AuthSecret, @@ -28,7 +28,7 @@ pub async fn authenticate_cleartext( ctx.set_auth_method(crate::context::AuthMethod::Cleartext); // pause the timer while we communicate with the client - let paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client); + let paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client); let ep = EndpointIdInt::from(&info.endpoint); @@ -60,7 +60,7 @@ pub async fn authenticate_cleartext( /// Similar to [`authenticate_cleartext`], but there's a specific password format, /// and passwords are not yet validated (we don't know how to validate them!) pub async fn password_hack_no_authentication( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, info: ComputeUserInfoNoEndpoint, client: &mut stream::PqStream>, ) -> auth::Result { @@ -68,7 +68,7 @@ pub async fn password_hack_no_authentication( ctx.set_auth_method(crate::context::AuthMethod::Cleartext); // pause the timer while we communicate with the client - let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client); + let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client); let payload = AuthFlow::new(client) .begin(auth::PasswordHack) diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index 5932e1337c..95f4614736 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -57,7 +57,7 @@ pub fn new_psql_session_id() -> String { } pub(super) async fn authenticate( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, link_uri: &reqwest::Url, client: &mut PqStream, ) -> auth::Result { diff --git a/proxy/src/auth/credentials.rs 
b/proxy/src/auth/credentials.rs index d06f5614f1..8f4a392131 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -84,7 +84,7 @@ pub fn endpoint_sni( impl ComputeUserInfoMaybeEndpoint { pub fn parse( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, params: &StartupMessageParams, sni: Option<&str>, common_names: Option<&HashSet>, @@ -249,8 +249,8 @@ mod tests { fn parse_bare_minimum() -> anyhow::Result<()> { // According to postgresql, only `user` should be required. let options = StartupMessageParams::new([("user", "john_doe")]); - let mut ctx = RequestMonitoring::test(); - let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + let ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id, None); @@ -264,8 +264,8 @@ mod tests { ("database", "world"), // should be ignored ("foo", "bar"), // should be ignored ]); - let mut ctx = RequestMonitoring::test(); - let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + let ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id, None); @@ -279,9 +279,9 @@ mod tests { let sni = Some("foo.localhost"); let common_names = Some(["localhost".into()].into()); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let user_info = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id.as_deref(), Some("foo")); assert_eq!(user_info.options.get_cache_key("foo"), "foo"); @@ -296,8 +296,8 @@ mod tests { ("options", "-ckey=1 project=bar -c geqo=off"), ]); - let 
mut ctx = RequestMonitoring::test(); - let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + let ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id.as_deref(), Some("bar")); @@ -311,8 +311,8 @@ mod tests { ("options", "-ckey=1 endpoint=bar -c geqo=off"), ]); - let mut ctx = RequestMonitoring::test(); - let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + let ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id.as_deref(), Some("bar")); @@ -329,8 +329,8 @@ mod tests { ), ]); - let mut ctx = RequestMonitoring::test(); - let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + let ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert!(user_info.endpoint_id.is_none()); @@ -344,8 +344,8 @@ mod tests { ("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"), ]); - let mut ctx = RequestMonitoring::test(); - let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + let ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert!(user_info.endpoint_id.is_none()); @@ -359,9 +359,9 @@ mod tests { let sni = Some("baz.localhost"); let common_names = Some(["localhost".into()].into()); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let user_info = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; 
assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id.as_deref(), Some("baz")); @@ -374,16 +374,16 @@ mod tests { let common_names = Some(["a.com".into(), "b.com".into()].into()); let sni = Some("p1.a.com"); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let user_info = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.endpoint_id.as_deref(), Some("p1")); let common_names = Some(["a.com".into(), "b.com".into()].into()); let sni = Some("p1.b.com"); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let user_info = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.endpoint_id.as_deref(), Some("p1")); Ok(()) @@ -397,10 +397,9 @@ mod tests { let sni = Some("second.localhost"); let common_names = Some(["localhost".into()].into()); - let mut ctx = RequestMonitoring::test(); - let err = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref()) - .expect_err("should fail"); + let ctx = RequestMonitoring::test(); + let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref()) + .expect_err("should fail"); match err { InconsistentProjectNames { domain, option } => { assert_eq!(option, "first"); @@ -417,10 +416,9 @@ mod tests { let sni = Some("project.localhost"); let common_names = Some(["example.com".into()].into()); - let mut ctx = RequestMonitoring::test(); - let err = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref()) - .expect_err("should fail"); + let ctx = RequestMonitoring::test(); + let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref()) + .expect_err("should fail"); match err { 
UnknownCommonName { cn } => { assert_eq!(cn, "localhost"); @@ -438,9 +436,9 @@ mod tests { let sni = Some("project.localhost"); let common_names = Some(["localhost".into()].into()); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let user_info = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.endpoint_id.as_deref(), Some("project")); assert_eq!( user_info.options.get_cache_key("project"), diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 59d1ac17f4..acf7b4f6b6 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -27,7 +27,7 @@ pub trait AuthMethod { pub struct Begin; /// Use [SCRAM](crate::scram)-based auth in [`AuthFlow`]. -pub struct Scram<'a>(pub &'a scram::ServerSecret, pub &'a mut RequestMonitoring); +pub struct Scram<'a>(pub &'a scram::ServerSecret, pub &'a RequestMonitoring); impl AuthMethod for Scram<'_> { #[inline(always)] @@ -155,7 +155,7 @@ impl AuthFlow<'_, S, Scram<'_>> { let Scram(secret, ctx) = self.state; // pause the timer while we communicate with the client - let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client); + let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client); // Initial client message contains the chosen auth method's name. 
let msg = self.stream.read_password_message().await?; @@ -168,10 +168,8 @@ impl AuthFlow<'_, S, Scram<'_>> { } match sasl.method { - SCRAM_SHA_256 => ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256), - SCRAM_SHA_256_PLUS => { - ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256Plus) - } + SCRAM_SHA_256 => ctx.set_auth_method(crate::context::AuthMethod::ScramSha256), + SCRAM_SHA_256_PLUS => ctx.set_auth_method(crate::context::AuthMethod::ScramSha256Plus), _ => {} } info!("client chooses {}", sasl.method); diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index d7a3eb9a4d..1038fa5116 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -205,7 +205,7 @@ async fn task_main( const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; async fn ssl_handshake( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, raw_stream: S, tls_config: Arc, tls_server_end_point: TlsServerEndPoint, @@ -256,13 +256,13 @@ async fn ssl_handshake( } async fn handle_client( - mut ctx: RequestMonitoring, + ctx: RequestMonitoring, dest_suffix: Arc, tls_config: Arc, tls_server_end_point: TlsServerEndPoint, stream: impl AsyncRead + AsyncWrite + Unpin, ) -> anyhow::Result<()> { - let mut tls_stream = ssl_handshake(&mut ctx, stream, tls_config, tls_server_end_point).await?; + let mut tls_stream = ssl_handshake(&ctx, stream, tls_config, tls_server_end_point).await?; // Cut off first part of the SNI domain // We receive required destination details in the format of diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index c1fd6dfd80..b44e0ddd2f 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -5,6 +5,7 @@ use aws_config::meta::region::RegionProviderChain; use aws_config::profile::ProfileFileCredentialsProvider; use aws_config::provider_config::ProviderConfig; use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; +use 
aws_config::Region; use futures::future::Either; use proxy::auth; use proxy::auth::backend::AuthRateLimiter; @@ -290,9 +291,10 @@ async fn main() -> anyhow::Result<()> { let config = build_config(&args)?; info!("Authentication backend: {}", config.auth_backend); - info!("Using region: {}", config.aws_region); + info!("Using region: {}", args.aws_region); - let region_provider = RegionProviderChain::default_provider().or_else(&*config.aws_region); // Replace with your Redis region if needed + let region_provider = + RegionProviderChain::default_provider().or_else(Region::new(args.aws_region.clone())); let provider_conf = ProviderConfig::without_region().with_region(region_provider.region().await); let aws_credentials_provider = { @@ -318,7 +320,7 @@ async fn main() -> anyhow::Result<()> { }; let elasticache_credentials_provider = Arc::new(elasticache::CredentialsProvider::new( elasticache::AWSIRSAConfig::new( - config.aws_region.clone(), + args.aws_region.clone(), args.redis_cluster_name, args.redis_user_id, ), @@ -376,11 +378,14 @@ async fn main() -> anyhow::Result<()> { let cancel_map = CancelMap::default(); + let redis_rps_limit = Vec::leak(args.redis_rps_limit.clone()); + RateBucketInfo::validate(redis_rps_limit)?; + let redis_publisher = match &regional_redis_client { Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new( redis_publisher.clone(), args.region.clone(), - &config.redis_rps_limit, + redis_rps_limit, )?))), None => None, }; @@ -656,7 +661,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { )?; let http_config = HttpConfig { - request_timeout: args.sql_over_http.sql_over_http_timeout, pool_options: GlobalConnPoolOptions { max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint, gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch, @@ -676,9 +680,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { rate_limit_ip_subnet:
args.auth_rate_limit_ip_subnet, }; - let mut redis_rps_limit = args.redis_rps_limit.clone(); - RateBucketInfo::validate(&mut redis_rps_limit)?; - let config = Box::leak(Box::new(ProxyConfig { tls_config, auth_backend, @@ -687,11 +688,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { http_config, authentication_config, require_client_ip: args.require_client_ip, - disable_ip_check_for_http: args.disable_ip_check_for_http, - redis_rps_limit, handshake_timeout: args.handshake_timeout, region: args.region.clone(), - aws_region: args.aws_region.clone(), wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?, connect_compute_locks, connect_to_compute_retry_config: config::RetryConfig::parse( diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 4bc10a6020..8c851790c2 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -68,7 +68,7 @@ impl EndpointsCache { ready: AtomicBool::new(false), } } - pub async fn is_valid(&self, ctx: &mut RequestMonitoring, endpoint: &EndpointId) -> bool { + pub async fn is_valid(&self, ctx: &RequestMonitoring, endpoint: &EndpointId) -> bool { if !self.ready.load(Ordering::Acquire) { return true; } diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index f91693c704..18c82fe379 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -103,8 +103,12 @@ impl ConnCfg { /// Reuse password or auth keys from the other config. pub fn reuse_password(&mut self, other: Self) { - if let Some(password) = other.get_auth() { - self.auth(password); + if let Some(password) = other.get_password() { + self.password(password); + } + + if let Some(keys) = other.get_auth_keys() { + self.auth_keys(keys); } } @@ -120,64 +124,48 @@ impl ConnCfg { /// Apply startup message params to the connection config. 
pub fn set_startup_params(&mut self, params: &StartupMessageParams) { - let mut client_encoding = false; - for (k, v) in params.iter() { - match k { - "user" => { - // Only set `user` if it's not present in the config. - // Link auth flow takes username from the console's response. - if self.get_user().is_none() { - self.user(v); - } + // Only set `user` if it's not present in the config. + // Link auth flow takes username from the console's response. + if let (None, Some(user)) = (self.get_user(), params.get("user")) { + self.user(user); + } + + // Only set `dbname` if it's not present in the config. + // Link auth flow takes dbname from the console's response. + if let (None, Some(dbname)) = (self.get_dbname(), params.get("database")) { + self.dbname(dbname); + } + + // Don't add `options` if they were only used for specifying a project. + // Connection pools don't support `options`, because they affect backend startup. + if let Some(options) = filtered_options(params) { + self.options(&options); + } + + if let Some(app_name) = params.get("application_name") { + self.application_name(app_name); + } + + // TODO: This is especially ugly... + if let Some(replication) = params.get("replication") { + use tokio_postgres::config::ReplicationMode; + match replication { + "true" | "on" | "yes" | "1" => { + self.replication_mode(ReplicationMode::Physical); } "database" => { - // Only set `dbname` if it's not present in the config. - // Link auth flow takes dbname from the console's response. - if self.get_dbname().is_none() { - self.dbname(v); - } - } - "options" => { - // Don't add `options` if they were only used for specifying a project. - // Connection pools don't support `options`, because they affect backend startup. 
- if let Some(options) = filtered_options(v) { - self.options(&options); - } - } - - // the special ones in tokio-postgres that we don't want being set by the user - "dbname" => {} - "password" => {} - "sslmode" => {} - "host" => {} - "port" => {} - "connect_timeout" => {} - "keepalives" => {} - "keepalives_idle" => {} - "keepalives_interval" => {} - "keepalives_retries" => {} - "target_session_attrs" => {} - "channel_binding" => {} - "max_backend_message_size" => {} - - "client_encoding" => { - client_encoding = true; - // only error should be from bad null bytes, - // but we've already checked for those. - _ = self.param("client_encoding", v); - } - - _ => { - // only error should be from bad null bytes, - // but we've already checked for those. - _ = self.param(k, v); + self.replication_mode(ReplicationMode::Logical); } + _other => {} } } - if !client_encoding { - // for compatibility since we removed it from tokio-postgres - self.param("client_encoding", "UTF8").unwrap(); - } + + // TODO: extend the list of the forwarded startup parameters. + // Currently, tokio-postgres doesn't allow us to pass + // arbitrary parameters, but the ones above are a good start. + // + // This and the reverse params problem can be better addressed + // in a bespoke connection machinery (a new library for that sake). } } @@ -288,12 +276,12 @@ impl ConnCfg { /// Connect to a corresponding compute node. 
pub async fn connect( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, allow_self_signed_compute: bool, aux: MetricsAuxInfo, timeout: Duration, ) -> Result { - let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let (socket_addr, stream, host) = self.connect_raw(timeout).await?; drop(pause); @@ -316,14 +304,14 @@ impl ConnCfg { )?; // connect_raw() will not use TLS if sslmode is "disable" - let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let (client, connection) = self.0.connect_raw(stream, tls).await?; drop(pause); tracing::Span::current().record("pid", tracing::field::display(client.get_process_id())); let stream = connection.stream.into_inner(); info!( - cold_start_info = ctx.cold_start_info.as_str(), + cold_start_info = ctx.cold_start_info().as_str(), "connected to compute node at {host} ({socket_addr}) sslmode={:?}", self.0.get_ssl_mode() ); @@ -342,7 +330,7 @@ impl ConnCfg { params, cancel_closure, aux, - _guage: Metrics::get().proxy.db_connections.guard(ctx.protocol), + _guage: Metrics::get().proxy.db_connections.guard(ctx.protocol()), }; Ok(connection) @@ -350,9 +338,10 @@ impl ConnCfg { } /// Retrieve `options` from a startup message, dropping all proxy-secific flags. -fn filtered_options(options: &str) -> Option { +fn filtered_options(params: &StartupMessageParams) -> Option { #[allow(unstable_name_collisions)] - let options: String = StartupMessageParams::parse_options_raw(options) + let options: String = params + .options_raw()? .filter(|opt| parse_endpoint_param(opt).is_none() && neon_option(opt).is_none()) .intersperse(" ") // TODO: use impl from std once it's stabilized .collect(); @@ -424,23 +413,27 @@ mod tests { #[test] fn test_filtered_options() { // Empty options is unlikely to be useful anyway. 
- assert_eq!(filtered_options(""), None); + let params = StartupMessageParams::new([("options", "")]); + assert_eq!(filtered_options(&params), None); // It's likely that clients will only use options to specify endpoint/project. - let params = "project=foo"; - assert_eq!(filtered_options(params), None); + let params = StartupMessageParams::new([("options", "project=foo")]); + assert_eq!(filtered_options(&params), None); // Same, because unescaped whitespaces are no-op. - let params = " project=foo "; - assert_eq!(filtered_options(params), None); + let params = StartupMessageParams::new([("options", " project=foo ")]); + assert_eq!(filtered_options(&params).as_deref(), None); - let params = r"\ project=foo \ "; - assert_eq!(filtered_options(params).as_deref(), Some(r"\ \ ")); + let params = StartupMessageParams::new([("options", r"\ project=foo \ ")]); + assert_eq!(filtered_options(&params).as_deref(), Some(r"\ \ ")); - let params = "project = foo"; - assert_eq!(filtered_options(params).as_deref(), Some("project = foo")); + let params = StartupMessageParams::new([("options", "project = foo")]); + assert_eq!(filtered_options(&params).as_deref(), Some("project = foo")); - let params = "project = foo neon_endpoint_type:read_write neon_lsn:0/2"; - assert_eq!(filtered_options(params).as_deref(), Some("project = foo")); + let params = StartupMessageParams::new([( + "options", + "project = foo neon_endpoint_type:read_write neon_lsn:0/2", + )]); + assert_eq!(filtered_options(&params).as_deref(), Some("project = foo")); } } diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 6504919760..1412095505 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -31,11 +31,8 @@ pub struct ProxyConfig { pub http_config: HttpConfig, pub authentication_config: AuthenticationConfig, pub require_client_ip: bool, - pub disable_ip_check_for_http: bool, - pub redis_rps_limit: Vec, pub region: String, pub handshake_timeout: Duration, - pub aws_region: String, pub wake_compute_retry_config: RetryConfig,
pub connect_compute_locks: ApiLocks, pub connect_to_compute_retry_config: RetryConfig, @@ -55,7 +52,6 @@ pub struct TlsConfig { } pub struct HttpConfig { - pub request_timeout: tokio::time::Duration, pub pool_options: GlobalConnPoolOptions, pub cancel_set: CancelSet, pub client_conn_threshold: u64, diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 7a9637066f..15fc0134b3 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -292,7 +292,7 @@ pub struct NodeInfo { impl NodeInfo { pub async fn connect( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, timeout: Duration, ) -> Result { self.config @@ -330,20 +330,20 @@ pub(crate) trait Api { /// We still have to mock the scram to avoid leaking information that user doesn't exist. async fn get_role_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result; async fn get_allowed_ips_and_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError>; /// Wake up the compute node and return the corresponding connection info. 
async fn wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result; } @@ -363,7 +363,7 @@ pub enum ConsoleBackend { impl Api for ConsoleBackend { async fn get_role_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { use ConsoleBackend::*; @@ -378,7 +378,7 @@ impl Api for ConsoleBackend { async fn get_allowed_ips_and_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError> { use ConsoleBackend::*; @@ -393,7 +393,7 @@ impl Api for ConsoleBackend { async fn wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { use ConsoleBackend::*; diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index cfe491f2aa..2093da7562 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -158,7 +158,7 @@ impl super::Api for Api { #[tracing::instrument(skip_all)] async fn get_role_secret( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { Ok(CachedRoleSecret::new_uncached( @@ -168,7 +168,7 @@ impl super::Api for Api { async fn get_allowed_ips_and_secret( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { Ok(( @@ -182,7 +182,7 @@ impl super::Api for Api { #[tracing::instrument(skip_all)] async fn wake_compute( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, _user_info: &ComputeUserInfo, ) -> Result { self.do_wake_compute().map_ok(Cached::new_uncached).await diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 768cd2fdfa..7eda238b66 100644 --- a/proxy/src/console/provider/neon.rs +++ 
b/proxy/src/console/provider/neon.rs @@ -57,7 +57,7 @@ impl Api { async fn do_get_auth_info( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { if !self @@ -69,7 +69,7 @@ impl Api { info!("endpoint is not valid, skipping the request"); return Ok(AuthInfo::default()); } - let request_id = ctx.session_id.to_string(); + let request_id = ctx.session_id().to_string(); let application_name = ctx.console_application_name(); async { let request = self @@ -77,7 +77,7 @@ impl Api { .get("proxy_get_role_secret") .header("X-Request-ID", &request_id) .header("Authorization", format!("Bearer {}", &self.jwt)) - .query(&[("session_id", ctx.session_id)]) + .query(&[("session_id", ctx.session_id())]) .query(&[ ("application_name", application_name.as_str()), ("project", user_info.endpoint.as_str()), @@ -87,7 +87,7 @@ impl Api { info!(url = request.url().as_str(), "sending http request"); let start = Instant::now(); - let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Cplane); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); let response = self.endpoint.execute(request).await?; drop(pause); info!(duration = ?start.elapsed(), "received http response"); @@ -130,10 +130,10 @@ impl Api { async fn do_wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { - let request_id = ctx.session_id.to_string(); + let request_id = ctx.session_id().to_string(); let application_name = ctx.console_application_name(); async { let mut request_builder = self @@ -141,7 +141,7 @@ impl Api { .get("proxy_wake_compute") .header("X-Request-ID", &request_id) .header("Authorization", format!("Bearer {}", &self.jwt)) - .query(&[("session_id", ctx.session_id)]) + .query(&[("session_id", ctx.session_id())]) .query(&[ ("application_name", application_name.as_str()), ("project", user_info.endpoint.as_str()), @@ -156,7 +156,7 @@ impl Api { info!(url = 
request.url().as_str(), "sending http request"); let start = Instant::now(); - let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Cplane); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); let response = self.endpoint.execute(request).await?; drop(pause); info!(duration = ?start.elapsed(), "received http response"); @@ -192,7 +192,7 @@ impl super::Api for Api { #[tracing::instrument(skip_all)] async fn get_role_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { let normalized_ep = &user_info.endpoint.normalize(); @@ -226,7 +226,7 @@ impl super::Api for Api { async fn get_allowed_ips_and_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { let normalized_ep = &user_info.endpoint.normalize(); @@ -268,7 +268,7 @@ impl super::Api for Api { #[tracing::instrument(skip_all)] async fn wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { let key = user_info.endpoint_cache_key(); diff --git a/proxy/src/context.rs b/proxy/src/context.rs index ff79ba8275..e925f67233 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -7,13 +7,14 @@ use smol_str::SmolStr; use std::net::IpAddr; use tokio::sync::mpsc; use tracing::{field::display, info, info_span, Span}; +use try_lock::TryLock; use uuid::Uuid; use crate::{ console::messages::{ColdStartInfo, MetricsAuxInfo}, error::ErrorKind, intern::{BranchIdInt, ProjectIdInt}, - metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol}, + metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting}, DbName, EndpointId, RoleName, }; @@ -28,7 +29,15 @@ pub static LOG_CHAN_DISCONNECT: OnceCell> /// /// This data should **not** be used for connection logic, only for observability and limiting purposes. 
/// All connection logic should instead use strongly typed state machines, not a bunch of Options. -pub struct RequestMonitoring { +pub struct RequestMonitoring( + /// To allow easier use of the ctx object, we have interior mutability. + /// I would typically use a RefCell but that would break the `Send` requirements + /// so we need something with thread-safety. `TryLock` is a cheap alternative + /// that offers similar semantics to a `RefCell` but with synchronisation. + TryLock, +); + +struct RequestMonitoringInner { pub peer_addr: IpAddr, pub session_id: Uuid, pub protocol: Protocol, @@ -85,7 +94,7 @@ impl RequestMonitoring { role = tracing::field::Empty, ); - Self { + let inner = RequestMonitoringInner { peer_addr, session_id, protocol, @@ -110,7 +119,9 @@ impl RequestMonitoring { disconnect_sender: LOG_CHAN_DISCONNECT.get().and_then(|tx| tx.upgrade()), latency_timer: LatencyTimer::new(protocol), disconnect_timestamp: None, - } + }; + + Self(TryLock::new(inner)) } #[cfg(test)] @@ -119,48 +130,177 @@ impl RequestMonitoring { } pub fn console_application_name(&self) -> String { + let this = self.0.try_lock().expect("should not deadlock"); format!( "{}/{}", - self.application.as_deref().unwrap_or_default(), - self.protocol + this.application.as_deref().unwrap_or_default(), + this.protocol ) } - pub fn set_rejected(&mut self, rejected: bool) { - self.rejected = Some(rejected); + pub fn set_rejected(&self, rejected: bool) { + let mut this = self.0.try_lock().expect("should not deadlock"); + this.rejected = Some(rejected); } - pub fn set_cold_start_info(&mut self, info: ColdStartInfo) { + pub fn set_cold_start_info(&self, info: ColdStartInfo) { + self.0 + .try_lock() + .expect("should not deadlock") + .set_cold_start_info(info); + } + + pub fn set_db_options(&self, options: StartupMessageParams) { + let mut this = self.0.try_lock().expect("should not deadlock"); + this.set_application(options.get("application_name").map(SmolStr::from)); + if let Some(user) = 
options.get("user") { + this.set_user(user.into()); + } + if let Some(dbname) = options.get("database") { + this.set_dbname(dbname.into()); + } + + this.pg_options = Some(options); + } + + pub fn set_project(&self, x: MetricsAuxInfo) { + let mut this = self.0.try_lock().expect("should not deadlock"); + if this.endpoint_id.is_none() { + this.set_endpoint_id(x.endpoint_id.as_str().into()) + } + this.branch = Some(x.branch_id); + this.project = Some(x.project_id); + this.set_cold_start_info(x.cold_start_info); + } + + pub fn set_project_id(&self, project_id: ProjectIdInt) { + let mut this = self.0.try_lock().expect("should not deadlock"); + this.project = Some(project_id); + } + + pub fn set_endpoint_id(&self, endpoint_id: EndpointId) { + self.0 + .try_lock() + .expect("should not deadlock") + .set_endpoint_id(endpoint_id); + } + + pub fn set_dbname(&self, dbname: DbName) { + self.0 + .try_lock() + .expect("should not deadlock") + .set_dbname(dbname); + } + + pub fn set_user(&self, user: RoleName) { + self.0 + .try_lock() + .expect("should not deadlock") + .set_user(user); + } + + pub fn set_auth_method(&self, auth_method: AuthMethod) { + let mut this = self.0.try_lock().expect("should not deadlock"); + this.auth_method = Some(auth_method); + } + + pub fn has_private_peer_addr(&self) -> bool { + self.0 + .try_lock() + .expect("should not deadlock") + .has_private_peer_addr() + } + + pub fn set_error_kind(&self, kind: ErrorKind) { + let mut this = self.0.try_lock().expect("should not deadlock"); + // Do not record errors from the private address to metrics. 
+ if !this.has_private_peer_addr() { + Metrics::get().proxy.errors_total.inc(kind); + } + if let Some(ep) = &this.endpoint_id { + let metric = &Metrics::get().proxy.endpoints_affected_by_errors; + let label = metric.with_labels(kind); + metric.get_metric(label).measure(ep); + } + this.error_kind = Some(kind); + } + + pub fn set_success(&self) { + let mut this = self.0.try_lock().expect("should not deadlock"); + this.success = true; + } + + pub fn log_connect(&self) { + self.0 + .try_lock() + .expect("should not deadlock") + .log_connect(); + } + + pub fn protocol(&self) -> Protocol { + self.0.try_lock().expect("should not deadlock").protocol + } + + pub fn span(&self) -> Span { + self.0.try_lock().expect("should not deadlock").span.clone() + } + + pub fn session_id(&self) -> Uuid { + self.0.try_lock().expect("should not deadlock").session_id + } + + pub fn peer_addr(&self) -> IpAddr { + self.0.try_lock().expect("should not deadlock").peer_addr + } + + pub fn cold_start_info(&self) -> ColdStartInfo { + self.0 + .try_lock() + .expect("should not deadlock") + .cold_start_info + } + + pub fn latency_timer_pause(&self, waiting_for: Waiting) -> LatencyTimerPause { + LatencyTimerPause { + ctx: self, + start: tokio::time::Instant::now(), + waiting_for, + } + } + + pub fn success(&self) { + self.0 + .try_lock() + .expect("should not deadlock") + .latency_timer + .success() + } +} + +pub struct LatencyTimerPause<'a> { + ctx: &'a RequestMonitoring, + start: tokio::time::Instant, + waiting_for: Waiting, +} + +impl Drop for LatencyTimerPause<'_> { + fn drop(&mut self) { + self.ctx + .0 + .try_lock() + .expect("should not deadlock") + .latency_timer + .unpause(self.start, self.waiting_for); + } +} + +impl RequestMonitoringInner { + fn set_cold_start_info(&mut self, info: ColdStartInfo) { self.cold_start_info = info; self.latency_timer.cold_start_info(info); } - pub fn set_db_options(&mut self, options: StartupMessageParams) { - 
self.set_application(options.get("application_name").map(SmolStr::from)); - if let Some(user) = options.get("user") { - self.set_user(user.into()); - } - if let Some(dbname) = options.get("database") { - self.set_dbname(dbname.into()); - } - - self.pg_options = Some(options); - } - - pub fn set_project(&mut self, x: MetricsAuxInfo) { - if self.endpoint_id.is_none() { - self.set_endpoint_id(x.endpoint_id.as_str().into()) - } - self.branch = Some(x.branch_id); - self.project = Some(x.project_id); - self.set_cold_start_info(x.cold_start_info); - } - - pub fn set_project_id(&mut self, project_id: ProjectIdInt) { - self.project = Some(project_id); - } - - pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) { + fn set_endpoint_id(&mut self, endpoint_id: EndpointId) { if self.endpoint_id.is_none() { self.span.record("ep", display(&endpoint_id)); let metric = &Metrics::get().proxy.connecting_endpoints; @@ -176,44 +316,23 @@ impl RequestMonitoring { } } - pub fn set_dbname(&mut self, dbname: DbName) { + fn set_dbname(&mut self, dbname: DbName) { self.dbname = Some(dbname); } - pub fn set_user(&mut self, user: RoleName) { + fn set_user(&mut self, user: RoleName) { self.span.record("role", display(&user)); self.user = Some(user); } - pub fn set_auth_method(&mut self, auth_method: AuthMethod) { - self.auth_method = Some(auth_method); - } - - pub fn has_private_peer_addr(&self) -> bool { + fn has_private_peer_addr(&self) -> bool { match self.peer_addr { IpAddr::V4(ip) => ip.is_private(), _ => false, } } - pub fn set_error_kind(&mut self, kind: ErrorKind) { - // Do not record errors from the private address to metrics. 
- if !self.has_private_peer_addr() { - Metrics::get().proxy.errors_total.inc(kind); - } - if let Some(ep) = &self.endpoint_id { - let metric = &Metrics::get().proxy.endpoints_affected_by_errors; - let label = metric.with_labels(kind); - metric.get_metric(label).measure(ep); - } - self.error_kind = Some(kind); - } - - pub fn set_success(&mut self) { - self.success = true; - } - - pub fn log_connect(&mut self) { + fn log_connect(&mut self) { let outcome = if self.success { ConnectOutcome::Success } else { @@ -256,7 +375,7 @@ impl RequestMonitoring { } } -impl Drop for RequestMonitoring { +impl Drop for RequestMonitoringInner { fn drop(&mut self) { if self.sender.is_some() { self.log_connect(); diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 543a458274..bb02a476fc 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -23,7 +23,7 @@ use utils::backoff; use crate::{config::remote_storage_from_toml, context::LOG_CHAN_DISCONNECT}; -use super::{RequestMonitoring, LOG_CHAN}; +use super::{RequestMonitoringInner, LOG_CHAN}; #[derive(clap::Args, Clone, Debug)] pub struct ParquetUploadArgs { @@ -118,8 +118,8 @@ impl<'a> serde::Serialize for Options<'a> { } } -impl From<&RequestMonitoring> for RequestData { - fn from(value: &RequestMonitoring) -> Self { +impl From<&RequestMonitoringInner> for RequestData { + fn from(value: &RequestMonitoringInner) -> Self { Self { session_id: value.session_id, peer_addr: value.peer_addr.to_string(), diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index db25ac0311..0167553e30 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -370,6 +370,7 @@ pub struct CancellationRequest { pub kind: CancellationOutcome, } +#[derive(Clone, Copy)] pub enum Waiting { Cplane, Client, @@ -398,12 +399,6 @@ pub struct LatencyTimer { outcome: ConnectOutcome, } -pub struct LatencyTimerPause<'a> { - timer: &'a mut LatencyTimer, - start: time::Instant, - waiting_for: Waiting, -} - impl 
LatencyTimer { pub fn new(protocol: Protocol) -> Self { Self { @@ -417,11 +412,13 @@ impl LatencyTimer { } } - pub fn pause(&mut self, waiting_for: Waiting) -> LatencyTimerPause<'_> { - LatencyTimerPause { - timer: self, - start: Instant::now(), - waiting_for, + pub fn unpause(&mut self, start: Instant, waiting_for: Waiting) { + let dur = start.elapsed(); + match waiting_for { + Waiting::Cplane => self.accumulated.cplane += dur, + Waiting::Client => self.accumulated.client += dur, + Waiting::Compute => self.accumulated.compute += dur, + Waiting::RetryTimeout => self.accumulated.retry += dur, } } @@ -438,18 +435,6 @@ impl LatencyTimer { } } -impl Drop for LatencyTimerPause<'_> { - fn drop(&mut self) { - let dur = self.start.elapsed(); - match self.waiting_for { - Waiting::Cplane => self.timer.accumulated.cplane += dur, - Waiting::Client => self.timer.accumulated.client += dur, - Waiting::Compute => self.timer.accumulated.compute += dur, - Waiting::RetryTimeout => self.timer.accumulated.retry += dur, - } - } -} - #[derive(FixedCardinalityLabel, Clone, Copy, Debug)] pub enum ConnectOutcome { Success, diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 3edefcf21a..2182f38fe7 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -113,18 +113,18 @@ pub async fn task_main( } }; - let mut ctx = RequestMonitoring::new( + let ctx = RequestMonitoring::new( session_id, peer_addr, crate::metrics::Protocol::Tcp, &config.region, ); - let span = ctx.span.clone(); + let span = ctx.span(); let startup = Box::pin( handle_client( config, - &mut ctx, + &ctx, cancellation_handler, socket, ClientMode::Tcp, @@ -240,7 +240,7 @@ impl ReportableError for ClientRequestError { pub async fn handle_client( config: &'static ProxyConfig, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, cancellation_handler: Arc, stream: S, mode: ClientMode, @@ -248,25 +248,25 @@ pub async fn handle_client( conn_gauge: NumClientConnectionsGuard<'static>, ) -> Result>, ClientRequestError> { 
info!( - protocol = %ctx.protocol, + protocol = %ctx.protocol(), "handling interactive connection from client" ); let metrics = &Metrics::get().proxy; - let proto = ctx.protocol; + let proto = ctx.protocol(); let _request_gauge = metrics.connection_requests.guard(proto); let tls = config.tls_config.as_ref(); let record_handshake_error = !ctx.has_private_peer_addr(); - let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Client); - let do_handshake = handshake(stream, mode.handshake_tls(tls), record_handshake_error); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client); + let do_handshake = handshake(ctx, stream, mode.handshake_tls(tls), record_handshake_error); let (mut stream, params) = match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? { HandshakeData::Startup(stream, params) => (stream, params), HandshakeData::Cancel(cancel_key_data) => { return Ok(cancellation_handler - .cancel_session(cancel_key_data, ctx.session_id) + .cancel_session(cancel_key_data, ctx.session_id()) .await .map(|()| None)?) 
} diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 82180aaee3..f38e43ba5a 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -46,7 +46,7 @@ pub trait ConnectMechanism { type Error: From; async fn connect_once( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, node_info: &console::CachedNodeInfo, timeout: time::Duration, ) -> Result; @@ -58,7 +58,7 @@ pub trait ConnectMechanism { pub trait ComputeConnectBackend { async fn wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, ) -> Result; fn get_keys(&self) -> Option<&ComputeCredentialKeys>; @@ -81,7 +81,7 @@ impl ConnectMechanism for TcpMechanism<'_> { #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] async fn connect_once( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, node_info: &console::CachedNodeInfo, timeout: time::Duration, ) -> Result { @@ -98,7 +98,7 @@ impl ConnectMechanism for TcpMechanism<'_> { /// Try to connect to the compute node, retrying if necessary. 
#[tracing::instrument(skip_all)] pub async fn connect_to_compute( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, mechanism: &M, user_info: &B, allow_self_signed_compute: bool, @@ -126,7 +126,7 @@ where .await { Ok(res) => { - ctx.latency_timer.success(); + ctx.success(); Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { outcome: ConnectOutcome::Success, @@ -178,7 +178,7 @@ where .await { Ok(res) => { - ctx.latency_timer.success(); + ctx.success(); Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { outcome: ConnectOutcome::Success, @@ -209,9 +209,7 @@ where let wait_duration = retry_after(num_retries, connect_to_compute_retry_config); num_retries += 1; - let pause = ctx - .latency_timer - .pause(crate::metrics::Waiting::RetryTimeout); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::RetryTimeout); time::sleep(wait_duration).await; drop(pause); } diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index d488aea927..c65a5558d9 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -10,6 +10,7 @@ use tracing::{info, warn}; use crate::{ auth::endpoint_sni, config::{TlsConfig, PG_ALPN_PROTOCOL}, + context::RequestMonitoring, error::ReportableError, metrics::Metrics, proxy::ERR_INSECURE_CONNECTION, @@ -67,6 +68,7 @@ pub enum HandshakeData { /// we also take an extra care of propagating only the select handshake errors to client. 
#[tracing::instrument(skip_all)] pub async fn handshake( + ctx: &RequestMonitoring, stream: S, mut tls: Option<&TlsConfig>, record_handshake_error: bool, @@ -80,8 +82,6 @@ pub async fn handshake( let mut stream = PqStream::new(Stream::from_raw(stream)); loop { let msg = stream.read_startup_packet().await?; - info!("received {msg:?}"); - use FeStartupPacket::*; match msg { SslRequest { direct } => match stream.get_ref() { @@ -145,16 +145,20 @@ pub async fn handshake( let conn_info = tls_stream.get_ref().1; + // try parse endpoint + let ep = conn_info + .server_name() + .and_then(|sni| endpoint_sni(sni, &tls.common_names).ok().flatten()); + if let Some(ep) = ep { + ctx.set_endpoint_id(ep); + } + // check the ALPN, if exists, as required. match conn_info.alpn_protocol() { None | Some(PG_ALPN_PROTOCOL) => {} Some(other) => { - // try parse ep for better error - let ep = conn_info.server_name().and_then(|sni| { - endpoint_sni(sni, &tls.common_names).ok().flatten() - }); let alpn = String::from_utf8_lossy(other); - warn!(?ep, %alpn, "unexpected ALPN"); + warn!(%alpn, "unexpected ALPN"); return Err(HandshakeError::ProtocolViolation); } } @@ -198,7 +202,12 @@ pub async fn handshake( .await?; } - info!(?version, session_type = "normal", "successful handshake"); + info!( + ?version, + ?params, + session_type = "normal", + "successful handshake" + ); break Ok(HandshakeData::Startup(stream, params)); } // downgrade protocol version diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 5186a9e1b0..d8308c4f2a 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -155,7 +155,7 @@ impl TestAuth for Scram { stream: &mut PqStream>, ) -> anyhow::Result<()> { let outcome = auth::AuthFlow::new(stream) - .begin(auth::Scram(&self.0, &mut RequestMonitoring::test())) + .begin(auth::Scram(&self.0, &RequestMonitoring::test())) .await? 
.authenticate() .await?; @@ -175,10 +175,11 @@ async fn dummy_proxy( auth: impl TestAuth + Send, ) -> anyhow::Result<()> { let (client, _) = read_proxy_protocol(client).await?; - let mut stream = match handshake(client, tls.as_ref(), false).await? { - HandshakeData::Startup(stream, _) => stream, - HandshakeData::Cancel(_) => bail!("cancellation not supported"), - }; + let mut stream = + match handshake(&RequestMonitoring::test(), client, tls.as_ref(), false).await? { + HandshakeData::Startup(stream, _) => stream, + HandshakeData::Cancel(_) => bail!("cancellation not supported"), + }; auth.authenticate(&mut stream).await?; @@ -457,7 +458,7 @@ impl ConnectMechanism for TestConnectMechanism { async fn connect_once( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, _node_info: &console::CachedNodeInfo, _timeout: std::time::Duration, ) -> Result { @@ -565,7 +566,7 @@ fn helper_create_connect_info( async fn connect_to_compute_success() { let _ = env_logger::try_init(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -573,7 +574,7 @@ async fn connect_to_compute_success() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -583,7 +584,7 @@ async fn connect_to_compute_success() { async fn connect_to_compute_retry() { let _ = env_logger::try_init(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -591,7 +592,7 @@ async fn connect_to_compute_retry() { 
max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -602,7 +603,7 @@ async fn connect_to_compute_retry() { async fn connect_to_compute_non_retry_1() { let _ = env_logger::try_init(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Fail]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -610,7 +611,7 @@ async fn connect_to_compute_non_retry_1() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) .await .unwrap_err(); mechanism.verify(); @@ -621,7 +622,7 @@ async fn connect_to_compute_non_retry_1() { async fn connect_to_compute_non_retry_2() { let _ = env_logger::try_init(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Fail, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -629,7 +630,7 @@ async fn connect_to_compute_non_retry_2() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -641,7 +642,7 @@ async fn connect_to_compute_non_retry_3() { let _ = env_logger::try_init(); tokio::time::pause(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Retry, Retry, Retry, Retry, Retry]); let user_info = 
helper_create_connect_info(&mechanism); @@ -656,7 +657,7 @@ async fn connect_to_compute_non_retry_3() { backoff_factor: 2.0, }; connect_to_compute( - &mut ctx, + &ctx, &mechanism, &user_info, false, @@ -673,7 +674,7 @@ async fn connect_to_compute_non_retry_3() { async fn wake_retry() { let _ = env_logger::try_init(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![WakeRetry, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -681,7 +682,7 @@ async fn wake_retry() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -692,7 +693,7 @@ async fn wake_retry() { async fn wake_non_retry() { let _ = env_logger::try_init(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![WakeRetry, WakeFail]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -700,7 +701,7 @@ async fn wake_non_retry() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) .await .unwrap_err(); mechanism.verify(); diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index d96dd0947b..c8ec2b2db6 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -34,9 +34,14 @@ async fn proxy_mitm( tokio::spawn(async move { // begin handshake with end_server let end_server = connect_tls(server2, client_config2.make_tls_connect().unwrap()).await; - let (end_client, startup) = match handshake(client1, Some(&server_config1), false) - .await - .unwrap() + let (end_client, 
startup) = match handshake( + &RequestMonitoring::test(), + client1, + Some(&server_config1), + false, + ) + .await + .unwrap() { HandshakeData::Startup(stream, params) => (stream, params), HandshakeData::Cancel(_) => panic!("cancellation not supported"), diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index fef349aac0..5b06e8f054 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -14,7 +14,7 @@ use super::connect_compute::ComputeConnectBackend; pub async fn wake_compute( num_retries: &mut u32, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, api: &B, config: RetryConfig, ) -> Result { @@ -52,9 +52,7 @@ pub async fn wake_compute( let wait_duration = retry_after(*num_retries, config); *num_retries += 1; - let pause = ctx - .latency_timer - .pause(crate::metrics::Waiting::RetryTimeout); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::RetryTimeout); tokio::time::sleep(wait_duration).await; drop(pause); } diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index efa999ed7d..115bef7375 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -334,7 +334,7 @@ async fn request_handler( &config.region, ); - let span = ctx.span.clone(); + let span = ctx.span(); info!(parent: &span, "performing websocket upgrade"); let (response, websocket) = framed_websockets::upgrade::upgrade(&mut request) @@ -367,7 +367,7 @@ async fn request_handler( crate::metrics::Protocol::Http, &config.region, ); - let span = ctx.span.clone(); + let span = ctx.span(); sql_over_http::handle(config, ctx, request, backend, http_cancellation_token) .instrument(span) diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 3b86c1838c..295ea1a1c7 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -35,15 +35,15 @@ pub struct PoolingBackend { impl PoolingBackend { pub async fn authenticate( &self, - ctx: &mut RequestMonitoring, + ctx: 
&RequestMonitoring, config: &AuthenticationConfig, conn_info: &ConnInfo, ) -> Result { let user_info = conn_info.user_info.clone(); let backend = self.config.auth_backend.as_ref().map(|_| user_info.clone()); let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?; - if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) { - return Err(AuthError::ip_address_not_allowed(ctx.peer_addr)); + if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) { + return Err(AuthError::ip_address_not_allowed(ctx.peer_addr())); } if !self .endpoint_rate_limiter @@ -100,7 +100,7 @@ impl PoolingBackend { #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] pub async fn connect_to_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, conn_info: ConnInfo, keys: ComputeCredentials, force_new: bool, @@ -222,7 +222,7 @@ impl ConnectMechanism for TokioMechanism { async fn connect_once( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, node_info: &CachedNodeInfo, timeout: Duration, ) -> Result { @@ -236,11 +236,7 @@ impl ConnectMechanism for TokioMechanism { .dbname(&self.conn_info.dbname) .connect_timeout(timeout); - config - .param("client_encoding", "UTF8") - .expect("client encoding UTF8 is always valid"); - - let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let res = config.connect(tokio_postgres::NoTls).await; drop(pause); let (client, connection) = permit.release_result(res)?; diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index dbc58d48ec..e1dc44dc1c 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -377,7 +377,7 @@ impl GlobalConnPool { pub fn get( self: &Arc, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, conn_info: &ConnInfo, ) -> Result>, HttpConnError> { let mut client: Option> = None; @@ -409,9 +409,9 @@ impl 
GlobalConnPool { cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), "pool: reusing connection '{conn_info}'" ); - client.session.send(ctx.session_id)?; + client.session.send(ctx.session_id())?; ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); - ctx.latency_timer.success(); + ctx.success(); return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool))); } } @@ -465,19 +465,19 @@ impl GlobalConnPool { pub fn poll_client( global_pool: Arc>, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, conn_info: ConnInfo, client: C, mut connection: tokio_postgres::Connection, conn_id: uuid::Uuid, aux: MetricsAuxInfo, ) -> Client { - let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol); - let mut session_id = ctx.session_id; + let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol()); + let mut session_id = ctx.session_id(); let (tx, mut rx) = tokio::sync::watch::channel(session_id); let span = info_span!(parent: None, "connection", %conn_id); - let cold_start_info = ctx.cold_start_info; + let cold_start_info = ctx.cold_start_info(); span.in_scope(|| { info!(cold_start_info = cold_start_info.as_str(), %conn_info, %session_id, "new connection"); }); @@ -766,7 +766,6 @@ mod tests { opt_in: false, max_total_conns: 3, }, - request_timeout: Duration::from_secs(1), cancel_set: CancelSet::new(0), client_conn_threshold: u64::MAX, })); diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 6400e4ac7b..e5b6536328 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -144,7 +144,7 @@ impl UserFacingError for ConnInfoError { } fn get_conn_info( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, headers: &HeaderMap, tls: &TlsConfig, ) -> Result { @@ -203,7 +203,6 @@ fn get_conn_info( options = Some(NeonOptions::parse_options_raw(&value)); } } - ctx.set_db_options(params.freeze()); let user_info = ComputeUserInfo { endpoint, @@ -224,12 +223,12 
@@ fn get_conn_info( // TODO: return different http error codes pub async fn handle( config: &'static ProxyConfig, - mut ctx: RequestMonitoring, + ctx: RequestMonitoring, request: Request, backend: Arc, cancel: CancellationToken, ) -> Result>, ApiError> { - let result = handle_inner(cancel, config, &mut ctx, request, backend).await; + let result = handle_inner(cancel, config, &ctx, request, backend).await; let mut response = match result { Ok(r) => { @@ -482,13 +481,16 @@ fn map_isolation_level_to_headers(level: IsolationLevel) -> Option async fn handle_inner( cancel: CancellationToken, config: &'static ProxyConfig, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, request: Request, backend: Arc, ) -> Result>, SqlOverHttpError> { - let _requeset_gauge = Metrics::get().proxy.connection_requests.guard(ctx.protocol); + let _requeset_gauge = Metrics::get() + .proxy + .connection_requests + .guard(ctx.protocol()); info!( - protocol = %ctx.protocol, + protocol = %ctx.protocol(), "handling interactive connection from client" ); @@ -544,7 +546,7 @@ async fn handle_inner( .await?; // not strictly necessary to mark success here, // but it's just insurance for if we forget it somewhere else - ctx.latency_timer.success(); + ctx.success(); Ok::<_, HttpConnError>(client) } .map_err(SqlOverHttpError::from), diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 0d5b88f07b..4fba4d141c 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -129,7 +129,7 @@ impl AsyncBufRead for WebSocketRw { pub async fn serve_websocket( config: &'static ProxyConfig, - mut ctx: RequestMonitoring, + ctx: RequestMonitoring, websocket: OnUpgrade, cancellation_handler: Arc, endpoint_rate_limiter: Arc, @@ -145,7 +145,7 @@ pub async fn serve_websocket( let res = Box::pin(handle_client( config, - &mut ctx, + &ctx, cancellation_handler, WebSocketRw::new(websocket), ClientMode::Websockets { hostname }, diff --git a/pyproject.toml 
b/pyproject.toml index 0d5782ac7c..ad3961ef55 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,7 @@ [tool.poetry] -name = "neon" -version = "0.1.0" description = "" authors = [] +package-mode = false [tool.poetry.dependencies] python = "^3.9" @@ -33,7 +32,7 @@ psutil = "^5.9.4" types-psutil = "^5.9.5.12" types-toml = "^0.10.8.6" pytest-httpserver = "^1.0.8" -aiohttp = "3.9.4" +aiohttp = "3.10.2" pytest-rerunfailures = "^13.0" types-pytest-lazy-fixture = "^0.6.3.3" pytest-split = "^0.8.1" @@ -42,6 +41,7 @@ httpx = {extras = ["http2"], version = "^0.26.0"} pytest-repeat = "^0.9.3" websockets = "^12.0" clickhouse-connect = "^0.7.16" +kafka-python = "^2.0.2" [tool.poetry.group.dev.dependencies] mypy = "==1.3.0" @@ -75,6 +75,7 @@ module = [ "allure.*", "allure_commons.*", "allure_pytest.*", + "kafka.*", ] ignore_missing_imports = true diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 3510359591..368b8d300a 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.80.0" +channel = "1.80.1" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 2365fd0587..41c2d3fe08 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -170,11 +170,6 @@ struct Args { /// still needed for existing replication connection. #[arg(long)] walsenders_keep_horizon: bool, - /// Enable partial backup. If disabled, safekeeper will not upload partial - /// segments to remote storage. - /// TODO: now partial backup is always enabled, remove this flag. - #[arg(long)] - partial_backup_enabled: bool, /// Controls how long backup will wait until uploading the partial segment. 
#[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_PARTIAL_BACKUP_TIMEOUT, verbatim_doc_comment)] partial_backup_timeout: Duration, @@ -347,7 +342,6 @@ async fn main() -> anyhow::Result<()> { sk_auth_token, current_thread_runtime: args.current_thread_runtime, walsenders_keep_horizon: args.walsenders_keep_horizon, - partial_backup_enabled: true, partial_backup_timeout: args.partial_backup_timeout, disable_periodic_broker_push: args.disable_periodic_broker_push, enable_offload: args.enable_offload, diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 56d61e8287..2e11a279ca 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -93,7 +93,6 @@ pub struct SafeKeeperConf { pub sk_auth_token: Option, pub current_thread_runtime: bool, pub walsenders_keep_horizon: bool, - pub partial_backup_enabled: bool, pub partial_backup_timeout: Duration, pub disable_periodic_broker_push: bool, pub enable_offload: bool, @@ -137,7 +136,6 @@ impl SafeKeeperConf { max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES, current_thread_runtime: false, walsenders_keep_horizon: false, - partial_backup_enabled: false, partial_backup_timeout: Duration::from_secs(0), disable_periodic_broker_push: false, enable_offload: false, diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index c224dcd398..482614fac7 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -544,8 +544,8 @@ impl Manager { /// Spawns partial WAL backup task if needed. 
async fn update_partial_backup(&mut self, state: &StateSnapshot) { - // check if partial backup is enabled and should be started - if !self.conf.is_wal_backup_enabled() || !self.conf.partial_backup_enabled { + // check if WAL backup is enabled and should be started + if !self.conf.is_wal_backup_enabled() { return; } diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index 0c6d97ddfa..771d905c90 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -181,7 +181,6 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { sk_auth_token: None, current_thread_runtime: false, walsenders_keep_horizon: false, - partial_backup_enabled: false, partial_backup_timeout: Duration::from_secs(0), disable_periodic_broker_push: false, enable_offload: false, diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 0a4af543ab..15acd0e49c 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -642,8 +642,7 @@ async fn main() -> Result<(), Box> { logging::replace_panic_hook_with_tracing_panic_hook().forget(); // initialize sentry if SENTRY_DSN is provided let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); - info!("version: {GIT_VERSION}"); - info!("build_tag: {BUILD_TAG}"); + info!("version: {GIT_VERSION} build_tag: {BUILD_TAG}"); metrics::set_build_info_metric(GIT_VERSION, BUILD_TAG); // On any shutdown signal, log receival and exit. 
diff --git a/storage_controller/src/drain_utils.rs b/storage_controller/src/drain_utils.rs new file mode 100644 index 0000000000..dea1f04649 --- /dev/null +++ b/storage_controller/src/drain_utils.rs @@ -0,0 +1,225 @@ +use std::{ + collections::{BTreeMap, HashMap}, + sync::Arc, +}; + +use pageserver_api::controller_api::NodeSchedulingPolicy; +use utils::{id::NodeId, shard::TenantShardId}; + +use crate::{ + background_node_operations::OperationError, node::Node, scheduler::Scheduler, + tenant_shard::TenantShard, +}; + +pub(crate) struct TenantShardIterator { + tenants_accessor: F, + inspected_all_shards: bool, + last_inspected_shard: Option, +} + +/// A simple iterator which can be used in tandem with [`crate::service::Service`] +/// to iterate over all known tenant shard ids without holding the lock on the +/// service state at all times. +impl TenantShardIterator +where + F: Fn(Option) -> Option, +{ + pub(crate) fn new(tenants_accessor: F) -> Self { + Self { + tenants_accessor, + inspected_all_shards: false, + last_inspected_shard: None, + } + } + + /// Returns the next tenant shard id if one exists + pub(crate) fn next(&mut self) -> Option { + if self.inspected_all_shards { + return None; + } + + match (self.tenants_accessor)(self.last_inspected_shard) { + Some(tid) => { + self.last_inspected_shard = Some(tid); + Some(tid) + } + None => { + self.inspected_all_shards = true; + None + } + } + } + + /// Returns true when the end of the iterator is reached and false otherwise + pub(crate) fn finished(&self) -> bool { + self.inspected_all_shards + } +} + +/// Check that the state of the node being drained is as expected: +/// node is present in memory and scheduling policy is set to [`NodeSchedulingPolicy::Draining`] +pub(crate) fn validate_node_state( + node_id: &NodeId, + nodes: Arc>, +) -> Result<(), OperationError> { + let node = nodes.get(node_id).ok_or(OperationError::NodeStateChanged( + format!("node {} was removed", node_id).into(), + ))?; + + let 
current_policy = node.get_scheduling(); + if !matches!(current_policy, NodeSchedulingPolicy::Draining) { + // TODO(vlad): maybe cancel pending reconciles before erroring out. need to think + // about it + return Err(OperationError::NodeStateChanged( + format!("node {} changed state to {:?}", node_id, current_policy).into(), + )); + } + + Ok(()) +} + +/// Struct that houses a few utility methods for draining pageserver nodes +pub(crate) struct TenantShardDrain { + pub(crate) drained_node: NodeId, + pub(crate) tenant_shard_id: TenantShardId, +} + +impl TenantShardDrain { + /// Check if the tenant shard under question is eligible for drainining: + /// it's primary attachment is on the node being drained + pub(crate) fn tenant_shard_eligible_for_drain( + &self, + tenants: &BTreeMap, + scheduler: &Scheduler, + ) -> Option { + let tenant_shard = tenants.get(&self.tenant_shard_id)?; + + if *tenant_shard.intent.get_attached() != Some(self.drained_node) { + return None; + } + + match scheduler.node_preferred(tenant_shard.intent.get_secondary()) { + Some(node) => Some(node), + None => { + tracing::warn!( + tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), + "No eligible secondary while draining {}", self.drained_node + ); + + None + } + } + } + + /// Attempt to reschedule the tenant shard under question to one of its secondary locations + /// Returns an Err when the operation should be aborted and Ok(None) when the tenant shard + /// should be skipped. + pub(crate) fn reschedule_to_secondary<'a>( + &self, + destination: NodeId, + tenants: &'a mut BTreeMap, + scheduler: &mut Scheduler, + nodes: &Arc>, + ) -> Result, OperationError> { + let tenant_shard = match tenants.get_mut(&self.tenant_shard_id) { + Some(some) => some, + None => { + // Tenant shard was removed in the meantime. 
+ // Skip to the next one, but don't fail the overall operation + return Ok(None); + } + }; + + if !nodes.contains_key(&destination) { + return Err(OperationError::NodeStateChanged( + format!("node {} was removed", destination).into(), + )); + } + + if !tenant_shard.intent.get_secondary().contains(&destination) { + tracing::info!( + tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), + "Secondary moved away from {destination} during drain" + ); + + return Ok(None); + } + + match tenant_shard.reschedule_to_secondary(Some(destination), scheduler) { + Err(e) => { + tracing::warn!( + tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), + "Scheduling error when draining pageserver {} : {}", self.drained_node, e + ); + + Ok(None) + } + Ok(()) => { + let scheduled_to = tenant_shard.intent.get_attached(); + tracing::info!( + tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), + "Rescheduled shard while draining node {}: {} -> {:?}", + self.drained_node, + self.drained_node, + scheduled_to + ); + + Ok(Some(tenant_shard)) + } + } + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use utils::{ + id::TenantId, + shard::{ShardCount, ShardNumber, TenantShardId}, + }; + + use super::TenantShardIterator; + + #[test] + fn test_tenant_shard_iterator() { + let tenant_id = TenantId::generate(); + let shard_count = ShardCount(8); + + let mut tenant_shards = Vec::default(); + for i in 0..shard_count.0 { + tenant_shards.push(( + TenantShardId { + tenant_id, + shard_number: ShardNumber(i), + shard_count, + }, + (), + )) + } + + let tenant_shards = Arc::new(tenant_shards); + + let mut tid_iter = TenantShardIterator::new({ + let tenants = tenant_shards.clone(); + move |last_inspected_shard: Option| { + let entry = match last_inspected_shard { + Some(skip_past) => { + let mut cursor = tenants.iter().skip_while(|(tid, _)| *tid != skip_past); + cursor.nth(1) + } + None => 
tenants.first(), + }; + + entry.map(|(tid, _)| tid).copied() + } + }); + + let mut iterated_over = Vec::default(); + while let Some(tid) = tid_iter.next() { + iterated_over.push((tid, ())); + } + + assert_eq!(iterated_over, *tenant_shards); + } +} diff --git a/storage_controller/src/lib.rs b/storage_controller/src/lib.rs index 8caf638904..26c258c466 100644 --- a/storage_controller/src/lib.rs +++ b/storage_controller/src/lib.rs @@ -4,6 +4,7 @@ use utils::seqwait::MonotonicCounter; mod auth; mod background_node_operations; mod compute_hook; +mod drain_utils; mod heartbeater; pub mod http; mod id_lock_map; diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 2799f21fdc..a66e9128bc 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -92,6 +92,11 @@ struct Cli { /// Chaos testing #[arg(long)] chaos_interval: Option, + + // Maximum acceptable lag for the secondary location while draining + // a pageserver + #[arg(long)] + max_secondary_lag_bytes: Option, } enum StrictMode { @@ -279,6 +284,7 @@ async fn async_main() -> anyhow::Result<()> { .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT), split_threshold: args.split_threshold, neon_local_repo_dir: args.neon_local_repo_dir, + max_secondary_lag_bytes: args.max_secondary_lag_bytes, }; // After loading secrets & config, but before starting anything else, apply database migrations diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 254fdb364e..94db879ade 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -39,6 +39,9 @@ pub(super) struct Reconciler { /// to detach this tenant shard. 
pub(crate) detach: Vec, + /// Configuration specific to this reconciler + pub(crate) reconciler_config: ReconcilerConfig, + pub(crate) config: TenantConfig, pub(crate) observed: ObservedState, @@ -73,6 +76,65 @@ pub(super) struct Reconciler { pub(crate) persistence: Arc, } +pub(crate) struct ReconcilerConfigBuilder { + config: ReconcilerConfig, +} + +impl ReconcilerConfigBuilder { + pub(crate) fn new() -> Self { + Self { + config: ReconcilerConfig::default(), + } + } + + pub(crate) fn secondary_warmup_timeout(self, value: Duration) -> Self { + Self { + config: ReconcilerConfig { + secondary_warmup_timeout: Some(value), + ..self.config + }, + } + } + + pub(crate) fn secondary_download_request_timeout(self, value: Duration) -> Self { + Self { + config: ReconcilerConfig { + secondary_download_request_timeout: Some(value), + ..self.config + }, + } + } + + pub(crate) fn build(self) -> ReconcilerConfig { + self.config + } +} + +#[derive(Default, Debug, Copy, Clone)] +pub(crate) struct ReconcilerConfig { + // During live migration give up on warming-up the secondary + // after this timeout. + secondary_warmup_timeout: Option, + + // During live migrations this is the amount of time that + // the pagserver will hold our poll. 
+ secondary_download_request_timeout: Option, +} + +impl ReconcilerConfig { + pub(crate) fn get_secondary_warmup_timeout(&self) -> Duration { + const SECONDARY_WARMUP_TIMEOUT_DEFAULT: Duration = Duration::from_secs(300); + self.secondary_warmup_timeout + .unwrap_or(SECONDARY_WARMUP_TIMEOUT_DEFAULT) + } + + pub(crate) fn get_secondary_download_request_timeout(&self) -> Duration { + const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT_DEFAULT: Duration = Duration::from_secs(20); + self.secondary_download_request_timeout + .unwrap_or(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT_DEFAULT) + } +} + /// RAII resource units granted to a Reconciler, which it should keep alive until it finishes doing I/O pub(crate) struct ReconcileUnits { _sem_units: tokio::sync::OwnedSemaphorePermit, @@ -300,11 +362,13 @@ impl Reconciler { ) -> Result<(), ReconcileError> { // This is not the timeout for a request, but the total amount of time we're willing to wait // for a secondary location to get up to date before - const TOTAL_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(300); + let total_download_timeout = self.reconciler_config.get_secondary_warmup_timeout(); // This the long-polling interval for the secondary download requests we send to destination pageserver // during a migration. 
- const REQUEST_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(20); + let request_download_timeout = self + .reconciler_config + .get_secondary_download_request_timeout(); let started_at = Instant::now(); @@ -315,14 +379,14 @@ impl Reconciler { client .tenant_secondary_download( tenant_shard_id, - Some(REQUEST_DOWNLOAD_TIMEOUT), + Some(request_download_timeout), ) .await }, &self.service_config.jwt_token, 1, 3, - REQUEST_DOWNLOAD_TIMEOUT * 2, + request_download_timeout * 2, &self.cancel, ) .await @@ -350,7 +414,7 @@ impl Reconciler { return Ok(()); } else if status == StatusCode::ACCEPTED { let total_runtime = started_at.elapsed(); - if total_runtime > TOTAL_DOWNLOAD_TIMEOUT { + if total_runtime > total_download_timeout { tracing::warn!("Timed out after {}ms downloading layers to {node}. Progress so far: {}/{} layers, {}/{} bytes", total_runtime.as_millis(), progress.layers_downloaded, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 6940bf2c64..31b2d0c3f5 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -14,10 +14,11 @@ use crate::{ Drain, Fill, Operation, OperationError, OperationHandler, MAX_RECONCILES_PER_OPERATION, }, compute_hook::NotifyError, + drain_utils::{self, TenantShardDrain, TenantShardIterator}, id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard}, metrics::LeadershipStatusGroup, persistence::{AbortShardSplitStatus, MetadataHealthPersistence, TenantFilter}, - reconciler::{ReconcileError, ReconcileUnits}, + reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder}, scheduler::{MaySchedule, ScheduleContext, ScheduleMode}, tenant_shard::{ MigrateAttachment, ReconcileNeeded, ReconcilerStatus, ScheduleOptimization, @@ -325,6 +326,12 @@ pub struct Config { // TODO: make this cfg(feature = "testing") pub neon_local_repo_dir: Option, + + // Maximum acceptable download lag for the secondary location + // while 
draining a node. If the secondary location is lagging + // by more than the configured amount, then the secondary is not + // upgraded to primary. + pub max_secondary_lag_bytes: Option, } impl From for ApiError { @@ -2954,7 +2961,6 @@ impl Service { } // no shard needs to go first/last; the operation should be idempotent - // TODO: it would be great to ensure that all shards return the same error let mut results = self .tenant_for_shards(targets, |tenant_shard_id, node| { futures::FutureExt::boxed(detach_one( @@ -2973,6 +2979,7 @@ impl Service { .filter(|(_, res)| res != &any.1) .collect::>(); if !mismatching.is_empty() { + // this can be hit by races which should not happen because operation lock on cplane let matching = results.len() - mismatching.len(); tracing::error!( matching, @@ -5187,11 +5194,22 @@ impl Service { Ok(()) } - /// Wrap [`TenantShard`] reconciliation methods with acquisition of [`Gate`] and [`ReconcileUnits`], + /// Like [`Self::maybe_configured_reconcile_shard`], but uses the default reconciler + /// configuration fn maybe_reconcile_shard( &self, shard: &mut TenantShard, nodes: &Arc>, + ) -> Option { + self.maybe_configured_reconcile_shard(shard, nodes, ReconcilerConfig::default()) + } + + /// Wrap [`TenantShard`] reconciliation methods with acquisition of [`Gate`] and [`ReconcileUnits`], + fn maybe_configured_reconcile_shard( + &self, + shard: &mut TenantShard, + nodes: &Arc>, + reconciler_config: ReconcilerConfig, ) -> Option { let reconcile_needed = shard.get_reconcile_needed(nodes); @@ -5241,6 +5259,7 @@ impl Service { &self.result_tx, nodes, &self.compute_hook, + reconciler_config, &self.config, &self.persistence, units, @@ -5715,18 +5734,92 @@ impl Service { self.gate.close().await; } + /// Spot check the download lag for a secondary location of a shard. + /// Should be used as a heuristic, since it's not always precise: the + /// secondary might have not downloaded the new heat map yet and, hence, + /// is not aware of the lag. 
+ /// + /// Returns: + /// * Ok(None) if the lag could not be determined from the status, + /// * Ok(Some(_)) if the lag could be determind + /// * Err on failures to query the pageserver. + async fn secondary_lag( + &self, + secondary: &NodeId, + tenant_shard_id: TenantShardId, + ) -> Result, mgmt_api::Error> { + let nodes = self.inner.read().unwrap().nodes.clone(); + let node = nodes.get(secondary).ok_or(mgmt_api::Error::ApiError( + StatusCode::NOT_FOUND, + format!("Node with id {} not found", secondary), + ))?; + + match node + .with_client_retries( + |client| async move { client.tenant_secondary_status(tenant_shard_id).await }, + &self.config.jwt_token, + 1, + 3, + Duration::from_millis(250), + &self.cancel, + ) + .await + { + Some(Ok(status)) => match status.heatmap_mtime { + Some(_) => Ok(Some(status.bytes_total - status.bytes_downloaded)), + None => Ok(None), + }, + Some(Err(e)) => Err(e), + None => Err(mgmt_api::Error::Cancelled), + } + } + /// Drain a node by moving the shards attached to it as primaries. /// This is a long running operation and it should run as a separate Tokio task. pub(crate) async fn drain_node( - &self, + self: &Arc, node_id: NodeId, cancel: CancellationToken, ) -> Result<(), OperationError> { - let mut last_inspected_shard: Option = None; - let mut inspected_all_shards = false; + const MAX_SECONDARY_LAG_BYTES_DEFAULT: u64 = 256 * 1024 * 1024; + let max_secondary_lag_bytes = self + .config + .max_secondary_lag_bytes + .unwrap_or(MAX_SECONDARY_LAG_BYTES_DEFAULT); + + // By default, live migrations are generous about the wait time for getting + // the secondary location up to speed. When draining, give up earlier in order + // to not stall the operation when a cold secondary is encountered. 
+ const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20); + const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5); + let reconciler_config = ReconcilerConfigBuilder::new() + .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT) + .secondary_download_request_timeout(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT) + .build(); + let mut waiters = Vec::new(); - while !inspected_all_shards { + let mut tid_iter = TenantShardIterator::new({ + let service = self.clone(); + move |last_inspected_shard: Option| { + let locked = &service.inner.read().unwrap(); + let tenants = &locked.tenants; + let entry = match last_inspected_shard { + Some(skip_past) => { + // Skip to the last seen tenant shard id + let mut cursor = tenants.iter().skip_while(|(tid, _)| **tid != skip_past); + + // Skip past the last seen + cursor.nth(1) + } + None => tenants.first_key_value(), + }; + + entry.map(|(tid, _)| tid).copied() + } + }); + + while !tid_iter.finished() { if cancel.is_cancelled() { match self .node_configure(node_id, None, Some(NodeSchedulingPolicy::Active)) @@ -5745,71 +5838,82 @@ impl Service { } } - { - let mut locked = self.inner.write().unwrap(); - let (nodes, tenants, scheduler) = locked.parts_mut(); + drain_utils::validate_node_state(&node_id, self.inner.read().unwrap().nodes.clone())?; - let node = nodes.get(&node_id).ok_or(OperationError::NodeStateChanged( - format!("node {node_id} was removed").into(), - ))?; - - let current_policy = node.get_scheduling(); - if !matches!(current_policy, NodeSchedulingPolicy::Draining) { - // TODO(vlad): maybe cancel pending reconciles before erroring out. 
need to think - // about it - return Err(OperationError::NodeStateChanged( - format!("node {node_id} changed state to {current_policy:?}").into(), - )); - } - - let mut cursor = tenants.iter_mut().skip_while({ - let skip_past = last_inspected_shard; - move |(tid, _)| match skip_past { - Some(last) => **tid != last, - None => false, + while waiters.len() < MAX_RECONCILES_PER_OPERATION { + let tid = match tid_iter.next() { + Some(tid) => tid, + None => { + break; } - }); + }; - while waiters.len() < MAX_RECONCILES_PER_OPERATION { - let (tid, tenant_shard) = match cursor.next() { - Some(some) => some, + let tid_drain = TenantShardDrain { + drained_node: node_id, + tenant_shard_id: tid, + }; + + let dest_node_id = { + let locked = self.inner.read().unwrap(); + + match tid_drain + .tenant_shard_eligible_for_drain(&locked.tenants, &locked.scheduler) + { + Some(node_id) => node_id, None => { - inspected_all_shards = true; - break; + continue; } - }; + } + }; - // If the shard is not attached to the node being drained, skip it. - if *tenant_shard.intent.get_attached() != Some(node_id) { - last_inspected_shard = Some(*tid); + match self.secondary_lag(&dest_node_id, tid).await { + Ok(Some(lag)) if lag <= max_secondary_lag_bytes => { + // The secondary is reasonably up to date. + // Migrate to it + } + Ok(Some(lag)) => { + tracing::info!( + tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), + "Secondary on node {dest_node_id} is lagging by {lag}. Skipping reconcile." + ); continue; } + Ok(None) => { + tracing::info!( + tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), + "Could not determine lag for secondary on node {dest_node_id}. Skipping reconcile." + ); + continue; + } + Err(err) => { + tracing::warn!( + tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), + "Failed to get secondary lag from node {dest_node_id}. 
Skipping reconcile: {err}" + ); + continue; + } + } - match tenant_shard.reschedule_to_secondary(None, scheduler) { - Err(e) => { - tracing::warn!( - tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), - "Scheduling error when draining pageserver {} : {e}", node_id - ); - } - Ok(()) => { - let scheduled_to = tenant_shard.intent.get_attached(); - tracing::info!( - tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), - "Rescheduled shard while draining node {}: {} -> {:?}", - node_id, - node_id, - scheduled_to - ); + { + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + let rescheduled = tid_drain.reschedule_to_secondary( + dest_node_id, + tenants, + scheduler, + nodes, + )?; - let waiter = self.maybe_reconcile_shard(tenant_shard, nodes); - if let Some(some) = waiter { - waiters.push(some); - } + if let Some(tenant_shard) = rescheduled { + let waiter = self.maybe_configured_reconcile_shard( + tenant_shard, + nodes, + reconciler_config, + ); + if let Some(some) = waiter { + waiters.push(some); } } - - last_inspected_shard = Some(*tid); } } diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index e250f29f98..1fcc3c8547 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -7,7 +7,7 @@ use std::{ use crate::{ metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome}, persistence::TenantShardPersistence, - reconciler::ReconcileUnits, + reconciler::{ReconcileUnits, ReconcilerConfig}, scheduler::{AffinityScore, MaySchedule, RefCountUpdate, ScheduleContext}, service::ReconcileResultRequest, }; @@ -1063,6 +1063,7 @@ impl TenantShard { result_tx: &tokio::sync::mpsc::UnboundedSender, pageservers: &Arc>, compute_hook: &Arc, + reconciler_config: ReconcilerConfig, service_config: &service::Config, persistence: &Arc, units: ReconcileUnits, @@ -1101,6 +1102,7 @@ impl TenantShard { generation: self.generation, intent: 
reconciler_intent, detach, + reconciler_config, config: self.config.clone(), observed: self.observed.clone(), compute_hook: compute_hook.clone(), diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index 5aa9e88c40..35ec69fd50 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -92,7 +92,7 @@ pub(crate) async fn branch_cleanup_and_check_errors( .push(format!("index_part.json version: {}", index_part.version())) } - let mut newest_versions = IndexPart::KNOWN_VERSIONS.iter().rev().take(2); + let mut newest_versions = IndexPart::KNOWN_VERSIONS.iter().rev().take(3); if !newest_versions.any(|ip| ip == &index_part.version()) { info!( "index_part.json version is not latest: {}", @@ -172,8 +172,11 @@ pub(crate) async fn branch_cleanup_and_check_errors( } } BlobDataParseResult::Relic => {} - BlobDataParseResult::Incorrect(parse_errors) => result.errors.extend( - parse_errors + BlobDataParseResult::Incorrect { + errors, + s3_layers: _, + } => result.errors.extend( + errors .into_iter() .map(|error| format!("parse error: {error}")), ), @@ -300,7 +303,10 @@ pub(crate) enum BlobDataParseResult { }, /// The remains of a deleted Timeline (i.e. 
an initdb archive only) Relic, - Incorrect(Vec), + Incorrect { + errors: Vec, + s3_layers: HashSet<(LayerName, Generation)>, + }, } pub(crate) fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String> { @@ -443,7 +449,7 @@ pub(crate) async fn list_timeline_blobs( } Ok(S3TimelineBlobData { - blob_data: BlobDataParseResult::Incorrect(errors), + blob_data: BlobDataParseResult::Incorrect { errors, s3_layers }, unused_index_keys: index_part_keys, unknown_keys, }) diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index a111c31844..cbc836755a 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -208,21 +208,21 @@ async fn main() -> anyhow::Result<()> { } if summary.is_fatal() { - Err(anyhow::anyhow!("Fatal scrub errors detected")) + tracing::error!("Fatal scrub errors detected"); } else if summary.is_empty() { // Strictly speaking an empty bucket is a valid bucket, but if someone ran the // scrubber they were likely expecting to scan something, and if we see no timelines // at all then it's likely due to some configuration issues like a bad prefix - Err(anyhow::anyhow!( + tracing::error!( "No timelines found in bucket {} prefix {}", bucket_config.bucket, bucket_config .prefix_in_bucket .unwrap_or("".to_string()) - )) - } else { - Ok(()) + ); } + + Ok(()) } } } diff --git a/storage_scrubber/src/metadata_stream.rs b/storage_scrubber/src/metadata_stream.rs index c702c0c312..54812ffc94 100644 --- a/storage_scrubber/src/metadata_stream.rs +++ b/storage_scrubber/src/metadata_stream.rs @@ -4,7 +4,7 @@ use anyhow::{anyhow, Context}; use async_stream::{stream, try_stream}; use aws_sdk_s3::{types::ObjectIdentifier, Client}; use futures::StreamExt; -use remote_storage::{GenericRemoteStorage, ListingMode}; +use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePath}; use tokio_stream::Stream; use crate::{ @@ -276,3 +276,33 @@ pub(crate) fn stream_listing<'a>( } } } + +pub(crate) fn 
stream_listing_generic<'a>( + remote_client: &'a GenericRemoteStorage, + target: &'a S3Target, +) -> impl Stream)>> + 'a { + let listing_mode = if target.delimiter.is_empty() { + ListingMode::NoDelimiter + } else { + ListingMode::WithDelimiter + }; + try_stream! { + let mut objects_stream = std::pin::pin!(stream_objects_with_retries( + remote_client, + listing_mode, + target, + )); + while let Some(list) = objects_stream.next().await { + let list = list?; + if target.delimiter.is_empty() { + for key in list.keys { + yield (key.key.clone(), Some(key)); + } + } else { + for key in list.prefixes { + yield (key, None); + } + } + } + } +} diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index 69896caa82..ff230feae3 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -389,10 +389,13 @@ async fn gc_ancestor( // Post-deletion tenant location: don't try and GC it. continue; } - BlobDataParseResult::Incorrect(reasons) => { + BlobDataParseResult::Incorrect { + errors, + s3_layers: _, // TODO(yuchen): could still check references to these s3 layers? + } => { // Our primary purpose isn't to report on bad data, but log this rather than skipping silently tracing::warn!( - "Skipping ancestor GC for timeline {ttid}, bad metadata: {reasons:?}" + "Skipping ancestor GC for timeline {ttid}, bad metadata: {errors:?}" ); continue; } @@ -518,9 +521,12 @@ pub async fn pageserver_physical_gc( // Post-deletion tenant location: don't try and GC it. 
return Ok(summary); } - BlobDataParseResult::Incorrect(reasons) => { + BlobDataParseResult::Incorrect { + errors, + s3_layers: _, + } => { // Our primary purpose isn't to report on bad data, but log this rather than skipping silently - tracing::warn!("Skipping timeline {ttid}, bad metadata: {reasons:?}"); + tracing::warn!("Skipping timeline {ttid}, bad metadata: {errors:?}"); return Ok(summary); } }; diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs index dc410bde41..b9630056e1 100644 --- a/storage_scrubber/src/scan_pageserver_metadata.rs +++ b/storage_scrubber/src/scan_pageserver_metadata.rs @@ -290,13 +290,21 @@ pub async fn scan_metadata( } } - if let BlobDataParseResult::Parsed { - index_part: _index_part, - index_part_generation: _index_part_generation, - s3_layers, - } = &data.blob_data - { - tenant_objects.push(ttid, s3_layers.clone()); + match &data.blob_data { + BlobDataParseResult::Parsed { + index_part: _index_part, + index_part_generation: _index_part_generation, + s3_layers, + } => { + tenant_objects.push(ttid, s3_layers.clone()); + } + BlobDataParseResult::Relic => (), + BlobDataParseResult::Incorrect { + errors: _, + s3_layers, + } => { + tenant_objects.push(ttid, s3_layers.clone()); + } } tenant_timeline_results.push((ttid, data)); } diff --git a/storage_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs index 553adf8f46..08a4541c5c 100644 --- a/storage_scrubber/src/scan_safekeeper_metadata.rs +++ b/storage_scrubber/src/scan_safekeeper_metadata.rs @@ -1,10 +1,10 @@ use std::{collections::HashSet, str::FromStr, sync::Arc}; -use aws_sdk_s3::Client; use futures::stream::{StreamExt, TryStreamExt}; use once_cell::sync::OnceCell; use pageserver_api::shard::TenantShardId; use postgres_ffi::{XLogFileName, PG_TLI}; +use remote_storage::GenericRemoteStorage; use serde::Serialize; use tokio_postgres::types::PgLsn; use tracing::{error, info, trace}; @@ -14,8 
+14,9 @@ use utils::{ }; use crate::{ - cloud_admin_api::CloudAdminApiClient, init_remote, metadata_stream::stream_listing, - BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId, + cloud_admin_api::CloudAdminApiClient, init_remote_generic, + metadata_stream::stream_listing_generic, BucketConfig, ConsoleConfig, NodeKind, RootTarget, + TenantShardTimelineId, }; /// Generally we should ask safekeepers, but so far we use everywhere default 16MB. @@ -106,7 +107,7 @@ pub async fn scan_safekeeper_metadata( let timelines = client.query(&query, &[]).await?; info!("loaded {} timelines", timelines.len()); - let (s3_client, target) = init_remote(bucket_config, NodeKind::Safekeeper).await?; + let (remote_client, target) = init_remote_generic(bucket_config, NodeKind::Safekeeper).await?; let console_config = ConsoleConfig::from_env()?; let cloud_admin_api_client = CloudAdminApiClient::new(console_config); @@ -119,7 +120,7 @@ pub async fn scan_safekeeper_metadata( let backup_lsn: Lsn = Lsn(u64::from(backup_lsn_pg)); let ttid = TenantTimelineId::new(tenant_id, timeline_id); check_timeline( - &s3_client, + &remote_client, &target, &cloud_admin_api_client, ttid, @@ -156,7 +157,7 @@ struct TimelineCheckResult { /// errors are logged to stderr; returns Ok(true) if timeline is consistent, /// Ok(false) if not, Err if failed to check. async fn check_timeline( - s3_client: &Client, + remote_client: &GenericRemoteStorage, root: &RootTarget, api_client: &CloudAdminApiClient, ttid: TenantTimelineId, @@ -187,12 +188,13 @@ async fn check_timeline( // we need files, so unset it. 
timeline_dir_target.delimiter = String::new(); - let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target)); + let mut stream = std::pin::pin!(stream_listing_generic(remote_client, &timeline_dir_target)); while let Some(obj) = stream.next().await { - let obj = obj?; - let key = obj.key(); + let (key, _obj) = obj?; let seg_name = key + .get_path() + .as_str() .strip_prefix(&timeline_dir_target.prefix_in_bucket) .expect("failed to extract segment name"); expected_segfiles.remove(seg_name); diff --git a/storage_scrubber/src/tenant_snapshot.rs b/storage_scrubber/src/tenant_snapshot.rs index 5a75f8d40e..1866e6ec80 100644 --- a/storage_scrubber/src/tenant_snapshot.rs +++ b/storage_scrubber/src/tenant_snapshot.rs @@ -269,7 +269,7 @@ impl SnapshotDownloader { .context("Downloading timeline")?; } BlobDataParseResult::Relic => {} - BlobDataParseResult::Incorrect(_) => { + BlobDataParseResult::Incorrect { .. } => { tracing::error!("Bad metadata in timeline {ttid}"); } }; diff --git a/test_runner/fixtures/neon_api.py b/test_runner/fixtures/neon_api.py index 658ed119a1..0636cfad06 100644 --- a/test_runner/fixtures/neon_api.py +++ b/test_runner/fixtures/neon_api.py @@ -285,9 +285,9 @@ class NeonApiEndpoint: self.project_id = project_id eps = neon_api.get_endpoints(project_id)["endpoints"] self.endpoint_id = eps[0]["id"] - self.connstr = neon_api.get_connection_uri(project_id, endpoint_id=self.endpoint_id)[ - "uri" - ] + self.connstr = neon_api.get_connection_uri( + project_id, endpoint_id=self.endpoint_id, pooled=False + )["uri"] pw = self.connstr.split("@")[0].split(":")[-1] self.pgbench_env = { "PGHOST": eps[0]["host"], diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 7289472de2..844a23d327 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -14,6 +14,7 @@ import textwrap import threading import time import uuid +from collections import defaultdict from 
contextlib import closing, contextmanager from dataclasses import dataclass from datetime import datetime @@ -978,7 +979,10 @@ class NeonEnvBuilder: and self.enable_scrub_on_exit ): try: - self.env.storage_scrubber.scan_metadata() + healthy, _ = self.env.storage_scrubber.scan_metadata() + if not healthy: + e = Exception("Remote storage metadata corrupted") + cleanup_error = e except Exception as e: log.error(f"Error during remote storage scrub: {e}") cleanup_error = e @@ -2664,6 +2668,69 @@ class NeonStorageController(MetricsGetter, LogUtils): log.info(f"Got failpoints request response code {res.status_code}") res.raise_for_status() + def get_tenants_placement(self) -> defaultdict[str, Dict[str, Any]]: + """ + Get the intent and observed placements of all tenants known to the storage controller. + """ + tenants = self.tenant_list() + + tenant_placement: defaultdict[str, Dict[str, Any]] = defaultdict( + lambda: { + "observed": {"attached": None, "secondary": []}, + "intent": {"attached": None, "secondary": []}, + } + ) + + for t in tenants: + for node_id, loc_state in t["observed"]["locations"].items(): + if ( + loc_state is not None + and "conf" in loc_state + and loc_state["conf"] is not None + and loc_state["conf"]["mode"] + in set(["AttachedSingle", "AttachedMulti", "AttachedStale"]) + ): + tenant_placement[t["tenant_shard_id"]]["observed"]["attached"] = int(node_id) + + if ( + loc_state is not None + and "conf" in loc_state + and loc_state["conf"] is not None + and loc_state["conf"]["mode"] == "Secondary" + ): + tenant_placement[t["tenant_shard_id"]]["observed"]["secondary"].append( + int(node_id) + ) + + if "attached" in t["intent"]: + tenant_placement[t["tenant_shard_id"]]["intent"]["attached"] = t["intent"][ + "attached" + ] + + if "secondary" in t["intent"]: + tenant_placement[t["tenant_shard_id"]]["intent"]["secondary"] += t["intent"][ + "secondary" + ] + + return tenant_placement + + def warm_up_all_secondaries(self): + log.info("Warming up all secondary 
locations") + + tenant_placement = self.get_tenants_placement() + for tid, placement in tenant_placement.items(): + assert placement["observed"]["attached"] is not None + primary_id = placement["observed"]["attached"] + + assert len(placement["observed"]["secondary"]) == 1 + secondary_id = placement["observed"]["secondary"][0] + + parsed_tid = TenantShardId.parse(tid) + self.env.get_pageserver(primary_id).http_client().tenant_heatmap_upload(parsed_tid) + self.env.get_pageserver(secondary_id).http_client().tenant_secondary_download( + parsed_tid, wait_ms=250 + ) + @property def workdir(self) -> Path: return self.env.repo_dir @@ -4411,14 +4478,19 @@ class StorageScrubber: assert stdout is not None return stdout - def scan_metadata(self, post_to_storage_controller: bool = False) -> Any: + def scan_metadata(self, post_to_storage_controller: bool = False) -> Tuple[bool, Any]: + """ + Returns the health status and the metadata summary. + """ args = ["scan-metadata", "--node-kind", "pageserver", "--json"] if post_to_storage_controller: args.append("--post") stdout = self.scrubber_cli(args, timeout=30) try: - return json.loads(stdout) + summary = json.loads(stdout) + healthy = not summary["with_errors"] and not summary["with_warnings"] + return healthy, summary except: log.error("Failed to decode JSON output from `scan-metadata`. 
Dumping stdout:") log.error(stdout) diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 192324f086..cd4261f1b8 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -61,6 +61,7 @@ class HistoricLayerInfo: remote: bool # None for image layers, true if pageserver thinks this is an L0 delta layer l0: Optional[bool] + visible: bool @classmethod def from_json(cls, d: Dict[str, Any]) -> HistoricLayerInfo: @@ -79,6 +80,7 @@ class HistoricLayerInfo: lsn_end=d.get("lsn_end"), remote=d["remote"], l0=l0_ness, + visible=d["access_stats"]["visible"], ) @@ -359,6 +361,12 @@ class PageserverHttpClient(requests.Session, MetricsGetter): self.verbose_error(res) return (res.status_code, res.json()) + def tenant_secondary_status(self, tenant_id: Union[TenantId, TenantShardId]): + url = f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/status" + res = self.get(url) + self.verbose_error(res) + return res.json() + def set_tenant_config(self, tenant_id: Union[TenantId, TenantShardId], config: dict[str, Any]): assert "tenant_id" not in config.keys() res = self.put( @@ -556,6 +564,22 @@ class PageserverHttpClient(requests.Session, MetricsGetter): assert isinstance(res_json, dict) return res_json + def timeline_block_gc(self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId): + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/block_gc", + ) + log.info(f"Got GC request response code: {res.status_code}") + self.verbose_error(res) + + def timeline_unblock_gc( + self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId + ): + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/unblock_gc", + ) + log.info(f"Got GC request response code: {res.status_code}") + self.verbose_error(res) + def timeline_compact( self, tenant_id: Union[TenantId, TenantShardId], @@ -839,7 
+863,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): timeline_id: TimelineId, batch_size: int | None = None, **kwargs, - ) -> List[TimelineId]: + ) -> Set[TimelineId]: params = {} if batch_size is not None: params["batch_size"] = batch_size @@ -850,7 +874,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): ) self.verbose_error(res) json = res.json() - return list(map(TimelineId, json["reparented_timelines"])) + return set(map(TimelineId, json["reparented_timelines"])) def evict_layer( self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index 0f2a997b1e..1b6c3c23ba 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -177,9 +177,14 @@ class S3Storage: def access_env_vars(self) -> Dict[str, str]: if self.aws_profile is not None: - return { + env = { "AWS_PROFILE": self.aws_profile, } + # Pass through HOME env var because AWS_PROFILE needs it in order to work + home = os.getenv("HOME") + if home is not None: + env["HOME"] = home + return env if self.access_key is not None and self.secret_key is not None: return { "AWS_ACCESS_KEY_ID": self.access_key, diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 7f54eb0b0a..4dc9f7caae 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -389,7 +389,10 @@ WaitUntilRet = TypeVar("WaitUntilRet") def wait_until( - number_of_iterations: int, interval: float, func: Callable[[], WaitUntilRet] + number_of_iterations: int, + interval: float, + func: Callable[[], WaitUntilRet], + show_intermediate_error=False, ) -> WaitUntilRet: """ Wait until 'func' returns successfully, without exception. 
Returns the @@ -402,6 +405,8 @@ def wait_until( except Exception as e: log.info("waiting for %s iteration %s failed", func, i + 1) last_exception = e + if show_intermediate_error: + log.info(e) time.sleep(interval) continue return res diff --git a/test_runner/logical_repl/README.md b/test_runner/logical_repl/README.md new file mode 100644 index 0000000000..8eca056dda --- /dev/null +++ b/test_runner/logical_repl/README.md @@ -0,0 +1,22 @@ +# Logical replication tests + +## Clickhouse + +```bash +export BENCHMARK_CONNSTR=postgres://user:pass@ep-abc-xyz-123.us-east-2.aws.neon.build/neondb + +docker compose -f clickhouse/docker-compose.yml up -d +pytest -m remote_cluster -k test_clickhouse +docker compose -f clickhouse/docker-compose.yml down +``` + +## Debezium + +```bash +export BENCHMARK_CONNSTR=postgres://user:pass@ep-abc-xyz-123.us-east-2.aws.neon.build/neondb + +docker compose -f debezium/docker-compose.yml up -d +pytest -m remote_cluster -k test_debezium +docker compose -f debezium/docker-compose.yml down + +``` \ No newline at end of file diff --git a/test_runner/logical_repl/clickhouse/docker-compose.yml b/test_runner/logical_repl/clickhouse/docker-compose.yml new file mode 100644 index 0000000000..e00038b811 --- /dev/null +++ b/test_runner/logical_repl/clickhouse/docker-compose.yml @@ -0,0 +1,9 @@ +services: + clickhouse: + image: clickhouse/clickhouse-server + user: "101:101" + container_name: clickhouse + hostname: clickhouse + ports: + - 127.0.0.1:8123:8123 + - 127.0.0.1:9000:9000 diff --git a/test_runner/logical_repl/debezium/docker-compose.yml b/test_runner/logical_repl/debezium/docker-compose.yml new file mode 100644 index 0000000000..fee127a2fd --- /dev/null +++ b/test_runner/logical_repl/debezium/docker-compose.yml @@ -0,0 +1,24 @@ +services: + zookeeper: + image: quay.io/debezium/zookeeper:2.7 + kafka: + image: quay.io/debezium/kafka:2.7 + environment: + ZOOKEEPER_CONNECT: "zookeeper:2181" + KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092 + 
KAFKA_BROKER_ID: 1 + KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 + KAFKA_JMX_PORT: 9991 + ports: + - 127.0.0.1:9092:9092 + debezium: + image: quay.io/debezium/connect:2.7 + environment: + BOOTSTRAP_SERVERS: kafka:9092 + GROUP_ID: 1 + CONFIG_STORAGE_TOPIC: debezium-config + OFFSET_STORAGE_TOPIC: debezium-offset + STATUS_STORAGE_TOPIC: debezium-status + DEBEZIUM_CONFIG_CONNECTOR_CLASS: io.debezium.connector.postgresql.PostgresConnector + ports: + - 127.0.0.1:8083:8083 diff --git a/test_runner/logical_repl/test_log_repl.py b/test_runner/logical_repl/test_clickhouse.py similarity index 85% rename from test_runner/logical_repl/test_log_repl.py rename to test_runner/logical_repl/test_clickhouse.py index 0a1aecfe2b..c5ed9bc8af 100644 --- a/test_runner/logical_repl/test_log_repl.py +++ b/test_runner/logical_repl/test_clickhouse.py @@ -1,8 +1,9 @@ """ -Test the logical replication in Neon with the different consumers +Test the logical replication in Neon with ClickHouse as a consumer """ import hashlib +import os import time import clickhouse_connect @@ -39,22 +40,15 @@ def test_clickhouse(remote_pg: RemotePostgres): """ Test the logical replication having ClickHouse as a client """ + clickhouse_host = "clickhouse" if ("CI" in os.environ) else "127.0.0.1" conn_options = remote_pg.conn_options() - for _ in range(5): - try: - conn = psycopg2.connect(remote_pg.connstr()) - except psycopg2.OperationalError as perr: - log.debug(perr) - time.sleep(1) - else: - break - raise TimeoutError + conn = psycopg2.connect(remote_pg.connstr()) cur = conn.cursor() cur.execute("DROP TABLE IF EXISTS table1") cur.execute("CREATE TABLE table1 (id integer primary key, column1 varchar(10));") cur.execute("INSERT INTO table1 (id, column1) VALUES (1, 'abc'), (2, 'def');") conn.commit() - client = clickhouse_connect.get_client(host="clickhouse") + client = clickhouse_connect.get_client(host=clickhouse_host) client.command("SET allow_experimental_database_materialized_postgresql=1") client.command( 
"CREATE DATABASE db1_postgres ENGINE = " diff --git a/test_runner/logical_repl/test_debezium.py b/test_runner/logical_repl/test_debezium.py new file mode 100644 index 0000000000..5426a06ca1 --- /dev/null +++ b/test_runner/logical_repl/test_debezium.py @@ -0,0 +1,190 @@ +""" +Test the logical replication in Neon with Debezium as a consumer +""" + +import json +import os +import time + +import psycopg2 +import pytest +import requests +from fixtures.log_helper import log +from fixtures.neon_fixtures import RemotePostgres +from fixtures.utils import wait_until + + +class DebeziumAPI: + """ + The class for Debezium API calls + """ + + def __init__(self): + self.__host = "debezium" if ("CI" in os.environ) else "127.0.0.1" + self.__base_url = f"http://{self.__host}:8083" + self.__connectors_url = f"{self.__base_url}/connectors" + + def __request(self, method, addurl="", **kwargs): + return requests.request( + method, + self.__connectors_url + addurl, + headers={"Accept": "application/json", "Content-type": "application/json"}, + timeout=60, + **kwargs, + ) + + def create_pg_connector(self, remote_pg: RemotePostgres, dbz_conn_name: str): + """ + Create a Postgres connector in debezium + """ + conn_options = remote_pg.conn_options() + payload = { + "name": dbz_conn_name, + "config": { + "connector.class": "io.debezium.connector.postgresql.PostgresConnector", + "tasks.max": "1", + "database.hostname": conn_options["host"], + "database.port": "5432", + "database.user": conn_options["user"], + "database.password": conn_options["password"], + "database.dbname": conn_options["dbname"], + "plugin.name": "pgoutput", + "topic.prefix": "dbserver1", + "schema.include.list": "inventory", + }, + } + return self.__request("POST", json=payload) + + def list_connectors(self): + """ + Returns a list of all connectors existent in Debezium. 
+ """ + resp = self.__request("GET") + assert resp.ok + return json.loads(resp.text) + + def del_connector(self, connector): + """ + Deletes the specified connector + """ + return self.__request("DELETE", f"/{connector}") + + +@pytest.fixture(scope="function") +def debezium(remote_pg: RemotePostgres): + """ + Prepare the Debezium API handler, connection + """ + conn = psycopg2.connect(remote_pg.connstr()) + cur = conn.cursor() + cur.execute("DROP SCHEMA IF EXISTS inventory CASCADE") + cur.execute("CREATE SCHEMA inventory") + cur.execute( + "CREATE TABLE inventory.customers (" + "id SERIAL NOT NULL PRIMARY KEY," + "first_name character varying(255) NOT NULL," + "last_name character varying(255) NOT NULL," + "email character varying(255) NOT NULL)" + ) + conn.commit() + dbz = DebeziumAPI() + assert len(dbz.list_connectors()) == 0 + dbz_conn_name = "inventory-connector" + resp = dbz.create_pg_connector(remote_pg, dbz_conn_name) + log.debug("%s %s %s", resp.status_code, resp.ok, resp.text) + assert resp.status_code == 201 + assert len(dbz.list_connectors()) == 1 + from kafka import KafkaConsumer + + consumer = KafkaConsumer( + "dbserver1.inventory.customers", + bootstrap_servers=["kafka:9092"], + auto_offset_reset="earliest", + enable_auto_commit=False, + ) + yield conn, consumer + resp = dbz.del_connector(dbz_conn_name) + assert resp.status_code == 204 + + +def get_kafka_msg(consumer, ts_ms, before=None, after=None) -> None: + """ + Gets the message from Kafka and checks its validity + Arguments: + consumer: the consumer object + ts_ms: timestamp in milliseconds of the change of db, the corresponding message must have + the later timestamp + before: a dictionary, if not None, the before field from the kafka message must + have the same values for the same keys + after: a dictionary, if not None, the after field from the kafka message must + have the same values for the same keys + """ + msg = consumer.poll() + assert msg, "Empty message" + for val in msg.values(): + r 
= json.loads(val[-1].value) + log.info(r["payload"]) + assert ts_ms < r["payload"]["ts_ms"], "Incorrect timestamp" + for param, pname in ((before, "before"), (after, "after")): + if param is not None: + for k, v in param.items(): + assert r["payload"][pname][k] == v, f"{pname} mismatches" + + +@pytest.mark.remote_cluster +def test_debezium(debezium): + """ + Test the logical replication having Debezium as a subscriber + """ + conn, consumer = debezium + cur = conn.cursor() + ts_ms = time.time() * 1000 + log.info("Insert 1 ts_ms: %s", ts_ms) + cur.execute( + "insert into inventory.customers (first_name, last_name, email) " + "values ('John', 'Dow','johndow@example.com')" + ) + conn.commit() + wait_until( + 100, + 0.5, + lambda: get_kafka_msg( + consumer, + ts_ms, + after={"first_name": "John", "last_name": "Dow", "email": "johndow@example.com"}, + ), + show_intermediate_error=True, + ) + ts_ms = time.time() * 1000 + log.info("Insert 2 ts_ms: %s", ts_ms) + cur.execute( + "insert into inventory.customers (first_name, last_name, email) " + "values ('Alex', 'Row','alexrow@example.com')" + ) + conn.commit() + wait_until( + 100, + 0.5, + lambda: get_kafka_msg( + consumer, + ts_ms, + after={"first_name": "Alex", "last_name": "Row", "email": "alexrow@example.com"}, + ), + show_intermediate_error=True, + ) + ts_ms = time.time() * 1000 + log.info("Update ts_ms: %s", ts_ms) + cur.execute("update inventory.customers set first_name = 'Alexander' where id = 2") + conn.commit() + wait_until( + 100, + 0.5, + lambda: get_kafka_msg( + consumer, + ts_ms, + after={"first_name": "Alexander"}, + ), + show_intermediate_error=True, + ) + time.sleep(3) + cur.execute("select 1") diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index 53bb29a659..4b4ffc1fee 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -100,24 +100,32 @@ def test_subscriber_lag( 
pub_connstr = benchmark_project_pub.connstr sub_connstr = benchmark_project_sub.connstr - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) + if benchmark_project_pub.is_new: + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) + if benchmark_project_sub.is_new: + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) pub_conn = psycopg2.connect(pub_connstr) sub_conn = psycopg2.connect(sub_connstr) pub_conn.autocommit = True sub_conn.autocommit = True with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: - if benchmark_project_pub.is_new: - pub_cur.execute("create publication pub1 for table pgbench_accounts, pgbench_history") + pub_cur.execute("SELECT 1 FROM pg_catalog.pg_publication WHERE pubname = 'pub1'") + pub_exists = len(pub_cur.fetchall()) != 0 - if benchmark_project_sub.is_new: + if not pub_exists: + pub_cur.execute("CREATE PUBLICATION pub1 FOR TABLE pgbench_accounts, pgbench_history") + + sub_cur.execute("SELECT 1 FROM pg_catalog.pg_subscription WHERE subname = 'sub1'") + sub_exists = len(sub_cur.fetchall()) != 0 + if not sub_exists: sub_cur.execute("truncate table pgbench_accounts") sub_cur.execute("truncate table pgbench_history") - sub_cur.execute(f"create subscription sub1 connection '{pub_connstr}' publication pub1") + sub_cur.execute(f"CREATE SUBSCRIPTION sub1 CONNECTION '{pub_connstr}' PUBLICATION pub1") initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur) + pub_conn.close() sub_conn.close() @@ -195,10 +203,15 @@ def test_publisher_restart( pub_conn.autocommit = True sub_conn.autocommit = True with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: - if benchmark_project_pub.is_new: + pub_cur.execute("SELECT 1 FROM pg_catalog.pg_publication WHERE pubname = 'pub1'") + pub_exists = len(pub_cur.fetchall()) != 0 + + if not pub_exists: pub_cur.execute("create publication pub1 for table pgbench_accounts, pgbench_history") - if 
benchmark_project_sub.is_new: + sub_cur.execute("SELECT 1 FROM pg_catalog.pg_subscription WHERE subname = 'sub1'") + sub_exists = len(sub_cur.fetchall()) != 0 + if not sub_exists: sub_cur.execute("truncate table pgbench_accounts") sub_cur.execute("truncate table pgbench_history") diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index 281c9271e9..297aedfbed 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -2,7 +2,6 @@ import concurrent.futures import random import time from collections import defaultdict -from typing import Any, Dict import pytest from fixtures.common_types import TenantId, TenantShardId, TimelineId @@ -24,51 +23,14 @@ def get_consistent_node_shard_counts(env: NeonEnv, total_shards) -> defaultdict[ This function takes into account the intersection of the intent and the observed state. If they do not match, it asserts out. 
""" - tenants = env.storage_controller.tenant_list() - - intent = dict() - observed = dict() - - tenant_placement: defaultdict[str, Dict[str, Any]] = defaultdict( - lambda: { - "observed": {"attached": None, "secondary": []}, - "intent": {"attached": None, "secondary": []}, - } - ) - - for t in tenants: - for node_id, loc_state in t["observed"]["locations"].items(): - if ( - loc_state is not None - and "conf" in loc_state - and loc_state["conf"] is not None - and loc_state["conf"]["mode"] - in set(["AttachedSingle", "AttachedMulti", "AttachedStale"]) - ): - observed[t["tenant_shard_id"]] = int(node_id) - tenant_placement[t["tenant_shard_id"]]["observed"]["attached"] = int(node_id) - - if ( - loc_state is not None - and "conf" in loc_state - and loc_state["conf"] is not None - and loc_state["conf"]["mode"] == "Secondary" - ): - tenant_placement[t["tenant_shard_id"]]["observed"]["secondary"].append(int(node_id)) - - if "attached" in t["intent"]: - intent[t["tenant_shard_id"]] = t["intent"]["attached"] - tenant_placement[t["tenant_shard_id"]]["intent"]["attached"] = t["intent"]["attached"] - - if "secondary" in t["intent"]: - tenant_placement[t["tenant_shard_id"]]["intent"]["secondary"] += t["intent"][ - "secondary" - ] - + tenant_placement = env.storage_controller.get_tenants_placement() log.info(f"{tenant_placement=}") matching = { - tid: intent[tid] for tid in observed if tid in intent and intent[tid] == observed[tid] + tid: tenant_placement[tid]["intent"]["attached"] + for tid in tenant_placement + if tenant_placement[tid]["intent"]["attached"] + == tenant_placement[tid]["observed"]["attached"] } assert len(matching) == total_shards @@ -217,7 +179,11 @@ def test_storage_controller_many_tenants( # A reconciler operation: migrate a shard. 
shard_number = rng.randint(0, shard_count - 1) tenant_shard_id = TenantShardId(tenant_id, shard_number, shard_count) - dest_ps_id = rng.choice([ps.id for ps in env.pageservers]) + + # Migrate it to its secondary location + desc = env.storage_controller.tenant_describe(tenant_id) + dest_ps_id = desc["shards"][shard_number]["node_secondary"][0] + f = executor.submit( env.storage_controller.tenant_shard_migrate, tenant_shard_id, dest_ps_id ) @@ -231,7 +197,11 @@ def test_storage_controller_many_tenants( for f in futs: f.result() - # Consistency check is safe here: all the previous operations waited for reconcile before completing + # Some of the operations above (notably migrations) might leave the controller in a state where it has + # some work to do, for example optimizing shard placement after we do a random migration. Wait for the system + # to reach a quiescent state before doing following checks. + env.storage_controller.reconcile_until_idle() + env.storage_controller.consistency_check() check_memory() diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 137b0e931d..afa5f6873c 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -496,11 +496,10 @@ def test_historic_storage_formats( # Check the scrubber handles this old data correctly (can read it and doesn't consider it corrupt) # # Do this _before_ importing to the pageserver, as that import may start writing immediately - metadata_summary = env.storage_scrubber.scan_metadata() + healthy, metadata_summary = env.storage_scrubber.scan_metadata() + assert healthy assert metadata_summary["tenant_count"] >= 1 assert metadata_summary["timeline_count"] >= 1 - assert not metadata_summary["with_errors"] - assert not metadata_summary["with_warnings"] env.neon_cli.import_tenant(dataset.tenant_id) diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 
8941ddd281..73af7950f1 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -214,12 +214,11 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): # Having written a mixture of generation-aware and legacy index_part.json, # ensure the scrubber handles the situation as expected. - metadata_summary = env.storage_scrubber.scan_metadata() + healthy, metadata_summary = env.storage_scrubber.scan_metadata() assert metadata_summary["tenant_count"] == 1 # Scrubber should have seen our timeline assert metadata_summary["timeline_count"] == 1 assert metadata_summary["timeline_shard_count"] == 1 - assert not metadata_summary["with_errors"] - assert not metadata_summary["with_warnings"] + assert healthy def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 53f69b5b26..8746b88a75 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -2,10 +2,11 @@ import json import os import random import time -from typing import Any, Dict, Optional +from pathlib import Path +from typing import Any, Dict, Optional, Union import pytest -from fixtures.common_types import TenantId, TimelineId +from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver from fixtures.pageserver.common_types import parse_layer_file_name @@ -437,6 +438,35 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): validate_heatmap(heatmap_second) +def list_elegible_layers( + pageserver, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId +) -> list[Path]: + """ + The subset of layer filenames that are elegible for secondary download: at time of writing this + is all resident layers which are also visible. 
+ """ + candidates = pageserver.list_layers(tenant_id, timeline_id) + + layer_map = pageserver.http_client().layer_map_info(tenant_id, timeline_id) + + # Map of layer filenames to their visibility the "layer name" is not the same as the filename: add suffix to resolve one to the other + visible_map = dict( + (f"{layer.layer_file_name}-v1-00000001", layer.visible) + for layer in layer_map.historic_layers + ) + + def is_visible(layer_file_name): + try: + return visible_map[str(layer_file_name)] + except KeyError: + # Unexpected: tests should call this when pageservers are in a quiet state such that the layer map + # matches what's on disk. + log.warn(f"Lookup {layer_file_name} from {list(visible_map.keys())}") + raise + + return list(c for c in candidates if is_visible(c)) + + def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): """ Test the overall data flow in secondary mode: @@ -491,7 +521,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ps_secondary.http_client().tenant_secondary_download(tenant_id) - assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers( + assert list_elegible_layers(ps_attached, tenant_id, timeline_id) == ps_secondary.list_layers( tenant_id, timeline_id ) @@ -509,9 +539,9 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ps_secondary.http_client().tenant_secondary_download(tenant_id) try: - assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers( - tenant_id, timeline_id - ) + assert list_elegible_layers( + ps_attached, tenant_id, timeline_id + ) == ps_secondary.list_layers(tenant_id, timeline_id) except: # Do a full listing of the secondary location on errors, to help debug of # https://github.com/neondatabase/neon/issues/6966 @@ -532,8 +562,8 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): # ================================================================== try: log.info("Evicting a layer...") - layer_to_evict = 
ps_attached.list_layers(tenant_id, timeline_id)[0] - some_other_layer = ps_attached.list_layers(tenant_id, timeline_id)[1] + layer_to_evict = list_elegible_layers(ps_attached, tenant_id, timeline_id)[0] + some_other_layer = list_elegible_layers(ps_attached, tenant_id, timeline_id)[1] log.info(f"Victim layer: {layer_to_evict.name}") ps_attached.http_client().evict_layer( tenant_id, timeline_id, layer_name=layer_to_evict.name @@ -551,9 +581,9 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ps_secondary.http_client().tenant_secondary_download(tenant_id) assert layer_to_evict not in ps_attached.list_layers(tenant_id, timeline_id) - assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers( - tenant_id, timeline_id - ) + assert list_elegible_layers( + ps_attached, tenant_id, timeline_id + ) == ps_secondary.list_layers(tenant_id, timeline_id) except: # On assertion failures, log some details to help with debugging heatmap = env.pageserver_remote_storage.heatmap_content(tenant_id) @@ -563,7 +593,8 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): # Scrub the remote storage # ======================== # This confirms that the scrubber isn't upset by the presence of the heatmap - env.storage_scrubber.scan_metadata() + healthy, _ = env.storage_scrubber.scan_metadata() + assert healthy # Detach secondary and delete tenant # =================================== diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 6f7ea0092a..45ce5b1c5b 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -144,7 +144,13 @@ def test_pg_regress( ) # Connect to postgres and create a database called "regression". - endpoint = env.endpoints.create_start("main") + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + # Enable the test mode, so that we don't need to patch the test cases. 
+ "neon.regress_test_mode = true", + ], + ) endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") # Create some local directories for pg_regress to run in. @@ -207,7 +213,14 @@ def test_isolation( # Connect to postgres and create a database called "regression". # isolation tests use prepared transactions, so enable them - endpoint = env.endpoints.create_start("main", config_lines=["max_prepared_transactions=100"]) + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + "max_prepared_transactions=100", + # Enable the test mode, so that we don't need to patch the test cases. + "neon.regress_test_mode = true", + ], + ) endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") # Create some local directories for pg_isolation_regress to run in. @@ -268,7 +281,13 @@ def test_sql_regress( ) # Connect to postgres and create a database called "regression". - endpoint = env.endpoints.create_start("main") + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + # Enable the test mode, so that we don't need to patch the test cases. + "neon.regress_test_mode = true", + ], + ) endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") # Create some local directories for pg_regress to run in. 
diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index 8ed44b1094..f446f4f200 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -53,25 +53,6 @@ def test_proxy_select_1(static_proxy: NeonProxy): assert out[0][0] == 42 -def test_proxy_server_params(static_proxy: NeonProxy): - """ - Test that server params are passing through to postgres - """ - - out = static_proxy.safe_psql( - "select to_json('0 seconds'::interval)", options="-c intervalstyle=iso_8601" - ) - assert out[0][0] == "PT0S" - out = static_proxy.safe_psql( - "select to_json('0 seconds'::interval)", options="-c intervalstyle=sql_standard" - ) - assert out[0][0] == "0" - out = static_proxy.safe_psql( - "select to_json('0 seconds'::interval)", options="-c intervalstyle=postgres" - ) - assert out[0][0] == "00:00:00" - - def test_password_hack(static_proxy: NeonProxy): """ Check the PasswordHack auth flow: an alternative to SCRAM auth for diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 7f30b2d7a7..1011a6fd22 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -124,7 +124,8 @@ def test_sharding_smoke( # Check the scrubber isn't confused by sharded content, then disable # it during teardown because we'll have deleted by then - env.storage_scrubber.scan_metadata() + healthy, _ = env.storage_scrubber.scan_metadata() + assert healthy env.storage_controller.pageserver_api().tenant_delete(tenant_id) assert_prefix_empty( diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index eb2cdccdb9..9b2557a165 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -17,6 +17,7 @@ from fixtures.neon_fixtures import ( PgBin, StorageControllerApiException, TokenScope, + last_flush_lsn_upload, ) from fixtures.pageserver.http import PageserverHttpClient 
from fixtures.pageserver.utils import ( @@ -1597,6 +1598,8 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): # Perform a graceful rolling restart for ps in env.pageservers: + env.storage_controller.warm_up_all_secondaries() + env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2 ) @@ -1645,6 +1648,115 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): assert_shard_counts_balanced(env, shard_counts, total_shards) +def test_skip_drain_on_secondary_lag(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + """ + Artificially make a tenant shard's secondary location lag behind the primary + and check that storage controller driven node drains skip the lagging tenant shard. + Finally, validate that the tenant shard is migrated when a new drain request comes + in and it's no longer lagging. + """ + neon_env_builder.num_pageservers = 2 + neon_env_builder.storage_controller_config = { + "max_secondary_lag_bytes": 1 * 1024 * 1024, + } + + env = neon_env_builder.init_configs() + env.start() + + tid, timeline_id = env.neon_cli.create_tenant(placement_policy='{"Attached":1}') + + # Give things a chance to settle. 
+ env.storage_controller.reconcile_until_idle(timeout_secs=30) + + locations = env.storage_controller.locate(tid) + assert len(locations) == 1 + primary: int = locations[0]["node_id"] + not_primary = [ps.id for ps in env.pageservers if ps.id != primary] + assert len(not_primary) == 1 + secondary = not_primary[0] + + log.info(f"Paused secondary downloads on {secondary}") + env.get_pageserver(secondary).http_client().configure_failpoints( + ("secondary-layer-download-pausable", "pause") + ) + + log.info(f"Ingesting some data for {tid}") + + with env.endpoints.create_start("main", tenant_id=tid) as endpoint: + run_pg_bench_small(pg_bin, endpoint.connstr()) + endpoint.safe_psql("CREATE TABLE created_foo(id integer);") + last_flush_lsn_upload(env, endpoint, tid, timeline_id) + + log.info(f"Uploading heatmap from {primary} and requesting download from {secondary}") + + env.get_pageserver(primary).http_client().tenant_heatmap_upload(tid) + env.get_pageserver(secondary).http_client().tenant_secondary_download(tid, wait_ms=100) + + def secondary_is_lagging(): + resp = env.get_pageserver(secondary).http_client().tenant_secondary_status(tid) + lag = resp["bytes_total"] - resp["bytes_downloaded"] + + if lag <= 1 * 1024 * 1024: + raise Exception(f"Secondary lag not big enough: {lag}") + + log.info(f"Looking for lag to develop on the secondary {secondary}") + wait_until(10, 1, secondary_is_lagging) + + log.info(f"Starting drain of primary {primary} with laggy secondary {secondary}") + env.storage_controller.retryable_node_operation( + lambda ps_id: env.storage_controller.node_drain(ps_id), primary, max_attempts=3, backoff=2 + ) + + env.storage_controller.poll_node_status( + primary, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.PAUSE_FOR_RESTART, + max_attempts=6, + backoff=5, + ) + + locations = env.storage_controller.locate(tid) + assert len(locations) == 1 + assert locations[0]["node_id"] == primary + + log.info(f"Unpausing secondary downloads on {secondary}") 
+ env.get_pageserver(secondary).http_client().configure_failpoints( + ("secondary-layer-download-pausable", "off") + ) + env.get_pageserver(secondary).http_client().tenant_secondary_download(tid, wait_ms=100) + + log.info(f"Waiting for lag to reduce on {secondary}") + + def lag_is_acceptable(): + resp = env.get_pageserver(secondary).http_client().tenant_secondary_status(tid) + lag = resp["bytes_total"] - resp["bytes_downloaded"] + + if lag > 1 * 1024 * 1024: + raise Exception(f"Secondary lag still too big: {lag}") + + wait_until(10, 1, lag_is_acceptable) + + env.storage_controller.node_configure(primary, {"scheduling": "Active"}) + + log.info(f"Starting drain of primary {primary} with non-laggy secondary {secondary}") + + env.storage_controller.retryable_node_operation( + lambda ps_id: env.storage_controller.node_drain(ps_id), primary, max_attempts=3, backoff=2 + ) + + env.storage_controller.poll_node_status( + primary, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.PAUSE_FOR_RESTART, + max_attempts=6, + backoff=5, + ) + + locations = env.storage_controller.locate(tid) + assert len(locations) == 1 + assert locations[0]["node_id"] == secondary + + def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_pageservers = 2 env = neon_env_builder.init_configs() @@ -1671,6 +1783,7 @@ def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder): ps_id_to_drain = env.pageservers[0].id + env.storage_controller.warm_up_all_secondaries() env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_drain(ps_id), ps_id_to_drain, diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index e3f627b6a6..388f6a9e92 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -516,9 +516,8 @@ def test_scrubber_scan_pageserver_metadata( assert len(index.layer_metadata) > 0 it =
iter(index.layer_metadata.items()) - scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True) - assert not scan_summary["with_warnings"] - assert not scan_summary["with_errors"] + healthy, scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True) + assert healthy assert env.storage_controller.metadata_health_is_healthy() @@ -532,16 +531,18 @@ def test_scrubber_scan_pageserver_metadata( log.info(f"delete response: {delete_response}") # Check scan summary without posting to storage controller. Expect it to be a L0 layer so only emit warnings. - scan_summary = env.storage_scrubber.scan_metadata() + _, scan_summary = env.storage_scrubber.scan_metadata() log.info(f"{pprint.pformat(scan_summary)}") assert len(scan_summary["with_warnings"]) > 0 assert env.storage_controller.metadata_health_is_healthy() # Now post to storage controller, expect seeing one unhealthy health record - scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True) + _, scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True) log.info(f"{pprint.pformat(scan_summary)}") assert len(scan_summary["with_warnings"]) > 0 unhealthy = env.storage_controller.metadata_health_list_unhealthy()["unhealthy_tenant_shards"] assert len(unhealthy) == 1 and unhealthy[0] == str(tenant_shard_id) + + neon_env_builder.disable_scrub_on_exit() diff --git a/test_runner/regress/test_subscriber_restart.py b/test_runner/regress/test_subscriber_restart.py index 91caad7220..4581008022 100644 --- a/test_runner/regress/test_subscriber_restart.py +++ b/test_runner/regress/test_subscriber_restart.py @@ -37,7 +37,9 @@ def test_subscriber_restart(neon_simple_env: NeonEnv): scur.execute("CREATE TABLE t (pk integer primary key, sk integer)") # scur.execute("CREATE INDEX on t(sk)") # slowdown applying WAL at replica pub_conn = f"host=localhost port={pub.pg_port} dbname=postgres user=cloud_admin" - query = f"CREATE SUBSCRIPTION sub CONNECTION 
'{pub_conn}' PUBLICATION pub" + # synchronous_commit=on to test a hypothesis for why this test has been flaky. + # XXX: Add link to the issue + query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub with (synchronous_commit=on)" scur.execute(query) time.sleep(2) # let initial table sync complete diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index c343b349cf..dadf5ca672 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -128,6 +128,8 @@ def test_tenant_delete_smoke( assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "inprogress"}) == 0 + env.pageserver.stop() + def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonEnvBuilder): """Reproduction of 2023-11-23 stuck tenants investigation""" @@ -200,11 +202,10 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE if deletion is not None: deletion.join() + env.pageserver.stop() -def test_tenant_delete_races_timeline_creation( - neon_env_builder: NeonEnvBuilder, - pg_bin: PgBin, -): + +def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder): """ Validate that timeline creation executed in parallel with deletion works correctly. 
@@ -318,6 +319,8 @@ def test_tenant_delete_races_timeline_creation( # We deleted our only tenant, and the scrubber fails if it detects nothing neon_env_builder.disable_scrub_on_exit() + env.pageserver.stop() + def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder): """ @@ -341,13 +344,13 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder) wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn) env.stop() - result = env.storage_scrubber.scan_metadata() - assert result["with_warnings"] == [] + healthy, _ = env.storage_scrubber.scan_metadata() + assert healthy env.start() ps_http = env.pageserver.http_client() ps_http.tenant_delete(tenant_id) env.stop() - env.storage_scrubber.scan_metadata() - assert result["with_warnings"] == [] + healthy, _ = env.storage_scrubber.scan_metadata() + assert healthy diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 38f8dfa885..b3767a2766 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -165,7 +165,7 @@ def test_ancestor_detach_branched_from( ) all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id) - assert all_reparented == [] + assert all_reparented == set() if restart_after: env.pageserver.stop() @@ -534,7 +534,7 @@ def test_compaction_induced_by_detaches_in_history( for _, timeline_id in skip_main: reparented = client.detach_ancestor(env.initial_tenant, timeline_id) - assert reparented == [], "we have no earlier branches at any level" + assert reparented == set(), "we have no earlier branches at any level" post_detach_l0s = list(filter(lambda x: x.l0, delta_layers(branch_timeline_id))) assert len(post_detach_l0s) == 5, "should had inherited 4 L0s, have 5 in total" @@ -774,7 +774,7 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): else: break - assert reparented == [], 
"too many retries (None) or unexpected reparentings" + assert reparented == set(), "too many retries (None) or unexpected reparentings" for shard_info in shards: node_id = int(shard_info["node_id"]) diff --git a/test_runner/regress/test_timeline_gc_blocking.py b/test_runner/regress/test_timeline_gc_blocking.py new file mode 100644 index 0000000000..24de894687 --- /dev/null +++ b/test_runner/regress/test_timeline_gc_blocking.py @@ -0,0 +1,67 @@ +import time + +from fixtures.neon_fixtures import ( + NeonEnvBuilder, +) +from fixtures.pageserver.utils import wait_timeline_detail_404 + + +def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start( + initial_tenant_conf={"gc_period": "1s", "lsn_lease_length": "0s"} + ) + ps = env.pageserver + http = ps.http_client() + + foo_branch = env.neon_cli.create_branch("foo", "main", env.initial_tenant) + + gc_active_line = ".* gc_loop.*: [12] timelines need GC" + gc_skipped_line = ".* gc_loop.*: Skipping GC: .*" + init_gc_skipped = ".*: initialized with gc blocked.*" + + tenant_before = http.tenant_status(env.initial_tenant) + + wait_for_another_gc_round() + _, offset = ps.assert_log_contains(gc_active_line) + + assert ps.log_contains(gc_skipped_line, offset) is None + + http.timeline_block_gc(env.initial_tenant, foo_branch) + + tenant_after = http.tenant_status(env.initial_tenant) + assert tenant_before != tenant_after + gc_blocking = tenant_after["gc_blocking"] + assert gc_blocking == "BlockingReasons { timelines: 1, reasons: EnumSet(Manual) }" + + wait_for_another_gc_round() + _, offset = ps.assert_log_contains(gc_skipped_line, offset) + + ps.restart() + ps.quiesce_tenants() + + _, offset = env.pageserver.assert_log_contains(init_gc_skipped, offset) + + wait_for_another_gc_round() + _, offset = ps.assert_log_contains(gc_skipped_line, offset) + + # deletion unblocks gc + http.timeline_delete(env.initial_tenant, foo_branch) + wait_timeline_detail_404(http, env.initial_tenant, 
foo_branch, 10, 1.0) + + wait_for_another_gc_round() + _, offset = ps.assert_log_contains(gc_active_line, offset) + + http.timeline_block_gc(env.initial_tenant, env.initial_timeline) + + wait_for_another_gc_round() + _, offset = ps.assert_log_contains(gc_skipped_line, offset) + + # removing the manual block also unblocks gc + http.timeline_unblock_gc(env.initial_tenant, env.initial_timeline) + + wait_for_another_gc_round() + _, offset = ps.assert_log_contains(gc_active_line, offset) + + +def wait_for_another_gc_round(): + time.sleep(2) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 5e9a42f6b4..1f220eec9e 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -936,6 +936,9 @@ def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder): tenant_id = env.initial_tenant timeline_id = env.initial_timeline + # just make sure this doesn't hit an assertion + client.timeline_detail(tenant_id, timeline_id, force_await_initial_logical_size=True) + # load in some data endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) endpoint.safe_psql_many( diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index dbd0e6428b..7bbe834c8c 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit dbd0e6428b9274d72a10ac29bd3e3162faf109d4 +Subproject commit 7bbe834c8c2dc37802eca8484311599bc47341f6 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 035b73a9c5..9eba7dd382 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 035b73a9c5998f9a0ef35cc8df1bae680bf770fc +Subproject commit 9eba7dd382606ffca43aca865f337ec21bcdac73 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index b39f316137..5377f5ed72 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit b39f316137fdd29e2da15d2af2fdd1cfd18163be +Subproject commit 
5377f5ed7290af45b7cb6b0d98d43cbf4a4e77f3 diff --git a/vendor/revisions.json b/vendor/revisions.json index eeebd646f5..570dfc1550 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "v16": ["16.3", "b39f316137fdd29e2da15d2af2fdd1cfd18163be"], - "v15": ["15.7", "035b73a9c5998f9a0ef35cc8df1bae680bf770fc"], - "v14": ["14.12", "dbd0e6428b9274d72a10ac29bd3e3162faf109d4"] + "v16": ["16.3", "5377f5ed7290af45b7cb6b0d98d43cbf4a4e77f3"], + "v15": ["15.7", "9eba7dd382606ffca43aca865f337ec21bcdac73"], + "v14": ["14.12", "7bbe834c8c2dc37802eca8484311599bc47341f6"] } diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 7d005c7139..41d6e11725 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -416,7 +416,7 @@ build: | # libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-monitor # requires cgroup v2, so we'll build cgroup-tools ourselves. FROM debian:bullseye-slim as libcgroup-builder - ENV LIBCGROUP_VERSION v2.0.3 + ENV LIBCGROUP_VERSION=v2.0.3 RUN set -exu \ && apt update \ @@ -460,7 +460,7 @@ build: | pkg-config # Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc) - ENV PGBOUNCER_TAG pgbouncer_1_22_1 + ENV PGBOUNCER_TAG=pgbouncer_1_22_1 RUN set -e \ && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \ && cd pgbouncer \