diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 1e6c2d0aa2..667ff7f92e 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -32,3 +32,4 @@ config-variables: - NEON_DEV_AWS_ACCOUNT_ID - NEON_PROD_AWS_ACCOUNT_ID - AWS_ECR_REGION + - BENCHMARK_LARGE_OLTP_PROJECTID diff --git a/.github/actions/neon-branch-create/action.yml b/.github/actions/neon-branch-create/action.yml index 9f752d5a89..71dd6f3af2 100644 --- a/.github/actions/neon-branch-create/action.yml +++ b/.github/actions/neon-branch-create/action.yml @@ -84,7 +84,13 @@ runs: --header "Authorization: Bearer ${API_KEY}" ) - role_name=$(echo $roles | jq --raw-output '.roles[] | select(.protected == false) | .name') + role_name=$(echo "$roles" | jq --raw-output ' + (.roles | map(select(.protected == false))) as $roles | + if any($roles[]; .name == "neondb_owner") + then "neondb_owner" + else $roles[0].name + end + ') echo "role_name=${role_name}" >> $GITHUB_OUTPUT env: API_HOST: ${{ inputs.api_host }} @@ -107,13 +113,13 @@ runs: ) if [ -z "${reset_password}" ]; then - sleep 1 + sleep $i continue fi password=$(echo $reset_password | jq --raw-output '.role.password') if [ "${password}" == "null" ]; then - sleep 1 + sleep $i # increasing backoff continue fi diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 122fe48b68..fa6f882161 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -44,6 +44,11 @@ inputs: description: 'Postgres version to use for tests' required: false default: 'v16' + sanitizers: + description: 'enabled or disabled' + required: false + default: 'disabled' + type: string benchmark_durations: description: 'benchmark durations JSON' required: false @@ -59,7 +64,7 @@ runs: if: inputs.build_type != 'remote' uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact + name: neon-${{ 
runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}${{ inputs.sanitizers == 'enabled' && '-sanitized' || '' }}-artifact path: /tmp/neon aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} @@ -112,6 +117,7 @@ runs: ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage') RERUN_FAILED: ${{ inputs.rerun_failed }} PG_VERSION: ${{ inputs.pg_version }} + SANITIZERS: ${{ inputs.sanitizers }} shell: bash -euxo pipefail {0} run: | # PLATFORM will be embedded in the perf test report diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 30fde127b0..6a2070424a 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -280,7 +280,7 @@ jobs: - name: Upload Neon artifact uses: ./.github/actions/upload with: - name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}${{ inputs.sanitizers == 'enabled' && '-sanitized' || '' }}-artifact path: /tmp/neon aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} @@ -347,6 +347,7 @@ jobs: real_s3_region: eu-central-1 rerun_failed: true pg_version: ${{ matrix.pg_version }} + sanitizers: ${{ inputs.sanitizers }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} # `--session-timeout` is equal to (timeout-minutes - 10 minutes) * 60 seconds. 
# Attempt to stop tests gracefully to generate test reports @@ -359,7 +360,6 @@ jobs: PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }} - SANITIZERS: ${{ inputs.sanitizers }} # Temporary disable this step until we figure out why it's so flaky # Ref https://github.com/neondatabase/neon/issues/4540 diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index ffb6c65af9..ff7db02e42 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -141,6 +141,8 @@ jobs: --ignore test_runner/performance/test_physical_replication.py --ignore test_runner/performance/test_perf_ingest_using_pgcopydb.py --ignore test_runner/performance/test_cumulative_statistics_persistence.py + --ignore test_runner/performance/test_perf_many_relations.py + --ignore test_runner/performance/test_perf_oltp_large_tenant.py env: BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 639c258c5c..66758ca49f 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -692,15 +692,15 @@ jobs: neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - vm-compute-node-image: + vm-compute-node-image-arch: needs: [ check-permissions, meta, compute-node-image ] if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} - runs-on: [ self-hosted, large ] + runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} strategy: 
fail-fast: false matrix: + arch: [ amd64, arm64 ] version: - # see the comment for `compute-node-image-arch` job - pg: v14 debian: bullseye - pg: v15 @@ -717,7 +717,7 @@ jobs: - name: Downloading vm-builder run: | - curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder-amd64 -o vm-builder + curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder-${{ matrix.arch }} -o vm-builder chmod +x vm-builder - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193 @@ -738,12 +738,37 @@ jobs: -size=2G \ -spec=compute/vm-image-spec-${{ matrix.version.debian }}.yaml \ -src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ - -dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ - -target-arch=linux/amd64 + -dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.arch }} \ + -target-arch=linux/${{ matrix.arch }} - name: Pushing vm-compute-node image run: | - docker push neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} + docker push neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.arch }} + + vm-compute-node-image: + needs: [ vm-compute-node-image-arch, meta ] + if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} + runs-on: ubuntu-22.04 + strategy: + matrix: + version: + # see the comment for `compute-node-image-arch` job + - pg: v14 + - pg: v15 + - pg: v16 + - pg: v17 + steps: + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + + - name: Create multi-arch compute-node image + run: | + docker buildx imagetools create -t neondatabase/vm-compute-node-${{ matrix.version.pg 
}}:${{ needs.meta.outputs.build-tag }} \ + neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-amd64 \ + neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-arm64 + test-images: needs: [ check-permissions, meta, neon-image, compute-node-image ] @@ -831,7 +856,7 @@ jobs: || needs.meta.outputs.run-kind == 'pr' && needs.meta.outputs.build-tag || needs.meta.outputs.run-kind == 'compute-rc-pr' && needs.meta.outputs.previous-storage-release }} - TEST_EXTENSIONS_TAG: latest + TEST_EXTENSIONS_TAG: ${{ needs.meta.outputs.previous-compute-release }} NEW_COMPUTE_TAG: ${{ needs.meta.outputs.build-tag }} OLD_COMPUTE_TAG: ${{ needs.meta.outputs.previous-compute-release }} run: ./docker-compose/test_extensions_upgrade.sh @@ -1036,7 +1061,7 @@ jobs: exit 1 deploy: - needs: [ check-permissions, push-neon-image-prod, push-compute-image-prod, meta, build-and-test-locally, trigger-custom-extensions-build-and-wait ] + needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, push-neon-image-prod, push-compute-image-prod, meta, build-and-test-locally, trigger-custom-extensions-build-and-wait ] # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-neon-image-prod` and `push-compute-image-prod` if: ${{ contains(fromJSON('["push-main", "storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind) && !failure() && !cancelled() }} permissions: diff --git a/.github/workflows/force-test-extensions-upgrade.yml b/.github/workflows/force-test-extensions-upgrade.yml index 71c5158ef6..f2376306dc 100644 --- a/.github/workflows/force-test-extensions-upgrade.yml +++ b/.github/workflows/force-test-extensions-upgrade.yml @@ -52,8 +52,9 @@ jobs: - name: Test extension upgrade timeout-minutes: 20 env: - NEWTAG: latest - OLDTAG: ${{ steps.get-last-compute-release-tag.outputs.tag }} + NEW_COMPUTE_TAG: latest + OLD_COMPUTE_TAG: ${{ 
steps.get-last-compute-release-tag.outputs.tag }} + TEST_EXTENSIONS_TAG: ${{ steps.get-last-compute-release-tag.outputs.tag }} PG_VERSION: ${{ matrix.pg-version }} FORCE_ALL_UPGRADE_TESTS: true run: ./docker-compose/test_extensions_upgrade.sh diff --git a/.github/workflows/large_oltp_benchmark.yml b/.github/workflows/large_oltp_benchmark.yml new file mode 100644 index 0000000000..f33e11cd08 --- /dev/null +++ b/.github/workflows/large_oltp_benchmark.yml @@ -0,0 +1,147 @@ +name: large oltp benchmark + +on: + # uncomment to run on push for debugging your PR + push: + branches: [ bodobolero/synthetic_oltp_workload ] + + schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '0 15 * * *' # run once a day, timezone is utc, avoid conflict with other benchmarks + workflow_dispatch: # adds ability to run this manually + +defaults: + run: + shell: bash -euxo pipefail {0} + +concurrency: + # Allow only one workflow globally because we need dedicated resources which only exist once + group: large-oltp-bench-workflow + cancel-in-progress: true + +jobs: + oltp: + strategy: + fail-fast: false # allow other variants to continue even if one fails + matrix: + include: + - target: new_branch + custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4 + - target: reuse_branch + custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4 + max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results + permissions: + contents: write + statuses: write + id-token: write # aws-actions/configure-aws-credentials + env: + TEST_PG_BENCH_DURATIONS_MATRIX: "1h" # todo update to > 1 h + TEST_PGBENCH_CUSTOM_SCRIPTS: 
${{ matrix.custom_scripts }} + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + PG_VERSION: 16 # pre-determined by pre-determined project + TEST_OUTPUT: /tmp/test_output + BUILD_TYPE: remote + SAVE_PERF_REPORT: ${{ github.ref_name == 'main' }} + PLATFORM: ${{ matrix.target }} + + runs-on: [ self-hosted, us-east-2, x64 ] + container: + image: neondatabase/build-tools:pinned-bookworm + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + + # Increase timeout to 8h, default timeout is 6h + timeout-minutes: 480 + + steps: + - uses: actions/checkout@v4 + + - name: Configure AWS credentials # necessary to download artefacts + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + + - name: Create Neon Branch for large tenant + if: ${{ matrix.target == 'new_branch' }} + id: create-neon-branch-oltp-target + uses: ./.github/actions/neon-branch-create + with: + project_id: ${{ vars.BENCHMARK_LARGE_OLTP_PROJECTID }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Set up Connection String + id: set-up-connstr + run: | + case "${{ matrix.target }}" in + new_branch) + CONNSTR=${{ steps.create-neon-branch-oltp-target.outputs.dsn }} + ;; + reuse_branch) + CONNSTR=${{ secrets.BENCHMARK_LARGE_OLTP_REUSE_CONNSTR }} + ;; + *) + echo >&2 "Unknown target=${{ matrix.target }}" + exit 1 + ;; + esac + + echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT + + - name: Benchmark pgbench with custom-scripts + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: 
performance + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster --timeout 21600 -k test_perf_oltp_large_tenant + pg_version: ${{ env.PG_VERSION }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + + - name: Delete Neon Branch for large tenant + if: ${{ always() && matrix.target == 'new_branch' }} + uses: ./.github/actions/neon-branch-delete + with: + project_id: ${{ vars.BENCHMARK_LARGE_OLTP_PROJECTID }} + branch_id: ${{ steps.create-neon-branch-oltp-target.outputs.branch_id }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Create Allure report + id: create-allure-report + if: ${{ !cancelled() }} + uses: ./.github/actions/allure-report-generate + with: + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + + - name: Post to a Slack channel + if: ${{ github.event.schedule && failure() }} + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream + slack-message: | + Periodic large oltp perf testing: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + <${{ steps.create-allure-report.outputs.report-url }}|Allure report> + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml index af877029e4..f854bf3212 100644 --- a/.github/workflows/periodic_pagebench.yml +++ b/.github/workflows/periodic_pagebench.yml @@ -3,12 +3,12 @@ name: Periodic pagebench performance test on dedicated EC2 machine in eu-central on: schedule: # * is a special character in YAML so you have to quote this string - # ┌───────────── minute (0 - 59) - # │ ┌───────────── hour (0 - 23) - # │ │ ┌───────────── day of the month 
(1 - 31) - # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) - # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - - cron: '0 18 * * *' # Runs at 6 PM UTC every day + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '0 */3 * * *' # Runs every 3 hours workflow_dispatch: # Allows manual triggering of the workflow inputs: commit_hash: @@ -78,8 +78,10 @@ jobs: run: | if [ -z "$INPUT_COMMIT_HASH" ]; then echo "COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')" >> $GITHUB_ENV + echo "COMMIT_HASH_TYPE=latest" >> $GITHUB_ENV else echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV + echo "COMMIT_HASH_TYPE=manual" >> $GITHUB_ENV fi - name: Start Bench with run_id @@ -89,7 +91,7 @@ jobs: -H 'accept: application/json' \ -H 'Content-Type: application/json' \ -H "Authorization: Bearer $API_KEY" \ - -d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\"}" + -d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\", \"neonRepoCommitHashType\": \"${COMMIT_HASH_TYPE}\"}" - name: Poll Test Status id: poll_step diff --git a/CODEOWNERS b/CODEOWNERS index 71b5e65f94..ab6d2257a4 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,8 +1,9 @@ # Autoscaling /libs/vm_monitor/ @neondatabase/autoscaling -# DevProd -/.github/ @neondatabase/developer-productivity +# DevProd & PerfCorr +/.github/ @neondatabase/developer-productivity @neondatabase/performance-correctness +/test_runner/ @neondatabase/performance-correctness # Compute /pgxn/ @neondatabase/compute diff --git a/Cargo.lock b/Cargo.lock index 293ed465ff..d3b09fa360 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -783,6 +783,28 @@ dependencies = [ "tracing", ] +[[package]] +name = "axum-extra" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"460fc6f625a1f7705c6cf62d0d070794e94668988b1c38111baeec177c715f7b" +dependencies = [ + "axum", + "axum-core", + "bytes", + "futures-util", + "headers", + "http 1.1.0", + "http-body 1.0.0", + "http-body-util", + "mime", + "pin-project-lite", + "serde", + "tower 0.5.2", + "tower-layer", + "tower-service", +] + [[package]] name = "azure_core" version = "0.21.0" @@ -925,9 +947,9 @@ checksum = "0ea22880d78093b0cbe17c89f64a7d457941e65759157ec6cb31a31d652b05e5" [[package]] name = "base64" -version = "0.21.1" +version = "0.21.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f1e31e207a6b8fb791a38ea3105e6cb541f55e4d029902d3039a4ad07cc4105" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" [[package]] name = "base64" @@ -1105,9 +1127,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.1.30" +version = "1.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b16803a61b81d9eabb7eae2588776c4c1e584b738ede45fdbb4c972cec1e9945" +checksum = "be714c154be609ec7f5dad223a33bf1482fff90472de28f7362806e6d4832b8c" dependencies = [ "jobserver", "libc", @@ -1305,6 +1327,7 @@ dependencies = [ "aws-sdk-s3", "aws-smithy-types", "axum", + "axum-extra", "base64 0.13.1", "bytes", "camino", @@ -1316,6 +1339,7 @@ dependencies = [ "flate2", "futures", "http 1.1.0", + "jsonwebtoken", "metrics", "nix 0.27.1", "notify", @@ -2297,7 +2321,7 @@ name = "framed-websockets" version = "0.1.0" source = "git+https://github.com/neondatabase/framed-websockets#34eff3d6f8cfccbc5f35e4f65314ff7328621127" dependencies = [ - "base64 0.21.1", + "base64 0.21.7", "bytemuck", "bytes", "futures-core", @@ -2410,9 +2434,9 @@ checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" [[package]] name = "futures-timer" -version = "3.0.2" +version = "3.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c" +checksum = "f288b0a4f20f9a56b5d1da57e2227c661b7b16168e2f72365f57b63326e29b24" [[package]] name = "futures-util" @@ -2515,6 +2539,27 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" +[[package]] +name = "governor" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "842dc78579ce01e6a1576ad896edc92fca002dd60c9c3746b7fc2bec6fb429d0" +dependencies = [ + "cfg-if", + "dashmap 6.1.0", + "futures-sink", + "futures-timer", + "futures-util", + "no-std-compat", + "nonzero_ext", + "parking_lot 0.12.1", + "portable-atomic", + "quanta", + "rand 0.8.5", + "smallvec", + "spinning_top", +] + [[package]] name = "group" version = "0.12.1" @@ -2632,7 +2677,7 @@ version = "7.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "765c9198f173dd59ce26ff9f95ef0aafd0a0fe01fb9d72841bc5066a4c06511d" dependencies = [ - "base64 0.21.1", + "base64 0.21.7", "byteorder", "crossbeam-channel", "flate2", @@ -2640,6 +2685,30 @@ dependencies = [ "num-traits", ] +[[package]] +name = "headers" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "322106e6bd0cba2d5ead589ddb8150a13d7c4217cf80d7c4f682ca994ccc6aa9" +dependencies = [ + "base64 0.21.7", + "bytes", + "headers-core", + "http 1.1.0", + "httpdate", + "mime", + "sha1", +] + +[[package]] +name = "headers-core" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54b4a22553d4242c49fddb9ba998a99962b5cc6f22cb5a3482bec22522403ce4" +dependencies = [ + "http 1.1.0", +] + [[package]] name = "heck" version = "0.5.0" @@ -2777,12 +2846,9 @@ name = "http-utils" version = "0.1.0" dependencies = [ "anyhow", - "backtrace", "bytes", "fail", - "flate2", "hyper 0.14.30", - "inferno 0.12.0", "itertools 0.10.5", "jemalloc_pprof", 
"metrics", @@ -3281,9 +3347,9 @@ checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" [[package]] name = "jemalloc_pprof" -version = "0.6.0" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a883828bd6a4b957cd9f618886ff19e5f3ebd34e06ba0e855849e049fef32fb" +checksum = "5622af6d21ff86ed7797ef98e11b8f302da25ec69a7db9f6cde8e2e1c8df9992" dependencies = [ "anyhow", "libc", @@ -3367,7 +3433,7 @@ version = "9.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c7ea04a7c5c055c175f189b6dc6ba036fd62306b58c66c9f6389036c503a3f4" dependencies = [ - "base64 0.21.1", + "base64 0.21.7", "js-sys", "pem", "ring", @@ -3482,9 +3548,9 @@ dependencies = [ [[package]] name = "mappings" -version = "0.6.0" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce9229c438fbf1c333926e2053c4c091feabbd40a1b590ec62710fea2384af9e" +checksum = "e434981a332777c2b3062652d16a55f8e74fa78e6b1882633f0d77399c84fc2a" dependencies = [ "anyhow", "libc", @@ -3725,6 +3791,12 @@ dependencies = [ "memoffset 0.9.0", ] +[[package]] +name = "no-std-compat" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b93853da6d84c2e3c7d730d6473e8817692dd89be387eb01b94d7f108ecb5b8c" + [[package]] name = "nom" version = "7.1.3" @@ -3735,6 +3807,12 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "nonzero_ext" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38bf9645c8b145698bb0b18a4637dcacbc421ea49bef2317e4fd8065a387cf21" + [[package]] name = "notify" version = "8.0.0" @@ -4225,6 +4303,7 @@ dependencies = [ "tracing", "url", "utils", + "uuid", "wal_decoder", "walkdir", "workspace_hack", @@ -4307,9 +4386,9 @@ dependencies = [ [[package]] name = "papaya" -version = "0.1.8" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"dc7c76487f7eaa00a0fc1d7f88dc6b295aec478d11b0fc79f857b62c2874124c" +checksum = "aab21828b6b5952fdadd6c377728ffae53ec3a21b2febc47319ab65741f7e2fd" dependencies = [ "equivalent", "seize", @@ -4437,7 +4516,7 @@ version = "3.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b8fcc794035347fb64beda2d3b462595dd2753e3f268d89c5aae77e8cf2c310" dependencies = [ - "base64 0.21.1", + "base64 0.21.7", "serde", ] @@ -4591,6 +4670,12 @@ dependencies = [ "never-say-never", ] +[[package]] +name = "portable-atomic" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6" + [[package]] name = "postgres" version = "0.19.7" @@ -4755,12 +4840,14 @@ dependencies = [ [[package]] name = "pprof_util" -version = "0.6.0" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65c568b3f8c1c37886ae07459b1946249e725c315306b03be5632f84c239f781" +checksum = "9fa015c78eed2130951e22c58d2095849391e73817ab2e74f71b0b9f63dd8416" dependencies = [ "anyhow", + "backtrace", "flate2", + "inferno 0.12.0", "num", "paste", "prost", @@ -5052,6 +5139,21 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "quanta" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3bd1fe6824cea6538803de3ff1bc0cf3949024db3d43c9643024bfb33a807c0e" +dependencies = [ + "crossbeam-utils", + "libc", + "once_cell", + "raw-cpuid", + "wasi 0.11.0+wasi-snapshot-preview1", + "web-sys", + "winapi", +] + [[package]] name = "quick-xml" version = "0.26.0" @@ -5182,6 +5284,15 @@ dependencies = [ "num-traits", ] +[[package]] +name = "raw-cpuid" +version = "11.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6928fa44c097620b706542d428957635951bade7143269085389d42c8a4927e" +dependencies = [ + "bitflags 2.8.0", +] + [[package]] name = "rayon" version = "1.7.0" @@ -5516,16 +5627,16 @@ 
dependencies = [ [[package]] name = "ring" -version = "0.17.6" +version = "0.17.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "684d5e6e18f669ccebf64a92236bb7db9a34f07be010e3627368182027180866" +checksum = "70ac5d832aa16abd7d1def883a8545280c20a60f523a370aa3a9617c2b8550ee" dependencies = [ "cc", + "cfg-if", "getrandom 0.2.11", "libc", - "spin", "untrusted", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] @@ -5752,7 +5863,7 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d194b56d58803a43635bdc398cd17e383d6f71f9182b9a192c127ca42494a59b" dependencies = [ - "base64 0.21.1", + "base64 0.21.7", ] [[package]] @@ -5761,7 +5872,7 @@ version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f48172685e6ff52a556baa527774f61fcaa884f59daf3375c62a3f1cd2549dab" dependencies = [ - "base64 0.21.1", + "base64 0.21.7", "rustls-pki-types", ] @@ -6000,9 +6111,9 @@ dependencies = [ [[package]] name = "seize" -version = "0.4.9" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d84b0c858bdd30cb56f5597f8b3bf702ec23829e652cc636a1e5a7b9de46ae93" +checksum = "e4b8d813387d566f627f3ea1b914c068aac94c40ae27ec43f5f33bde65abefe7" dependencies = [ "libc", "windows-sys 0.52.0", @@ -6395,6 +6506,15 @@ version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +[[package]] +name = "spinning_top" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d96d2d1d716fb500937168cc09353ffdc7a012be8475ac7308e1bdf0e3923300" +dependencies = [ + "lock_api", +] + [[package]] name = "spki" version = "0.6.0" @@ -6471,6 +6591,7 @@ dependencies = [ "diesel_migrations", "fail", "futures", + "governor", "hex", "http-utils", "humantime", @@ -7285,10 +7406,12 @@ version = "0.6.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "403fa3b783d4b626a8ad51d766ab03cb6d2dbfc46b1c5d4448395e6628dc9697" dependencies = [ + "base64 0.22.1", "bitflags 2.8.0", "bytes", "http 1.1.0", "http-body 1.0.0", + "mime", "pin-project-lite", "tower-layer", "tower-service", @@ -7642,7 +7765,6 @@ dependencies = [ "anyhow", "arc-swap", "async-compression", - "backtrace", "bincode", "byteorder", "bytes", @@ -8196,7 +8318,7 @@ dependencies = [ "ahash", "anyhow", "base64 0.13.1", - "base64 0.21.1", + "base64 0.21.7", "base64ct", "bytes", "camino", diff --git a/Cargo.toml b/Cargo.toml index ff45d46a47..d11fe4f449 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -53,7 +53,6 @@ anyhow = { version = "1.0", features = ["backtrace"] } arc-swap = "1.6" async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] } atomic-take = "1.1.0" -backtrace = "0.3.74" flate2 = "1.0.26" assert-json-diff = "2" async-stream = "0.3" @@ -68,6 +67,7 @@ aws-credential-types = "1.2.0" aws-sigv4 = { version = "1.2", features = ["sign-http"] } aws-types = "1.3" axum = { version = "0.8.1", features = ["ws"] } +axum-extra = { version = "0.10.0", features = ["typed-header"] } base64 = "0.13.0" bincode = "1.3" bindgen = "0.71" @@ -95,6 +95,7 @@ futures = "0.3" futures-core = "0.3" futures-util = "0.3" git-version = "0.3" +governor = "0.8" hashbrown = "0.14" hashlink = "0.9.1" hdrhistogram = "7.5.2" @@ -113,11 +114,10 @@ hyper-util = "0.1" tokio-tungstenite = "0.21.0" indexmap = "2" indoc = "2" -inferno = "0.12.0" ipnet = "2.10.0" itertools = "0.10" itoa = "1.0.11" -jemalloc_pprof = "0.6" +jemalloc_pprof = { version = "0.7", features = ["symbolize", "flamegraph"] } jsonwebtoken = "9" lasso = "0.7" libc = "0.2" @@ -192,7 +192,7 @@ toml = "0.8" toml_edit = "0.22" tonic = {version = "0.12.3", default-features = false, features = ["channel", "tls", "tls-roots"]} tower = { version = "0.5.2", default-features = false } -tower-http = { version = "0.6.2", features = ["request-id", 
"trace"] } +tower-http = { version = "0.6.2", features = ["auth", "request-id", "trace"] } # This revision uses opentelemetry 0.27. There's no tag for it. tower-otel = { git = "https://github.com/mattiapenati/tower-otel", rev = "56a7321053bcb72443888257b622ba0d43a11fcd" } diff --git a/Makefile b/Makefile index 42ee643bb5..0911465fb8 100644 --- a/Makefile +++ b/Makefile @@ -11,15 +11,16 @@ ICU_PREFIX_DIR := /usr/local/icu # BUILD_TYPE ?= debug WITH_SANITIZERS ?= no +PG_CFLAGS = -fsigned-char ifeq ($(BUILD_TYPE),release) PG_CONFIGURE_OPTS = --enable-debug --with-openssl - PG_CFLAGS = -O2 -g3 $(CFLAGS) + PG_CFLAGS += -O2 -g3 $(CFLAGS) PG_LDFLAGS = $(LDFLAGS) # Unfortunately, `--profile=...` is a nightly feature CARGO_BUILD_FLAGS += --release else ifeq ($(BUILD_TYPE),debug) PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend - PG_CFLAGS = -O0 -g3 $(CFLAGS) + PG_CFLAGS += -O0 -g3 $(CFLAGS) PG_LDFLAGS = $(LDFLAGS) else $(error Bad build type '$(BUILD_TYPE)', see Makefile for options) @@ -159,6 +160,8 @@ postgres-%: postgres-configure-% \ $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_visibility install +@echo "Compiling pageinspect $*" $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install + +@echo "Compiling pg_trgm $*" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_trgm install +@echo "Compiling amcheck $*" $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install +@echo "Compiling test_decoding $*" diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 0cdb44853f..6e46185e36 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -162,7 +162,7 @@ FROM build-deps AS pg-build ARG PG_VERSION COPY vendor/postgres-${PG_VERSION:?} postgres RUN cd postgres && \ - export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp \ + export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3 -fsigned-char' --enable-debug 
--with-openssl --with-uuid=ossp \ --with-icu --with-libxml --with-libxslt --with-lz4" && \ if [ "${PG_VERSION:?}" != "v14" ]; then \ # zstd is available only from PG15 @@ -1484,7 +1484,7 @@ WORKDIR /ext-src COPY compute/patches/pg_duckdb_v031.patch . COPY compute/patches/duckdb_v120.patch . # pg_duckdb build requires source dir to be a git repo to get submodules -# allow neon_superuser to execute some functions that in pg_duckdb are available to superuser only: +# allow neon_superuser to execute some functions that in pg_duckdb are available to superuser only: # - extension management function duckdb.install_extension() # - access to duckdb.extensions table and its sequence RUN git clone --depth 1 --branch v0.3.1 https://github.com/duckdb/pg_duckdb.git pg_duckdb-src && \ @@ -1499,8 +1499,8 @@ ARG PG_VERSION COPY --from=pg_duckdb-src /ext-src/ /ext-src/ WORKDIR /ext-src/pg_duckdb-src RUN make install -j $(getconf _NPROCESSORS_ONLN) && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_duckdb.control - + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_duckdb.control + ######################################################################################### # # Layer "pg_repack" @@ -1758,15 +1758,15 @@ ARG TARGETARCH # test_runner/regress/test_compute_metrics.py # See comment on the top of the file regading `echo`, `-e` and `\n` RUN if [ "$TARGETARCH" = "amd64" ]; then\ - postgres_exporter_sha256='027e75dda7af621237ff8f5ac66b78a40b0093595f06768612b92b1374bd3105';\ + postgres_exporter_sha256='59aa4a7bb0f7d361f5e05732f5ed8c03cc08f78449cef5856eadec33a627694b';\ pgbouncer_exporter_sha256='c9f7cf8dcff44f0472057e9bf52613d93f3ffbc381ad7547a959daa63c5e84ac';\ sql_exporter_sha256='38e439732bbf6e28ca4a94d7bc3686d3fa1abdb0050773d5617a9efdb9e64d08';\ else\ - postgres_exporter_sha256='131a376d25778ff9701a4c81f703f179e0b58db5c2c496e66fa43f8179484786';\ + postgres_exporter_sha256='d1dedea97f56c6d965837bfd1fbb3e35a3b4a4556f8cccee8bd513d8ee086124';\ 
pgbouncer_exporter_sha256='217c4afd7e6492ae904055bc14fe603552cf9bac458c063407e991d68c519da3';\ sql_exporter_sha256='11918b00be6e2c3a67564adfdb2414fdcbb15a5db76ea17d1d1a944237a893c6';\ fi\ - && curl -sL https://github.com/prometheus-community/postgres_exporter/releases/download/v0.16.0/postgres_exporter-0.16.0.linux-${TARGETARCH}.tar.gz\ + && curl -sL https://github.com/prometheus-community/postgres_exporter/releases/download/v0.17.1/postgres_exporter-0.17.1.linux-${TARGETARCH}.tar.gz\ | tar xzf - --strip-components=1 -C.\ && curl -sL https://github.com/prometheus-community/pgbouncer_exporter/releases/download/v0.10.2/pgbouncer_exporter-0.10.2.linux-${TARGETARCH}.tar.gz\ | tar xzf - --strip-components=1 -C.\ @@ -1933,6 +1933,7 @@ RUN apt update && \ locales \ procps \ ca-certificates \ + rsyslog \ $VERSION_INSTALLS && \ apt clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 @@ -1978,6 +1979,13 @@ COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neo # Make the libraries we built available RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig +# rsyslog config permissions +# directory for rsyslogd pid file +RUN mkdir /var/run/rsyslogd && \ + chown -R postgres:postgres /var/run/rsyslogd && \ + chown -R postgres:postgres /etc/rsyslog.d/ + + ENV LANG=en_US.utf8 USER postgres ENTRYPOINT ["/usr/local/bin/compute_ctl"] diff --git a/compute/etc/neon_collector.jsonnet b/compute/etc/neon_collector.jsonnet index f8f4cab63b..da2b86d542 100644 --- a/compute/etc/neon_collector.jsonnet +++ b/compute/etc/neon_collector.jsonnet @@ -29,6 +29,7 @@ import 'sql_exporter/lfc_approximate_working_set_size.libsonnet', import 'sql_exporter/lfc_approximate_working_set_size_windows.libsonnet', import 'sql_exporter/lfc_cache_size_limit.libsonnet', + import 'sql_exporter/lfc_chunk_size.libsonnet', import 'sql_exporter/lfc_hits.libsonnet', import 
'sql_exporter/lfc_misses.libsonnet', import 'sql_exporter/lfc_used.libsonnet', diff --git a/compute/etc/sql_exporter/db_total_size.sql b/compute/etc/sql_exporter/db_total_size.sql index 9cbbdfd8a3..fe0360ab5c 100644 --- a/compute/etc/sql_exporter/db_total_size.sql +++ b/compute/etc/sql_exporter/db_total_size.sql @@ -1 +1,5 @@ -SELECT sum(pg_database_size(datname)) AS total FROM pg_database; +SELECT sum(pg_database_size(datname)) AS total +FROM pg_database +-- Ignore invalid databases, as we will likely have problems with +-- getting their size from the Pageserver. +WHERE datconnlimit != -2; diff --git a/compute/etc/sql_exporter/lfc_chunk_size.libsonnet b/compute/etc/sql_exporter/lfc_chunk_size.libsonnet new file mode 100644 index 0000000000..bbe56f869f --- /dev/null +++ b/compute/etc/sql_exporter/lfc_chunk_size.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_chunk_size', + type: 'gauge', + help: 'LFC chunk size, measured in 8KiB pages', + key_labels: null, + values: [ + 'lfc_chunk_size_pages', + ], + query: importstr 'sql_exporter/lfc_chunk_size.sql', +} diff --git a/compute/etc/sql_exporter/lfc_chunk_size.sql b/compute/etc/sql_exporter/lfc_chunk_size.sql new file mode 100644 index 0000000000..0905870064 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_chunk_size.sql @@ -0,0 +1 @@ +SELECT lfc_value AS lfc_chunk_size_pages FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_chunk_size_pages'; diff --git a/compute/etc/sql_exporter/pg_stats_userdb.sql b/compute/etc/sql_exporter/pg_stats_userdb.sql index 00ada87370..12e6c4ae59 100644 --- a/compute/etc/sql_exporter/pg_stats_userdb.sql +++ b/compute/etc/sql_exporter/pg_stats_userdb.sql @@ -1,10 +1,20 @@ -- We export stats for 10 non-system databases. Without this limit it is too -- easy to abuse the system by creating lots of databases. 
-SELECT pg_database_size(datname) AS db_size, deadlocks, tup_inserted AS inserted, - tup_updated AS updated, tup_deleted AS deleted, datname +SELECT pg_database_size(datname) AS db_size, + deadlocks, + tup_inserted AS inserted, + tup_updated AS updated, + tup_deleted AS deleted, + datname FROM pg_stat_database WHERE datname IN ( SELECT datname FROM pg_database - WHERE datname <> 'postgres' AND NOT datistemplate ORDER BY oid LIMIT 10 + -- Ignore invalid databases, as we will likely have problems with + -- getting their size from the Pageserver. + WHERE datconnlimit != -2 + AND datname <> 'postgres' + AND NOT datistemplate + ORDER BY oid + LIMIT 10 ); diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml index ff4c3387d9..e6707381ac 100644 --- a/compute/vm-image-spec-bookworm.yaml +++ b/compute/vm-image-spec-bookworm.yaml @@ -39,6 +39,10 @@ commands: user: nobody sysvInitAction: respawn shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499' + - name: rsyslogd + user: postgres + sysvInitAction: respawn + shell: '/usr/sbin/rsyslogd -n -i /var/run/rsyslogd/rsyslogd.pid -f /etc/compute_rsyslog.conf' shutdownHook: | su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10' files: @@ -54,7 +58,7 @@ files: # regardless of hostname (ALL) # # Also allow it to shut down the VM. The fast_import job does that when it's finished. - postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff + postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff, /usr/sbin/rsyslogd - filename: cgconfig.conf content: | # Configuration for cgroups in VM compute nodes @@ -69,6 +73,12 @@ files: } memory {} } +# Create dummy rsyslog config, because it refuses to start without at least one action configured. 
+# compute_ctl will rewrite this file with the actual configuration, if needed. + - filename: compute_rsyslog.conf + content: | + *.* /dev/null + $IncludeConfig /etc/rsyslog.d/*.conf build: | # Build cgroup-tools # @@ -132,6 +142,12 @@ merge: | RUN set -e \ && chmod 0644 /etc/cgconfig.conf + + COPY compute_rsyslog.conf /etc/compute_rsyslog.conf + RUN chmod 0666 /etc/compute_rsyslog.conf + RUN chmod 0666 /var/log/ + + COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/ COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/ COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/ diff --git a/compute/vm-image-spec-bullseye.yaml b/compute/vm-image-spec-bullseye.yaml index c001040bc9..c89ee112dc 100644 --- a/compute/vm-image-spec-bullseye.yaml +++ b/compute/vm-image-spec-bullseye.yaml @@ -39,6 +39,10 @@ commands: user: nobody sysvInitAction: respawn shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499' + - name: rsyslogd + user: postgres + sysvInitAction: respawn + shell: '/usr/sbin/rsyslogd -n -i /var/run/rsyslogd/rsyslogd.pid -f /etc/compute_rsyslog.conf' shutdownHook: | su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10' files: @@ -54,7 +58,7 @@ files: # regardless of hostname (ALL) # # Also allow it to shut down the VM. The fast_import job does that when it's finished. - postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff + postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff, /usr/sbin/rsyslogd - filename: cgconfig.conf content: | # Configuration for cgroups in VM compute nodes @@ -69,6 +73,12 @@ files: } memory {} } +# Create dummy rsyslog config, because it refuses to start without at least one action configured. +# compute_ctl will rewrite this file with the actual configuration, if needed. 
+ - filename: compute_rsyslog.conf + content: | + *.* /dev/null + $IncludeConfig /etc/rsyslog.d/*.conf build: | # Build cgroup-tools # @@ -128,6 +138,11 @@ merge: | RUN set -e \ && chmod 0644 /etc/cgconfig.conf + COPY compute_rsyslog.conf /etc/compute_rsyslog.conf + RUN chmod 0666 /etc/compute_rsyslog.conf + RUN chmod 0666 /var/log/ + + COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/ COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/ COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/ diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 8f3bcbeef8..dd2896714d 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -17,6 +17,7 @@ aws-sdk-kms.workspace = true aws-smithy-types.workspace = true anyhow.workspace = true axum = { workspace = true, features = [] } +axum-extra.workspace = true camino.workspace = true chrono.workspace = true cfg-if.workspace = true @@ -25,6 +26,7 @@ fail.workspace = true flate2.workspace = true futures.workspace = true http.workspace = true +jsonwebtoken.workspace = true metrics.workspace = true nix.workspace = true notify.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 6dae1a2753..fc7a3e2827 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -33,39 +33,27 @@ //! -b /usr/local/bin/postgres \ //! -r http://pg-ext-s3-gateway \ //! 
``` -use std::collections::HashMap; use std::ffi::OsString; use std::fs::File; use std::path::Path; use std::process::exit; -use std::str::FromStr; -use std::sync::atomic::Ordering; -use std::sync::{Arc, Condvar, Mutex, RwLock, mpsc}; +use std::sync::mpsc; use std::thread; use std::time::Duration; use anyhow::{Context, Result}; -use chrono::Utc; use clap::Parser; -use compute_api::responses::{ComputeCtlConfig, ComputeStatus}; +use compute_api::responses::ComputeCtlConfig; use compute_api::spec::ComputeSpec; -use compute_tools::compute::{ - ComputeNode, ComputeState, PG_PID, ParsedSpec, forward_termination_signal, -}; -use compute_tools::configurator::launch_configurator; -use compute_tools::disk_quota::set_disk_quota; +use compute_tools::compute::{ComputeNode, ComputeNodeParams, forward_termination_signal}; use compute_tools::extension_server::get_pg_version_string; -use compute_tools::http::server::Server; use compute_tools::logger::*; -use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static; -use compute_tools::monitor::launch_monitor; use compute_tools::params::*; use compute_tools::spec::*; -use compute_tools::swap::resize_swap; use rlimit::{Resource, setrlimit}; use signal_hook::consts::{SIGINT, SIGQUIT, SIGTERM}; use signal_hook::iterator::Signals; -use tracing::{error, info, warn}; +use tracing::{error, info}; use url::Url; use utils::failpoint_support; @@ -164,29 +152,41 @@ fn main() -> Result<()> { // enable core dumping for all child processes setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?; - let (pg_handle, start_pg_result) = { - // Enter startup tracing context - let _startup_context_guard = startup_context_from_env(); + let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?; - let cli_spec = try_spec_from_cli(&cli)?; + let cli_spec = try_spec_from_cli(&cli)?; - let compute = wait_spec(build_tag, &cli, cli_spec)?; + let compute_node = ComputeNode::new( + ComputeNodeParams { + compute_id: 
cli.compute_id, + connstr, + pgdata: cli.pgdata.clone(), + pgbin: cli.pgbin.clone(), + pgversion: get_pg_version_string(&cli.pgbin), + external_http_port: cli.external_http_port, + internal_http_port: cli.internal_http_port, + ext_remote_storage: cli.remote_ext_config.clone(), + resize_swap_on_bind: cli.resize_swap_on_bind, + set_disk_quota_for_fs: cli.set_disk_quota_for_fs, + #[cfg(target_os = "linux")] + filecache_connstr: cli.filecache_connstr, + #[cfg(target_os = "linux")] + cgroup: cli.cgroup, + #[cfg(target_os = "linux")] + vm_monitor_addr: cli.vm_monitor_addr, + build_tag, - start_postgres(&cli, compute)? + live_config_allowed: cli_spec.live_config_allowed, + }, + cli_spec.spec, + cli_spec.compute_ctl_config, + )?; - // Startup is finished, exit the startup tracing span - }; - - // PostgreSQL is now running, if startup was successful. Wait until it exits. - let wait_pg_result = wait_postgres(pg_handle)?; - - let delay_exit = cleanup_after_postgres_exit(start_pg_result)?; - - maybe_delay_exit(delay_exit); + let exit_code = compute_node.run()?; scenario.teardown(); - deinit_and_exit(wait_pg_result); + deinit_and_exit(exit_code); } async fn init() -> Result { @@ -207,56 +207,6 @@ async fn init() -> Result { Ok(build_tag) } -fn startup_context_from_env() -> Option { - // Extract OpenTelemetry context for the startup actions from the - // TRACEPARENT and TRACESTATE env variables, and attach it to the current - // tracing context. - // - // This is used to propagate the context for the 'start_compute' operation - // from the neon control plane. This allows linking together the wider - // 'start_compute' operation that creates the compute container, with the - // startup actions here within the container. - // - // There is no standard for passing context in env variables, but a lot of - // tools use TRACEPARENT/TRACESTATE, so we use that convention too. 
See - // https://github.com/open-telemetry/opentelemetry-specification/issues/740 - // - // Switch to the startup context here, and exit it once the startup has - // completed and Postgres is up and running. - // - // If this pod is pre-created without binding it to any particular endpoint - // yet, this isn't the right place to enter the startup context. In that - // case, the control plane should pass the tracing context as part of the - // /configure API call. - // - // NOTE: This is supposed to only cover the *startup* actions. Once - // postgres is configured and up-and-running, we exit this span. Any other - // actions that are performed on incoming HTTP requests, for example, are - // performed in separate spans. - // - // XXX: If the pod is restarted, we perform the startup actions in the same - // context as the original startup actions, which probably doesn't make - // sense. - let mut startup_tracing_carrier: HashMap = HashMap::new(); - if let Ok(val) = std::env::var("TRACEPARENT") { - startup_tracing_carrier.insert("traceparent".to_string(), val); - } - if let Ok(val) = std::env::var("TRACESTATE") { - startup_tracing_carrier.insert("tracestate".to_string(), val); - } - if !startup_tracing_carrier.is_empty() { - use opentelemetry::propagation::TextMapPropagator; - use opentelemetry_sdk::propagation::TraceContextPropagator; - let guard = TraceContextPropagator::new() - .extract(&startup_tracing_carrier) - .attach(); - info!("startup tracing context attached"); - Some(guard) - } else { - None - } -} - fn try_spec_from_cli(cli: &Cli) -> Result { // First, try to get cluster spec from the cli argument if let Some(ref spec_json) = cli.spec_json { @@ -307,357 +257,7 @@ struct CliSpecParams { live_config_allowed: bool, } -fn wait_spec( - build_tag: String, - cli: &Cli, - CliSpecParams { - spec, - live_config_allowed, - compute_ctl_config: _, - }: CliSpecParams, -) -> Result> { - let mut new_state = ComputeState::new(); - let spec_set; - - if let Some(spec) = 
spec { - let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?; - info!("new pspec.spec: {:?}", pspec.spec); - new_state.pspec = Some(pspec); - spec_set = true; - } else { - spec_set = false; - } - let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?; - let conn_conf = postgres::config::Config::from_str(connstr.as_str()) - .context("cannot build postgres config from connstr")?; - let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr.as_str()) - .context("cannot build tokio postgres config from connstr")?; - let compute_node = ComputeNode { - compute_id: cli.compute_id.clone(), - connstr, - conn_conf, - tokio_conn_conf, - pgdata: cli.pgdata.clone(), - pgbin: cli.pgbin.clone(), - pgversion: get_pg_version_string(&cli.pgbin), - external_http_port: cli.external_http_port, - internal_http_port: cli.internal_http_port, - live_config_allowed, - state: Mutex::new(new_state), - state_changed: Condvar::new(), - ext_remote_storage: cli.remote_ext_config.clone(), - ext_download_progress: RwLock::new(HashMap::new()), - build_tag, - }; - let compute = Arc::new(compute_node); - - // If this is a pooled VM, prewarm before starting HTTP server and becoming - // available for binding. Prewarming helps Postgres start quicker later, - // because QEMU will already have its memory allocated from the host, and - // the necessary binaries will already be cached. - if !spec_set { - compute.prewarm_postgres()?; - } - - // Launch the external HTTP server first, so that we can serve control plane - // requests while configuration is still in progress. - Server::External(cli.external_http_port).launch(&compute); - - // The internal HTTP server could be launched later, but there isn't much - // sense in waiting. - Server::Internal(cli.internal_http_port).launch(&compute); - - if !spec_set { - // No spec provided, hang waiting for it. 
- info!("no compute spec provided, waiting"); - - let mut state = compute.state.lock().unwrap(); - while state.status != ComputeStatus::ConfigurationPending { - state = compute.state_changed.wait(state).unwrap(); - - if state.status == ComputeStatus::ConfigurationPending { - info!("got spec, continue configuration"); - // Spec is already set by the http server handler. - break; - } - } - - // Record for how long we slept waiting for the spec. - let now = Utc::now(); - state.metrics.wait_for_spec_ms = now - .signed_duration_since(state.start_time) - .to_std() - .unwrap() - .as_millis() as u64; - - // Reset start time, so that the total startup time that is calculated later will - // not include the time that we waited for the spec. - state.start_time = now; - } - - launch_lsn_lease_bg_task_for_static(&compute); - - Ok(compute) -} - -fn start_postgres( - cli: &Cli, - compute: Arc, -) -> Result<(Option, StartPostgresResult)> { - // We got all we need, update the state. - let mut state = compute.state.lock().unwrap(); - - // Create a tracing span for the startup operation. - // - // We could otherwise just annotate the function with #[instrument], but if - // we're being configured from a /configure HTTP request, we want the - // startup to be considered part of the /configure request. - let _this_entered = { - // Temporarily enter the /configure request's span, so that the new span - // becomes its child. - let _parent_entered = state.startup_span.take().map(|p| p.entered()); - - tracing::info_span!("start_postgres") - } - .entered(); - - state.set_status(ComputeStatus::Init, &compute.state_changed); - - info!( - "running compute with features: {:?}", - state.pspec.as_ref().unwrap().spec.features - ); - // before we release the mutex, fetch some parameters for later. - let &ComputeSpec { - swap_size_bytes, - disk_quota_bytes, - #[cfg(target_os = "linux")] - disable_lfc_resizing, - .. 
- } = &state.pspec.as_ref().unwrap().spec; - drop(state); - - // Launch remaining service threads - let _monitor_handle = launch_monitor(&compute); - let _configurator_handle = launch_configurator(&compute); - - let mut prestartup_failed = false; - let mut delay_exit = false; - - // Resize swap to the desired size if the compute spec says so - if let (Some(size_bytes), true) = (swap_size_bytes, cli.resize_swap_on_bind) { - // To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion - // *before* starting postgres. - // - // In theory, we could do this asynchronously if SkipSwapon was enabled for VMs, but this - // carries a risk of introducing hard-to-debug issues - e.g. if postgres sometimes gets - // OOM-killed during startup because swap wasn't available yet. - match resize_swap(size_bytes) { - Ok(()) => { - let size_mib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display. - info!(%size_bytes, %size_mib, "resized swap"); - } - Err(err) => { - let err = err.context("failed to resize swap"); - error!("{err:#}"); - - // Mark compute startup as failed; don't try to start postgres, and report this - // error to the control plane when it next asks. - prestartup_failed = true; - compute.set_failed_status(err); - delay_exit = true; - } - } - } - - // Set disk quota if the compute spec says so - if let (Some(disk_quota_bytes), Some(disk_quota_fs_mountpoint)) = - (disk_quota_bytes, cli.set_disk_quota_for_fs.as_ref()) - { - match set_disk_quota(disk_quota_bytes, disk_quota_fs_mountpoint) { - Ok(()) => { - let size_mib = disk_quota_bytes as f32 / (1 << 20) as f32; // just for more coherent display. - info!(%disk_quota_bytes, %size_mib, "set disk quota"); - } - Err(err) => { - let err = err.context("failed to set disk quota"); - error!("{err:#}"); - - // Mark compute startup as failed; don't try to start postgres, and report this - // error to the control plane when it next asks. 
- prestartup_failed = true; - compute.set_failed_status(err); - delay_exit = true; - } - } - } - - // Start Postgres - let mut pg = None; - if !prestartup_failed { - pg = match compute.start_compute() { - Ok(pg) => { - info!(postmaster_pid = %pg.0.id(), "Postgres was started"); - Some(pg) - } - Err(err) => { - error!("could not start the compute node: {:#}", err); - compute.set_failed_status(err); - delay_exit = true; - None - } - }; - } else { - warn!("skipping postgres startup because pre-startup step failed"); - } - - // Start the vm-monitor if directed to. The vm-monitor only runs on linux - // because it requires cgroups. - cfg_if::cfg_if! { - if #[cfg(target_os = "linux")] { - use std::env; - use tokio_util::sync::CancellationToken; - - // This token is used internally by the monitor to clean up all threads - let token = CancellationToken::new(); - - // don't pass postgres connection string to vm-monitor if we don't want it to resize LFC - let pgconnstr = if disable_lfc_resizing.unwrap_or(false) { - None - } else { - Some(cli.filecache_connstr.clone()) - }; - - let vm_monitor = if env::var_os("AUTOSCALING").is_some() { - let vm_monitor = tokio::spawn(vm_monitor::start( - Box::leak(Box::new(vm_monitor::Args { - cgroup: Some(cli.cgroup.clone()), - pgconnstr, - addr: cli.vm_monitor_addr.clone(), - })), - token.clone(), - )); - Some(vm_monitor) - } else { - None - }; - } - } - - Ok(( - pg, - StartPostgresResult { - delay_exit, - compute, - #[cfg(target_os = "linux")] - token, - #[cfg(target_os = "linux")] - vm_monitor, - }, - )) -} - -type PostgresHandle = (std::process::Child, tokio::task::JoinHandle>); - -struct StartPostgresResult { - delay_exit: bool, - // passed through from WaitSpecResult - compute: Arc, - - #[cfg(target_os = "linux")] - token: tokio_util::sync::CancellationToken, - #[cfg(target_os = "linux")] - vm_monitor: Option>>, -} - -fn wait_postgres(pg: Option) -> Result { - // Wait for the child Postgres process forever. 
In this state Ctrl+C will - // propagate to Postgres and it will be shut down as well. - let mut exit_code = None; - if let Some((mut pg, logs_handle)) = pg { - info!(postmaster_pid = %pg.id(), "Waiting for Postgres to exit"); - - let ecode = pg - .wait() - .expect("failed to start waiting on Postgres process"); - PG_PID.store(0, Ordering::SeqCst); - - // Process has exited. Wait for the log collecting task to finish. - let _ = tokio::runtime::Handle::current() - .block_on(logs_handle) - .map_err(|e| tracing::error!("log task panicked: {:?}", e)); - - info!("Postgres exited with code {}, shutting down", ecode); - exit_code = ecode.code() - } - - Ok(WaitPostgresResult { exit_code }) -} - -struct WaitPostgresResult { - exit_code: Option, -} - -fn cleanup_after_postgres_exit( - StartPostgresResult { - mut delay_exit, - compute, - #[cfg(target_os = "linux")] - vm_monitor, - #[cfg(target_os = "linux")] - token, - }: StartPostgresResult, -) -> Result { - // Terminate the vm_monitor so it releases the file watcher on - // /sys/fs/cgroup/neon-postgres. - // Note: the vm-monitor only runs on linux because it requires cgroups. - cfg_if::cfg_if! 
{ - if #[cfg(target_os = "linux")] { - if let Some(handle) = vm_monitor { - // Kills all threads spawned by the monitor - token.cancel(); - // Kills the actual task running the monitor - handle.abort(); - } - } - } - - // Maybe sync safekeepers again, to speed up next startup - let compute_state = compute.state.lock().unwrap().clone(); - let pspec = compute_state.pspec.as_ref().expect("spec must be set"); - if matches!(pspec.spec.mode, compute_api::spec::ComputeMode::Primary) { - info!("syncing safekeepers on shutdown"); - let storage_auth_token = pspec.storage_auth_token.clone(); - let lsn = compute.sync_safekeepers(storage_auth_token)?; - info!("synced safekeepers at lsn {lsn}"); - } - - let mut state = compute.state.lock().unwrap(); - if state.status == ComputeStatus::TerminationPending { - state.status = ComputeStatus::Terminated; - compute.state_changed.notify_all(); - // we were asked to terminate gracefully, don't exit to avoid restart - delay_exit = true - } - drop(state); - - if let Err(err) = compute.check_for_core_dumps() { - error!("error while checking for core dumps: {err:?}"); - } - - Ok(delay_exit) -} - -fn maybe_delay_exit(delay_exit: bool) { - // If launch failed, keep serving HTTP requests for a while, so the cloud - // control plane can get the actual error. - if delay_exit { - info!("giving control plane 30s to collect the error before shutdown"); - thread::sleep(Duration::from_secs(30)); - } -} - -fn deinit_and_exit(WaitPostgresResult { exit_code }: WaitPostgresResult) -> ! { +fn deinit_and_exit(exit_code: Option) -> ! { // Shutdown trace pipeline gracefully, so that it has a chance to send any // pending traces before we exit. 
Shutting down OTEL tracing provider may // hang for quite some time, see, for example: diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs index 2a7f56e6fc..db3e07e086 100644 --- a/compute_tools/src/catalog.rs +++ b/compute_tools/src/catalog.rs @@ -58,14 +58,14 @@ pub async fn get_database_schema( compute: &Arc, dbname: &str, ) -> Result> + use<>, SchemaDumpError> { - let pgbin = &compute.pgbin; + let pgbin = &compute.params.pgbin; let basepath = Path::new(pgbin).parent().unwrap(); let pgdump = basepath.join("pg_dump"); // Replace the DB in the connection string and disable it to parts. // This is the only option to handle DBs with special characters. - let conf = - postgres_conf_for_db(&compute.connstr, dbname).map_err(|_| SchemaDumpError::Unexpected)?; + let conf = postgres_conf_for_db(&compute.params.connstr, dbname) + .map_err(|_| SchemaDumpError::Unexpected)?; let host = conf .get_hosts() .first() diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index c0e28790d6..354528e2cd 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -11,8 +11,10 @@ use std::{env, fs}; use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; use compute_api::privilege::Privilege; -use compute_api::responses::{ComputeMetrics, ComputeStatus}; -use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent}; +use compute_api::responses::{ComputeCtlConfig, ComputeMetrics, ComputeStatus}; +use compute_api::spec::{ + ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent, +}; use futures::StreamExt; use futures::future::join_all; use futures::stream::FuturesUnordered; @@ -23,33 +25,59 @@ use postgres::NoTls; use postgres::error::SqlState; use remote_storage::{DownloadError, RemotePath}; use tokio::spawn; -use tracing::{debug, error, info, instrument, warn}; +use tracing::{Instrument, debug, error, info, instrument, warn}; use utils::id::{TenantId, TimelineId}; use 
utils::lsn::Lsn; use utils::measured_stream::MeasuredReader; +use crate::configurator::launch_configurator; +use crate::disk_quota::set_disk_quota; use crate::installed_extensions::get_installed_extensions; +use crate::logger::startup_context_from_env; +use crate::lsn_lease::launch_lsn_lease_bg_task_for_static; +use crate::monitor::launch_monitor; use crate::pg_helpers::*; +use crate::rsyslog::configure_audit_rsyslog; use crate::spec::*; +use crate::swap::resize_swap; use crate::sync_sk::{check_if_synced, ping_safekeeper}; use crate::{config, extension_server, local_proxy}; pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0); pub static PG_PID: AtomicU32 = AtomicU32::new(0); -/// Compute node info shared across several `compute_ctl` threads. -pub struct ComputeNode { +/// Static configuration params that don't change after startup. These mostly +/// come from the CLI args, or are derived from them. +pub struct ComputeNodeParams { /// The ID of the compute pub compute_id: String, // Url type maintains proper escaping pub connstr: url::Url, - // We connect to Postgres from many different places, so build configs once - // and reuse them where needed. 
- pub conn_conf: postgres::config::Config, - pub tokio_conn_conf: tokio_postgres::config::Config, + + pub resize_swap_on_bind: bool, + pub set_disk_quota_for_fs: Option, + + // VM monitor parameters + #[cfg(target_os = "linux")] + pub filecache_connstr: String, + #[cfg(target_os = "linux")] + pub cgroup: String, + #[cfg(target_os = "linux")] + pub vm_monitor_addr: String, + pub pgdata: String, pub pgbin: String, pub pgversion: String, + pub build_tag: String, + + /// The port that the compute's external HTTP server listens on + pub external_http_port: u16, + /// The port that the compute's internal HTTP server listens on + pub internal_http_port: u16, + + /// the address of extension storage proxy gateway + pub ext_remote_storage: Option, + /// We should only allow live re- / configuration of the compute node if /// it uses 'pull model', i.e. it can go to control-plane and fetch /// the latest configuration. Otherwise, there could be a case: @@ -63,10 +91,17 @@ pub struct ComputeNode { /// - we push spec and it does configuration /// - but then it is restarted without any spec again pub live_config_allowed: bool, - /// The port that the compute's external HTTP server listens on - pub external_http_port: u16, - /// The port that the compute's internal HTTP server listens on - pub internal_http_port: u16, +} + +/// Compute node info shared across several `compute_ctl` threads. +pub struct ComputeNode { + pub params: ComputeNodeParams, + + // We connect to Postgres from many different places, so build configs once + // and reuse them where needed. These are derived from 'params.connstr' + pub conn_conf: postgres::config::Config, + pub tokio_conn_conf: tokio_postgres::config::Config, + /// Volatile part of the `ComputeNode`, which should be used under `Mutex`. 
/// To allow HTTP API server to serving status requests, while configuration /// is in progress, lock should be held only for short periods of time to do @@ -74,11 +109,9 @@ pub struct ComputeNode { pub state: Mutex, /// `Condvar` to allow notifying waiters about state changes. pub state_changed: Condvar, - /// the address of extension storage proxy gateway - pub ext_remote_storage: Option, + // key: ext_archive_name, value: started download time, download_completed? pub ext_download_progress: RwLock, bool)>>, - pub build_tag: String, } // store some metrics about download size that might impact startup time @@ -102,6 +135,8 @@ pub struct ComputeState { /// passed by the control plane with a /configure HTTP request. pub pspec: Option, + pub compute_ctl_config: ComputeCtlConfig, + /// If the spec is passed by a /configure request, 'startup_span' is the /// /configure request's tracing span. The main thread enters it when it /// processes the compute startup, so that the compute startup is considered @@ -125,6 +160,7 @@ impl ComputeState { last_active: None, error: None, pspec: None, + compute_ctl_config: ComputeCtlConfig::default(), startup_span: None, metrics: ComputeMetrics::default(), } @@ -242,80 +278,518 @@ fn maybe_cgexec(cmd: &str) -> Command { } } -pub(crate) fn construct_superuser_query(spec: &ComputeSpec) -> String { - let roles = spec - .cluster - .roles - .iter() - .map(|r| escape_literal(&r.name)) - .collect::>(); +struct PostgresHandle { + postgres: std::process::Child, + log_collector: tokio::task::JoinHandle>, +} - let dbs = spec - .cluster - .databases - .iter() - .map(|db| escape_literal(&db.name)) - .collect::>(); +impl PostgresHandle { + /// Return PID of the postgres (postmaster) process + fn pid(&self) -> Pid { + Pid::from_raw(self.postgres.id() as i32) + } +} - let roles_decl = if roles.is_empty() { - String::from("roles text[] := NULL;") - } else { - format!( - r#" - roles text[] := ARRAY(SELECT rolname - FROM pg_catalog.pg_roles - WHERE 
rolname IN ({}));"#, - roles.join(", ") - ) - }; - - let database_decl = if dbs.is_empty() { - String::from("dbs text[] := NULL;") - } else { - format!( - r#" - dbs text[] := ARRAY(SELECT datname - FROM pg_catalog.pg_database - WHERE datname IN ({}));"#, - dbs.join(", ") - ) - }; - - // ALL PRIVILEGES grants CREATE, CONNECT, and TEMPORARY on all databases - // (see https://www.postgresql.org/docs/current/ddl-priv.html) - let query = format!( - r#" - DO $$ - DECLARE - r text; - {} - {} - BEGIN - IF NOT EXISTS ( - SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser') - THEN - CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data; - IF array_length(roles, 1) IS NOT NULL THEN - EXECUTE format('GRANT neon_superuser TO %s', - array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(roles) as x), ', ')); - FOREACH r IN ARRAY roles LOOP - EXECUTE format('ALTER ROLE %s CREATEROLE CREATEDB', quote_ident(r)); - END LOOP; - END IF; - IF array_length(dbs, 1) IS NOT NULL THEN - EXECUTE format('GRANT ALL PRIVILEGES ON DATABASE %s TO neon_superuser', - array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(dbs) as x), ', ')); - END IF; - END IF; - END - $$;"#, - roles_decl, database_decl, - ); - - query +struct StartVmMonitorResult { + #[cfg(target_os = "linux")] + token: tokio_util::sync::CancellationToken, + #[cfg(target_os = "linux")] + vm_monitor: Option>>, } impl ComputeNode { + pub fn new( + params: ComputeNodeParams, + cli_spec: Option, + compute_ctl_config: ComputeCtlConfig, + ) -> Result { + let connstr = params.connstr.as_str(); + let conn_conf = postgres::config::Config::from_str(connstr) + .context("cannot build postgres config from connstr")?; + let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr) + .context("cannot build tokio postgres config from connstr")?; + + let mut new_state = ComputeState::new(); + if let Some(cli_spec) = cli_spec { + let pspec = 
ParsedSpec::try_from(cli_spec).map_err(|msg| anyhow::anyhow!(msg))?; + new_state.pspec = Some(pspec); + } + new_state.compute_ctl_config = compute_ctl_config; + + Ok(ComputeNode { + params, + conn_conf, + tokio_conn_conf, + state: Mutex::new(new_state), + state_changed: Condvar::new(), + ext_download_progress: RwLock::new(HashMap::new()), + }) + } + + /// Top-level control flow of compute_ctl. Returns a process exit code we should + /// exit with. + pub fn run(self) -> Result> { + let this = Arc::new(self); + + let cli_spec = this.state.lock().unwrap().pspec.clone(); + + // If this is a pooled VM, prewarm before starting HTTP server and becoming + // available for binding. Prewarming helps Postgres start quicker later, + // because QEMU will already have its memory allocated from the host, and + // the necessary binaries will already be cached. + if cli_spec.is_none() { + this.prewarm_postgres()?; + } + + // Launch the external HTTP server first, so that we can serve control plane + // requests while configuration is still in progress. + crate::http::server::Server::External { + port: this.params.external_http_port, + jwks: this.state.lock().unwrap().compute_ctl_config.jwks.clone(), + compute_id: this.params.compute_id.clone(), + } + .launch(&this); + + // The internal HTTP server could be launched later, but there isn't much + // sense in waiting. + crate::http::server::Server::Internal { + port: this.params.internal_http_port, + } + .launch(&this); + + // If we got a spec from the CLI already, use that. Otherwise wait for the + // control plane to pass it to us with a /configure HTTP request + let pspec = if let Some(cli_spec) = cli_spec { + cli_spec + } else { + this.wait_spec()? + }; + + launch_lsn_lease_bg_task_for_static(&this); + + // We have a spec, start the compute + let mut delay_exit = false; + let mut vm_monitor = None; + let mut pg_process: Option = None; + + match this.start_compute(&mut pg_process) { + Ok(()) => { + // Success! 
Launch remaining services (just vm-monitor currently) + vm_monitor = + Some(this.start_vm_monitor(pspec.spec.disable_lfc_resizing.unwrap_or(false))); + } + Err(err) => { + // Something went wrong with the startup. Log it and expose the error to + // HTTP status requests. + error!("could not start the compute node: {:#}", err); + this.set_failed_status(err); + delay_exit = true; + + // If the error happened after starting PostgreSQL, kill it + if let Some(ref pg_process) = pg_process { + kill(pg_process.pid(), Signal::SIGQUIT).ok(); + } + } + } + + // If startup was successful, or it failed in the late stages, + // PostgreSQL is now running. Wait until it exits. + let exit_code = if let Some(pg_handle) = pg_process { + let exit_status = this.wait_postgres(pg_handle); + info!("Postgres exited with code {}, shutting down", exit_status); + exit_status.code() + } else { + None + }; + + // Terminate the vm_monitor so it releases the file watcher on + // /sys/fs/cgroup/neon-postgres. + // Note: the vm-monitor only runs on linux because it requires cgroups. + if let Some(vm_monitor) = vm_monitor { + cfg_if::cfg_if! { + if #[cfg(target_os = "linux")] { + // Kills all threads spawned by the monitor + vm_monitor.token.cancel(); + if let Some(handle) = vm_monitor.vm_monitor { + // Kills the actual task running the monitor + handle.abort(); + } + } else { + _ = vm_monitor; // appease unused lint on macOS + } + } + } + + // Reap the postgres process + delay_exit |= this.cleanup_after_postgres_exit()?; + + // If launch failed, keep serving HTTP requests for a while, so the cloud + // control plane can get the actual error. 
+ if delay_exit { + info!("giving control plane 30s to collect the error before shutdown"); + std::thread::sleep(Duration::from_secs(30)); + } + Ok(exit_code) + } + + pub fn wait_spec(&self) -> Result { + info!("no compute spec provided, waiting"); + let mut state = self.state.lock().unwrap(); + while state.status != ComputeStatus::ConfigurationPending { + state = self.state_changed.wait(state).unwrap(); + } + + info!("got spec, continue configuration"); + let spec = state.pspec.as_ref().unwrap().clone(); + + // Record for how long we slept waiting for the spec. + let now = Utc::now(); + state.metrics.wait_for_spec_ms = now + .signed_duration_since(state.start_time) + .to_std() + .unwrap() + .as_millis() as u64; + + // Reset start time, so that the total startup time that is calculated later will + // not include the time that we waited for the spec. + state.start_time = now; + + Ok(spec) + } + + /// Start compute. + /// + /// Prerequisites: + /// - the compute spec has been placed in self.state.pspec + /// + /// On success: + /// - status is set to ComputeStatus::Running + /// - self.running_postgres is set + /// + /// On error: + /// - status is left in ComputeStatus::Init. The caller is responsible for setting it to Failed + /// - if Postgres was started before the fatal error happened, self.running_postgres is + /// set. The caller is responsible for killing it. + /// + /// Note that this is in the critical path of a compute cold start. Keep this fast. + /// Try to do things concurrently, to hide the latencies. + fn start_compute(self: &Arc, pg_handle: &mut Option) -> Result<()> { + let compute_state: ComputeState; + + let start_compute_span; + let _this_entered; + { + let mut state_guard = self.state.lock().unwrap(); + + // Create a tracing span for the startup operation. 
+ // + // We could otherwise just annotate the function with #[instrument], but if + // we're being configured from a /configure HTTP request, we want the + // startup to be considered part of the /configure request. + // + // Similarly, if a trace ID was passed in env variables, attach it to the span. + start_compute_span = { + // Temporarily enter the parent span, so that the new span becomes its child. + if let Some(p) = state_guard.startup_span.take() { + let _parent_entered = p.entered(); + tracing::info_span!("start_compute") + } else if let Some(otel_context) = startup_context_from_env() { + use tracing_opentelemetry::OpenTelemetrySpanExt; + let span = tracing::info_span!("start_compute"); + span.set_parent(otel_context); + span + } else { + tracing::info_span!("start_compute") + } + }; + _this_entered = start_compute_span.enter(); + + state_guard.set_status(ComputeStatus::Init, &self.state_changed); + compute_state = state_guard.clone() + } + + let pspec = compute_state.pspec.as_ref().expect("spec must be set"); + info!( + "starting compute for project {}, operation {}, tenant {}, timeline {}, features {:?}, spec.remote_extensions {:?}", + pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"), + pspec.spec.operation_uuid.as_deref().unwrap_or("None"), + pspec.tenant_id, + pspec.timeline_id, + pspec.spec.features, + pspec.spec.remote_extensions, + ); + + ////// PRE-STARTUP PHASE: things that need to be finished before we start the Postgres process + + // Collect all the tasks that must finish here + let mut pre_tasks = tokio::task::JoinSet::new(); + + // If there are any remote extensions in shared_preload_libraries, start downloading them + if pspec.spec.remote_extensions.is_some() { + let (this, spec) = (self.clone(), pspec.spec.clone()); + pre_tasks.spawn(async move { + this.download_preload_extensions(&spec) + .in_current_span() + .await + }); + } + + // Prepare pgdata directory. This downloads the basebackup, among other things. 
+ { + let (this, cs) = (self.clone(), compute_state.clone()); + pre_tasks.spawn_blocking_child(move || this.prepare_pgdata(&cs)); + } + + // Resize swap to the desired size if the compute spec says so + if let (Some(size_bytes), true) = + (pspec.spec.swap_size_bytes, self.params.resize_swap_on_bind) + { + pre_tasks.spawn_blocking_child(move || { + // To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion + // *before* starting postgres. + // + // In theory, we could do this asynchronously if SkipSwapon was enabled for VMs, but this + // carries a risk of introducing hard-to-debug issues - e.g. if postgres sometimes gets + // OOM-killed during startup because swap wasn't available yet. + resize_swap(size_bytes).context("failed to resize swap")?; + let size_mib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display. + info!(%size_bytes, %size_mib, "resized swap"); + + Ok::<(), anyhow::Error>(()) + }); + } + + // Set disk quota if the compute spec says so + if let (Some(disk_quota_bytes), Some(disk_quota_fs_mountpoint)) = ( + pspec.spec.disk_quota_bytes, + self.params.set_disk_quota_for_fs.as_ref(), + ) { + let disk_quota_fs_mountpoint = disk_quota_fs_mountpoint.clone(); + pre_tasks.spawn_blocking_child(move || { + set_disk_quota(disk_quota_bytes, &disk_quota_fs_mountpoint) + .context("failed to set disk quota")?; + let size_mib = disk_quota_bytes as f32 / (1 << 20) as f32; // just for more coherent display. + info!(%disk_quota_bytes, %size_mib, "set disk quota"); + + Ok::<(), anyhow::Error>(()) + }); + } + + // tune pgbouncer + if let Some(pgbouncer_settings) = &pspec.spec.pgbouncer_settings { + info!("tuning pgbouncer"); + + // Spawn a background task to do the tuning, + // so that we don't block the main thread that starts Postgres. 
+ let pgbouncer_settings = pgbouncer_settings.clone(); + let _handle = tokio::spawn(async move { + let res = tune_pgbouncer(pgbouncer_settings).await; + if let Err(err) = res { + error!("error while tuning pgbouncer: {err:?}"); + // Continue with the startup anyway + } + }); + } + + // configure local_proxy + if let Some(local_proxy) = &pspec.spec.local_proxy_config { + info!("configuring local_proxy"); + + // Spawn a background task to do the configuration, + // so that we don't block the main thread that starts Postgres. + let local_proxy = local_proxy.clone(); + let _handle = tokio::spawn(async move { + if let Err(err) = local_proxy::configure(&local_proxy) { + error!("error while configuring local_proxy: {err:?}"); + // Continue with the startup anyway + } + }); + } + + // Configure and start rsyslog if necessary + if let ComputeAudit::Hipaa = pspec.spec.audit_log_level { + let remote_endpoint = std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string()); + if remote_endpoint.is_empty() { + anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty"); + } + + let log_directory_path = Path::new(&self.params.pgdata).join("log"); + // TODO: make this more robust + // now rsyslog starts once and there is no monitoring or restart if it fails + configure_audit_rsyslog( + log_directory_path.to_str().unwrap(), + "hipaa", + &remote_endpoint, + )?; + } + + // Launch remaining service threads + let _monitor_handle = launch_monitor(self); + let _configurator_handle = launch_configurator(self); + + // Wait for all the pre-tasks to finish before starting postgres + let rt = tokio::runtime::Handle::current(); + while let Some(res) = rt.block_on(pre_tasks.join_next()) { + res??; + } + + ////// START POSTGRES + let start_time = Utc::now(); + let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?; + let postmaster_pid = pg_process.pid(); + *pg_handle = Some(pg_process); + + // If this is a primary endpoint, perform some post-startup configuration before + // opening 
it up for the world. + let config_time = Utc::now(); + if pspec.spec.mode == ComputeMode::Primary { + self.configure_as_primary(&compute_state)?; + + let conf = self.get_conn_conf(None); + tokio::task::spawn_blocking(|| { + let res = get_installed_extensions(conf); + match res { + Ok(extensions) => { + info!( + "[NEON_EXT_STAT] {}", + serde_json::to_string(&extensions) + .expect("failed to serialize extensions list") + ); + } + Err(err) => error!("could not get installed extensions: {err:?}"), + } + }); + } + + // All done! + let startup_end_time = Utc::now(); + let metrics = { + let mut state = self.state.lock().unwrap(); + state.metrics.start_postgres_ms = config_time + .signed_duration_since(start_time) + .to_std() + .unwrap() + .as_millis() as u64; + state.metrics.config_ms = startup_end_time + .signed_duration_since(config_time) + .to_std() + .unwrap() + .as_millis() as u64; + state.metrics.total_startup_ms = startup_end_time + .signed_duration_since(compute_state.start_time) + .to_std() + .unwrap() + .as_millis() as u64; + state.metrics.clone() + }; + self.set_status(ComputeStatus::Running); + + // Log metrics so that we can search for slow operations in logs + info!(?metrics, postmaster_pid = %postmaster_pid, "compute start finished"); + + Ok(()) + } + + #[instrument(skip_all)] + async fn download_preload_extensions(&self, spec: &ComputeSpec) -> Result<()> { + let remote_extensions = if let Some(remote_extensions) = &spec.remote_extensions { + remote_extensions + } else { + return Ok(()); + }; + + // First, create control files for all available extensions + extension_server::create_control_files(remote_extensions, &self.params.pgbin); + + let library_load_start_time = Utc::now(); + let remote_ext_metrics = self.prepare_preload_libraries(spec).await?; + + let library_load_time = Utc::now() + .signed_duration_since(library_load_start_time) + .to_std() + .unwrap() + .as_millis() as u64; + let mut state = self.state.lock().unwrap(); + state.metrics.load_ext_ms 
= library_load_time; + state.metrics.num_ext_downloaded = remote_ext_metrics.num_ext_downloaded; + state.metrics.largest_ext_size = remote_ext_metrics.largest_ext_size; + state.metrics.total_ext_download_size = remote_ext_metrics.total_ext_download_size; + info!( + "Loading shared_preload_libraries took {:?}ms", + library_load_time + ); + info!("{:?}", remote_ext_metrics); + + Ok(()) + } + + /// Start the vm-monitor if directed to. The vm-monitor only runs on linux + /// because it requires cgroups. + fn start_vm_monitor(&self, disable_lfc_resizing: bool) -> StartVmMonitorResult { + cfg_if::cfg_if! { + if #[cfg(target_os = "linux")] { + use std::env; + use tokio_util::sync::CancellationToken; + + // This token is used internally by the monitor to clean up all threads + let token = CancellationToken::new(); + + // don't pass postgres connection string to vm-monitor if we don't want it to resize LFC + let pgconnstr = if disable_lfc_resizing { + None + } else { + Some(self.params.filecache_connstr.clone()) + }; + + let vm_monitor = if env::var_os("AUTOSCALING").is_some() { + let vm_monitor = tokio::spawn(vm_monitor::start( + Box::leak(Box::new(vm_monitor::Args { + cgroup: Some(self.params.cgroup.clone()), + pgconnstr, + addr: self.params.vm_monitor_addr.clone(), + })), + token.clone(), + )); + Some(vm_monitor) + } else { + None + }; + StartVmMonitorResult { token, vm_monitor } + } else { + _ = disable_lfc_resizing; // appease unused lint on macOS + StartVmMonitorResult { } + } + } + } + + fn cleanup_after_postgres_exit(&self) -> Result { + // Maybe sync safekeepers again, to speed up next startup + let compute_state = self.state.lock().unwrap().clone(); + let pspec = compute_state.pspec.as_ref().expect("spec must be set"); + if matches!(pspec.spec.mode, compute_api::spec::ComputeMode::Primary) { + info!("syncing safekeepers on shutdown"); + let storage_auth_token = pspec.storage_auth_token.clone(); + let lsn = self.sync_safekeepers(storage_auth_token)?; + 
info!("synced safekeepers at lsn {lsn}"); + } + + let mut delay_exit = false; + let mut state = self.state.lock().unwrap(); + if state.status == ComputeStatus::TerminationPending { + state.status = ComputeStatus::Terminated; + self.state_changed.notify_all(); + // we were asked to terminate gracefully, don't exit to avoid restart + delay_exit = true + } + drop(state); + + if let Err(err) = self.check_for_core_dumps() { + error!("error while checking for core dumps: {err:?}"); + } + + Ok(delay_exit) + } + /// Check that compute node has corresponding feature enabled. pub fn has_feature(&self, feature: ComputeFeature) -> bool { let state = self.state.lock().unwrap(); @@ -354,9 +828,10 @@ impl ComputeNode { fn create_pgdata(&self) -> Result<()> { // Ignore removal error, likely it is a 'No such file or directory (os error 2)'. // If it is something different then create_dir() will error out anyway. - let _ok = fs::remove_dir_all(&self.pgdata); - fs::create_dir(&self.pgdata)?; - fs::set_permissions(&self.pgdata, fs::Permissions::from_mode(0o700))?; + let pgdata = &self.params.pgdata; + let _ok = fs::remove_dir_all(pgdata); + fs::create_dir(pgdata)?; + fs::set_permissions(pgdata, fs::Permissions::from_mode(0o700))?; Ok(()) } @@ -421,7 +896,7 @@ impl ComputeNode { // sends an Error after finishing the tarball, we will not notice it. 
let mut ar = tar::Archive::new(flate2::read::GzDecoder::new(&mut bufreader)); ar.set_ignore_zeros(true); - ar.unpack(&self.pgdata)?; + ar.unpack(&self.params.pgdata)?; // Report metrics let mut state = self.state.lock().unwrap(); @@ -566,9 +1041,9 @@ impl ComputeNode { pub fn sync_safekeepers(&self, storage_auth_token: Option) -> Result { let start_time = Utc::now(); - let mut sync_handle = maybe_cgexec(&self.pgbin) + let mut sync_handle = maybe_cgexec(&self.params.pgbin) .args(["--sync-safekeepers"]) - .env("PGDATA", &self.pgdata) // we cannot use -D in this mode + .env("PGDATA", &self.params.pgdata) // we cannot use -D in this mode .envs(if let Some(storage_auth_token) = &storage_auth_token { vec![("NEON_AUTH_TOKEN", storage_auth_token)] } else { @@ -625,14 +1100,14 @@ impl ComputeNode { pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> { let pspec = compute_state.pspec.as_ref().expect("spec must be set"); let spec = &pspec.spec; - let pgdata_path = Path::new(&self.pgdata); + let pgdata_path = Path::new(&self.params.pgdata); // Remove/create an empty pgdata directory and put configuration there. 
self.create_pgdata()?; config::write_postgres_conf( &pgdata_path.join("postgresql.conf"), &pspec.spec, - self.internal_http_port, + self.params.internal_http_port, )?; // Syncing safekeepers is only safe with primary nodes: if a primary @@ -732,12 +1207,15 @@ impl ComputeNode { info!("prewarming"); // Create pgdata - let pgdata = &format!("{}.warmup", self.pgdata); + let pgdata = &format!("{}.warmup", self.params.pgdata); create_pgdata(pgdata)?; // Run initdb to completion info!("running initdb"); - let initdb_bin = Path::new(&self.pgbin).parent().unwrap().join("initdb"); + let initdb_bin = Path::new(&self.params.pgbin) + .parent() + .unwrap() + .join("initdb"); Command::new(initdb_bin) .args(["--pgdata", pgdata]) .output() @@ -753,7 +1231,7 @@ impl ComputeNode { // Start postgres info!("starting postgres"); - let mut pg = maybe_cgexec(&self.pgbin) + let mut pg = maybe_cgexec(&self.params.pgbin) .args(["-D", pgdata]) .spawn() .expect("cannot start postgres process"); @@ -780,15 +1258,12 @@ impl ComputeNode { /// /// Returns a handle to the child process and a handle to the logs thread. #[instrument(skip_all)] - pub fn start_postgres( - &self, - storage_auth_token: Option, - ) -> Result<(std::process::Child, tokio::task::JoinHandle>)> { - let pgdata_path = Path::new(&self.pgdata); + pub fn start_postgres(&self, storage_auth_token: Option) -> Result { + let pgdata_path = Path::new(&self.params.pgdata); // Run postgres as a child process. - let mut pg = maybe_cgexec(&self.pgbin) - .args(["-D", &self.pgdata]) + let mut pg = maybe_cgexec(&self.params.pgbin) + .args(["-D", &self.params.pgdata]) .envs(if let Some(storage_auth_token) = &storage_auth_token { vec![("NEON_AUTH_TOKEN", storage_auth_token)] } else { @@ -805,7 +1280,29 @@ impl ComputeNode { wait_for_postgres(&mut pg, pgdata_path)?; - Ok((pg, logs_handle)) + Ok(PostgresHandle { + postgres: pg, + log_collector: logs_handle, + }) + } + + /// Wait for the child Postgres process forever. 
In this state Ctrl+C will + /// propagate to Postgres and it will be shut down as well. + fn wait_postgres(&self, mut pg_handle: PostgresHandle) -> std::process::ExitStatus { + info!(postmaster_pid = %pg_handle.postgres.id(), "Waiting for Postgres to exit"); + + let ecode = pg_handle + .postgres + .wait() + .expect("failed to start waiting on Postgres process"); + PG_PID.store(0, Ordering::SeqCst); + + // Process has exited. Wait for the log collecting task to finish. + let _ = tokio::runtime::Handle::current() + .block_on(pg_handle.log_collector) + .map_err(|e| tracing::error!("log task panicked: {:?}", e)); + + ecode } /// Do post configuration of the already started Postgres. This function spawns a background task to @@ -972,9 +1469,12 @@ impl ComputeNode { // `pg_ctl` for start / stop. #[instrument(skip_all)] fn pg_reload_conf(&self) -> Result<()> { - let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl"); + let pgctl_bin = Path::new(&self.params.pgbin) + .parent() + .unwrap() + .join("pg_ctl"); Command::new(pgctl_bin) - .args(["reload", "-D", &self.pgdata]) + .args(["reload", "-D", &self.params.pgdata]) .output() .expect("cannot run pg_ctl process"); Ok(()) @@ -1014,9 +1514,9 @@ impl ComputeNode { } // Write new config - let pgdata_path = Path::new(&self.pgdata); + let pgdata_path = Path::new(&self.params.pgdata); let postgresql_conf_path = pgdata_path.join("postgresql.conf"); - config::write_postgres_conf(&postgresql_conf_path, &spec, self.internal_http_port)?; + config::write_postgres_conf(&postgresql_conf_path, &spec, self.params.internal_http_port)?; if !spec.skip_pg_catalog_updates { let max_concurrent_connections = spec.reconfigure_concurrency; @@ -1027,7 +1527,8 @@ impl ComputeNode { self.pg_reload_conf()?; if spec.mode == ComputeMode::Primary { - let mut conf = tokio_postgres::Config::from_str(self.connstr.as_str()).unwrap(); + let mut conf = + tokio_postgres::Config::from_str(self.params.connstr.as_str()).unwrap(); 
conf.application_name("apply_config"); let conf = Arc::new(conf); @@ -1053,166 +1554,37 @@ impl ComputeNode { } #[instrument(skip_all)] - pub fn start_compute( - &self, - ) -> Result<(std::process::Child, tokio::task::JoinHandle>)> { - let compute_state = self.state.lock().unwrap().clone(); + pub fn configure_as_primary(&self, compute_state: &ComputeState) -> Result<()> { let pspec = compute_state.pspec.as_ref().expect("spec must be set"); - info!( - "starting compute for project {}, operation {}, tenant {}, timeline {}", - pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"), - pspec.spec.operation_uuid.as_deref().unwrap_or("None"), - pspec.tenant_id, - pspec.timeline_id, - ); - // tune pgbouncer - if let Some(pgbouncer_settings) = &pspec.spec.pgbouncer_settings { - info!("tuning pgbouncer"); - - // Spawn a background task to do the tuning, - // so that we don't block the main thread that starts Postgres. - let pgbouncer_settings = pgbouncer_settings.clone(); - let _handle = tokio::spawn(async move { - let res = tune_pgbouncer(pgbouncer_settings).await; - if let Err(err) = res { - error!("error while tuning pgbouncer: {err:?}"); - } - }); - } - - if let Some(local_proxy) = &pspec.spec.local_proxy_config { - info!("configuring local_proxy"); - - // Spawn a background task to do the configuration, - // so that we don't block the main thread that starts Postgres. 
- let local_proxy = local_proxy.clone(); - let _handle = tokio::spawn(async move { - if let Err(err) = local_proxy::configure(&local_proxy) { - error!("error while configuring local_proxy: {err:?}"); - } - }); - } - - info!( - "start_compute spec.remote_extensions {:?}", - pspec.spec.remote_extensions - ); - - // This part is sync, because we need to download - // remote shared_preload_libraries before postgres start (if any) - if let Some(remote_extensions) = &pspec.spec.remote_extensions { - // First, create control files for all availale extensions - extension_server::create_control_files(remote_extensions, &self.pgbin); - - let library_load_start_time = Utc::now(); - let rt = tokio::runtime::Handle::current(); - let remote_ext_metrics = rt.block_on(self.prepare_preload_libraries(&pspec.spec))?; - - let library_load_time = Utc::now() - .signed_duration_since(library_load_start_time) - .to_std() - .unwrap() - .as_millis() as u64; - let mut state = self.state.lock().unwrap(); - state.metrics.load_ext_ms = library_load_time; - state.metrics.num_ext_downloaded = remote_ext_metrics.num_ext_downloaded; - state.metrics.largest_ext_size = remote_ext_metrics.largest_ext_size; - state.metrics.total_ext_download_size = remote_ext_metrics.total_ext_download_size; - info!( - "Loading shared_preload_libraries took {:?}ms", - library_load_time - ); - info!("{:?}", remote_ext_metrics); - } - - self.prepare_pgdata(&compute_state)?; - - let start_time = Utc::now(); - let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?; - - let config_time = Utc::now(); - if pspec.spec.mode == ComputeMode::Primary { - if !pspec.spec.skip_pg_catalog_updates { - let pgdata_path = Path::new(&self.pgdata); - // temporarily reset max_cluster_size in config - // to avoid the possibility of hitting the limit, while we are applying config: - // creating new extensions, roles, etc... 
- config::with_compute_ctl_tmp_override( - pgdata_path, - "neon.max_cluster_size=-1", - || { - self.pg_reload_conf()?; - - self.apply_config(&compute_state)?; - - Ok(()) - }, - )?; - - let postgresql_conf_path = pgdata_path.join("postgresql.conf"); - if config::line_in_file( - &postgresql_conf_path, - "neon.disable_logical_replication_subscribers=false", - )? { - info!( - "updated postgresql.conf to set neon.disable_logical_replication_subscribers=false" - ); - } + assert!(pspec.spec.mode == ComputeMode::Primary); + if !pspec.spec.skip_pg_catalog_updates { + let pgdata_path = Path::new(&self.params.pgdata); + // temporarily reset max_cluster_size in config + // to avoid the possibility of hitting the limit, while we are applying config: + // creating new extensions, roles, etc... + config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || { self.pg_reload_conf()?; + + self.apply_config(compute_state)?; + + Ok(()) + })?; + + let postgresql_conf_path = pgdata_path.join("postgresql.conf"); + if config::line_in_file( + &postgresql_conf_path, + "neon.disable_logical_replication_subscribers=false", + )? 
{ + info!( + "updated postgresql.conf to set neon.disable_logical_replication_subscribers=false" + ); } - self.post_apply_config()?; - - let conf = self.get_conn_conf(None); - tokio::task::spawn_blocking(|| { - let res = get_installed_extensions(conf); - match res { - Ok(extensions) => { - info!( - "[NEON_EXT_STAT] {}", - serde_json::to_string(&extensions) - .expect("failed to serialize extensions list") - ); - } - Err(err) => error!("could not get installed extensions: {err:?}"), - } - }); + self.pg_reload_conf()?; } + self.post_apply_config()?; - let startup_end_time = Utc::now(); - { - let mut state = self.state.lock().unwrap(); - state.metrics.start_postgres_ms = config_time - .signed_duration_since(start_time) - .to_std() - .unwrap() - .as_millis() as u64; - state.metrics.config_ms = startup_end_time - .signed_duration_since(config_time) - .to_std() - .unwrap() - .as_millis() as u64; - state.metrics.total_startup_ms = startup_end_time - .signed_duration_since(compute_state.start_time) - .to_std() - .unwrap() - .as_millis() as u64; - } - self.set_status(ComputeStatus::Running); - - info!( - "finished configuration of compute for project {}", - pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None") - ); - - // Log metrics so that we can search for slow operations in logs - let metrics = { - let state = self.state.lock().unwrap(); - state.metrics.clone() - }; - info!(?metrics, "compute start finished"); - - Ok(pg_process) + Ok(()) } /// Update the `last_active` in the shared state, but ensure that it's a more recent one. 
@@ -1241,7 +1613,7 @@ impl ComputeNode { pub fn check_for_core_dumps(&self) -> Result<()> { let core_dump_dir = match std::env::consts::OS { "macos" => Path::new("/cores/"), - _ => Path::new(&self.pgdata), + _ => Path::new(&self.params.pgdata), }; // Collect core dump paths if any @@ -1271,7 +1643,7 @@ impl ComputeNode { // Try first with gdb let backtrace = Command::new("gdb") - .args(["--batch", "-q", "-ex", "bt", &self.pgbin]) + .args(["--batch", "-q", "-ex", "bt", &self.params.pgbin]) .arg(&core_path) .output(); @@ -1348,7 +1720,8 @@ LIMIT 100", ext_path: RemotePath, ) -> Result { let ext_remote_storage = - self.ext_remote_storage + self.params + .ext_remote_storage .as_ref() .ok_or(DownloadError::BadInput(anyhow::anyhow!( "Remote extensions storage is not configured", @@ -1411,7 +1784,7 @@ LIMIT 100", &real_ext_name, &ext_path, ext_remote_storage, - &self.pgbin, + &self.params.pgbin, ) .await .map_err(DownloadError::Other); @@ -1519,7 +1892,7 @@ LIMIT 100", &self, spec: &ComputeSpec, ) -> Result { - if self.ext_remote_storage.is_none() { + if self.params.ext_remote_storage.is_none() { return Ok(RemoteExtensionMetrics { num_ext_downloaded: 0, largest_ext_size: 0, @@ -1570,8 +1943,12 @@ LIMIT 100", let mut download_tasks = Vec::new(); for library in &libs_vec { - let (ext_name, ext_path) = - remote_extensions.get_ext(library, true, &self.build_tag, &self.pgversion)?; + let (ext_name, ext_path) = remote_extensions.get_ext( + library, + true, + &self.params.build_tag, + &self.params.pgversion, + )?; download_tasks.push(self.download_extension(ext_name, ext_path)); } let results = join_all(download_tasks).await; @@ -1648,3 +2025,26 @@ pub fn forward_termination_signal() { kill(pg_pid, Signal::SIGINT).ok(); } } + +// helper trait to call JoinSet::spawn_blocking(f), but propagates the current +// tracing span to the thread. 
+trait JoinSetExt { + fn spawn_blocking_child(&mut self, f: F) -> tokio::task::AbortHandle + where + F: FnOnce() -> T + Send + 'static, + T: Send; +} + +impl JoinSetExt for tokio::task::JoinSet { + fn spawn_blocking_child(&mut self, f: F) -> tokio::task::AbortHandle + where + F: FnOnce() -> T + Send + 'static, + T: Send, + { + let sp = tracing::Span::current(); + self.spawn_blocking(move || { + let _e = sp.enter(); + f() + }) + } +} diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index e8056ec7eb..0760568ff8 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -1,12 +1,16 @@ +use anyhow::Result; +use std::fmt::Write as FmtWrite; use std::fs::{File, OpenOptions}; use std::io; +use std::io::Write; use std::io::prelude::*; use std::path::Path; -use anyhow::Result; -use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption}; +use compute_api::spec::{ComputeAudit, ComputeMode, ComputeSpec, GenericOption}; -use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize, escape_conf_value}; +use crate::pg_helpers::{ + GenericOptionExt, GenericOptionsSearch, PgOptionsSerialize, escape_conf_value, +}; /// Check that `line` is inside a text file and put it there if it is not. /// Create file if it doesn't exist. 
@@ -55,10 +59,20 @@ pub fn write_postgres_conf( writeln!(file, "neon.stripe_size={stripe_size}")?; } if !spec.safekeeper_connstrings.is_empty() { + let mut neon_safekeepers_value = String::new(); + tracing::info!( + "safekeepers_connstrings is not zero, gen: {:?}", + spec.safekeepers_generation + ); + // If generation is given, prepend sk list with g#number: + if let Some(generation) = spec.safekeepers_generation { + write!(neon_safekeepers_value, "g#{}:", generation)?; + } + neon_safekeepers_value.push_str(&spec.safekeeper_connstrings.join(",")); writeln!( file, "neon.safekeepers={}", - escape_conf_value(&spec.safekeeper_connstrings.join(",")) + escape_conf_value(&neon_safekeepers_value) )?; } if let Some(s) = &spec.tenant_id { @@ -126,6 +140,54 @@ pub fn write_postgres_conf( writeln!(file, "# Managed by compute_ctl: end")?; } + // If audit logging is enabled, configure pgaudit. + // + // Note, that this is called after the settings from spec are written. + // This way we always override the settings from the spec + // and don't allow the user or the control plane admin to change them. + if let ComputeAudit::Hipaa = spec.audit_log_level { + writeln!(file, "# Managed by compute_ctl audit settings: begin")?; + // This log level is very verbose + // but this is necessary for HIPAA compliance. + writeln!(file, "pgaudit.log='all'")?; + writeln!(file, "pgaudit.log_parameter=on")?; + // Disable logging of catalog queries + // The catalog doesn't contain sensitive data, so we don't need to audit it. + writeln!(file, "pgaudit.log_catalog=off")?; + // Set log rotation to 5 minutes + // TODO: tune this after performance testing + writeln!(file, "pgaudit.log_rotation_age=5")?; + + // Add audit shared_preload_libraries, if they are not present. + // + // The caller who sets the flag is responsible for ensuring that the necessary + // shared_preload_libraries are present in the compute image, + // otherwise the compute start will fail. 
+ if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") { + let mut extra_shared_preload_libraries = String::new(); + if !libs.contains("pgaudit") { + extra_shared_preload_libraries.push_str(",pgaudit"); + } + if !libs.contains("pgauditlogtofile") { + extra_shared_preload_libraries.push_str(",pgauditlogtofile"); + } + writeln!( + file, + "shared_preload_libraries='{}{}'", + libs, extra_shared_preload_libraries + )?; + } else { + // Typically, this should be unreacheable, + // because we always set at least some shared_preload_libraries in the spec + // but let's handle it explicitly anyway. + writeln!( + file, + "shared_preload_libraries='neon,pgaudit,pgauditlogtofile'" + )?; + } + writeln!(file, "# Managed by compute_ctl audit settings: end")?; + } + writeln!(file, "neon.extension_server_port={}", extension_server_port)?; if spec.drop_subscriptions_before_start { diff --git a/compute_tools/src/config_template/compute_audit_rsyslog_template.conf b/compute_tools/src/config_template/compute_audit_rsyslog_template.conf new file mode 100644 index 0000000000..bef3c36446 --- /dev/null +++ b/compute_tools/src/config_template/compute_audit_rsyslog_template.conf @@ -0,0 +1,10 @@ +# Load imfile module to read log files +module(load="imfile") + +# Input configuration for log files in the specified directory +# Replace {log_directory} with the directory containing the log files +input(type="imfile" File="{log_directory}/*.log" Tag="{tag}" Severity="info" Facility="local0") +global(workDirectory="/var/log") + +# Forward logs to remote syslog server +*.* @@{remote_endpoint} \ No newline at end of file diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index 77e98359ab..b4de786b00 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -253,27 +253,31 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) { } } -// Do request to extension storage proxy, 
i.e. +// Do request to extension storage proxy, e.g., // curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst -// using HHTP GET -// and return the response body as bytes -// +// using HTTP GET and return the response body as bytes. async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result { let uri = format!("{}/{}", ext_remote_storage, ext_path); + let filename = Path::new(ext_path) + .file_name() + .unwrap_or_else(|| std::ffi::OsStr::new("unknown")) + .to_str() + .unwrap_or("unknown") + .to_string(); - info!("Download extension {} from uri {}", ext_path, uri); + info!("Downloading extension file '{}' from uri {}", filename, uri); match do_extension_server_request(&uri).await { Ok(resp) => { info!("Successfully downloaded remote extension data {}", ext_path); REMOTE_EXT_REQUESTS_TOTAL - .with_label_values(&[&StatusCode::OK.to_string()]) + .with_label_values(&[&StatusCode::OK.to_string(), &filename]) .inc(); Ok(resp) } Err((msg, status)) => { REMOTE_EXT_REQUESTS_TOTAL - .with_label_values(&[&status]) + .with_label_values(&[&status, &filename]) .inc(); bail!(msg); } diff --git a/compute_tools/src/http/extract/mod.rs b/compute_tools/src/http/extract/mod.rs index 1b690e444d..589681cfe2 100644 --- a/compute_tools/src/http/extract/mod.rs +++ b/compute_tools/src/http/extract/mod.rs @@ -1,7 +1,9 @@ pub(crate) mod json; pub(crate) mod path; pub(crate) mod query; +pub(crate) mod request_id; pub(crate) use json::Json; pub(crate) use path::Path; pub(crate) use query::Query; +pub(crate) use request_id::RequestId; diff --git a/compute_tools/src/http/extract/request_id.rs b/compute_tools/src/http/extract/request_id.rs new file mode 100644 index 0000000000..d911921a05 --- /dev/null +++ b/compute_tools/src/http/extract/request_id.rs @@ -0,0 +1,86 @@ +use std::{ + fmt::Display, + ops::{Deref, DerefMut}, +}; + +use axum::{extract::FromRequestParts, response::IntoResponse}; +use http::{StatusCode, request::Parts}; + +use 
crate::http::{JsonResponse, headers::X_REQUEST_ID}; + +/// Extract the request ID from the `X-Request-Id` header. +#[derive(Debug, Clone, Default)] +pub(crate) struct RequestId(pub String); + +#[derive(Debug)] +/// Rejection used for [`RequestId`]. +/// +/// Contains one variant for each way the [`RequestId`] extractor can +/// fail. +pub(crate) enum RequestIdRejection { + /// The request is missing the header. + MissingRequestId, + + /// The value of the header is invalid UTF-8. + InvalidUtf8, +} + +impl RequestIdRejection { + pub fn status(&self) -> StatusCode { + match self { + RequestIdRejection::MissingRequestId => StatusCode::INTERNAL_SERVER_ERROR, + RequestIdRejection::InvalidUtf8 => StatusCode::BAD_REQUEST, + } + } + + pub fn message(&self) -> String { + match self { + RequestIdRejection::MissingRequestId => "request ID is missing", + RequestIdRejection::InvalidUtf8 => "request ID is invalid UTF-8", + } + .to_string() + } +} + +impl IntoResponse for RequestIdRejection { + fn into_response(self) -> axum::response::Response { + JsonResponse::error(self.status(), self.message()) + } +} + +impl FromRequestParts for RequestId +where + S: Send + Sync, +{ + type Rejection = RequestIdRejection; + + async fn from_request_parts(parts: &mut Parts, _state: &S) -> Result { + match parts.headers.get(X_REQUEST_ID) { + Some(value) => match value.to_str() { + Ok(request_id) => Ok(Self(request_id.to_string())), + Err(_) => Err(RequestIdRejection::InvalidUtf8), + }, + None => Err(RequestIdRejection::MissingRequestId), + } + } +} + +impl Deref for RequestId { + type Target = String; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for RequestId { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl Display for RequestId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(&self.0) + } +} diff --git a/compute_tools/src/http/headers.rs b/compute_tools/src/http/headers.rs new file mode 100644 index 
0000000000..a11638e203 --- /dev/null +++ b/compute_tools/src/http/headers.rs @@ -0,0 +1,2 @@ +/// Constant for `X-Request-Id` header. +pub const X_REQUEST_ID: &str = "x-request-id"; diff --git a/compute_tools/src/http/middleware/authorize.rs b/compute_tools/src/http/middleware/authorize.rs new file mode 100644 index 0000000000..798dd1179b --- /dev/null +++ b/compute_tools/src/http/middleware/authorize.rs @@ -0,0 +1,145 @@ +use std::{collections::HashSet, net::SocketAddr}; + +use anyhow::{Result, anyhow}; +use axum::{RequestExt, body::Body, extract::ConnectInfo}; +use axum_extra::{ + TypedHeader, + headers::{Authorization, authorization::Bearer}, +}; +use futures::future::BoxFuture; +use http::{Request, Response, StatusCode}; +use jsonwebtoken::{Algorithm, DecodingKey, TokenData, Validation, jwk::JwkSet}; +use serde::Deserialize; +use tower_http::auth::AsyncAuthorizeRequest; +use tracing::warn; + +use crate::http::{JsonResponse, extract::RequestId}; + +#[derive(Clone, Debug, Deserialize)] +pub(in crate::http) struct Claims { + compute_id: String, +} + +#[derive(Clone, Debug)] +pub(in crate::http) struct Authorize { + compute_id: String, + jwks: JwkSet, + validation: Validation, +} + +impl Authorize { + pub fn new(compute_id: String, jwks: JwkSet) -> Self { + let mut validation = Validation::new(Algorithm::EdDSA); + // Nothing is currently required + validation.required_spec_claims = HashSet::new(); + validation.validate_exp = true; + // Unused by the control plane + validation.validate_aud = false; + // Unused by the control plane + validation.validate_nbf = false; + + Self { + compute_id, + jwks, + validation, + } + } +} + +impl AsyncAuthorizeRequest for Authorize { + type RequestBody = Body; + type ResponseBody = Body; + type Future = BoxFuture<'static, Result, Response>>; + + fn authorize(&mut self, mut request: Request) -> Self::Future { + let compute_id = self.compute_id.clone(); + let jwks = self.jwks.clone(); + let validation = self.validation.clone(); + + 
Box::pin(async move { + let request_id = request.extract_parts::().await.unwrap(); + + // TODO: Remove this check after a successful rollout + if jwks.keys.is_empty() { + warn!(%request_id, "Authorization has not been configured"); + + return Ok(request); + } + + let connect_info = request + .extract_parts::>() + .await + .unwrap(); + + // In the event the request is coming from the loopback interface, + // allow all requests + if connect_info.ip().is_loopback() { + warn!(%request_id, "Bypassed authorization because request is coming from the loopback interface"); + + return Ok(request); + } + + let TypedHeader(Authorization(bearer)) = request + .extract_parts::>>() + .await + .map_err(|_| { + JsonResponse::error(StatusCode::BAD_REQUEST, "invalid authorization token") + })?; + + let data = match Self::verify(&jwks, bearer.token(), &validation) { + Ok(claims) => claims, + Err(e) => return Err(JsonResponse::error(StatusCode::UNAUTHORIZED, e)), + }; + + if data.claims.compute_id != compute_id { + return Err(JsonResponse::error( + StatusCode::UNAUTHORIZED, + "invalid claims in authorization token", + )); + } + + // Make claims available to any subsequent middleware or request + // handlers + request.extensions_mut().insert(data.claims); + + Ok(request) + }) + } +} + +impl Authorize { + /// Verify the token using the JSON Web Key set and return the token data. 
+ fn verify(jwks: &JwkSet, token: &str, validation: &Validation) -> Result> { + debug_assert!(!jwks.keys.is_empty()); + + for jwk in jwks.keys.iter() { + let decoding_key = match DecodingKey::from_jwk(jwk) { + Ok(key) => key, + Err(e) => { + warn!( + "Failed to construct decoding key from {}: {}", + jwk.common.key_id.as_ref().unwrap(), + e + ); + + continue; + } + }; + + match jsonwebtoken::decode::(token, &decoding_key, validation) { + Ok(data) => return Ok(data), + Err(e) => { + warn!( + "Failed to decode authorization token using {}: {}", + jwk.common.key_id.as_ref().unwrap(), + e + ); + + continue; + } + } + } + + Err(anyhow!("Failed to verify authorization token")) + } +} diff --git a/compute_tools/src/http/middleware/mod.rs b/compute_tools/src/http/middleware/mod.rs new file mode 100644 index 0000000000..caeeeedfe5 --- /dev/null +++ b/compute_tools/src/http/middleware/mod.rs @@ -0,0 +1 @@ +pub(in crate::http) mod authorize; diff --git a/compute_tools/src/http/mod.rs b/compute_tools/src/http/mod.rs index d182278174..9ecc1b0093 100644 --- a/compute_tools/src/http/mod.rs +++ b/compute_tools/src/http/mod.rs @@ -7,6 +7,8 @@ use serde::Serialize; use tracing::error; mod extract; +mod headers; +mod middleware; mod routes; pub mod server; diff --git a/compute_tools/src/http/routes/configure.rs b/compute_tools/src/http/routes/configure.rs index 63d428fff4..3c5a6a6d41 100644 --- a/compute_tools/src/http/routes/configure.rs +++ b/compute_tools/src/http/routes/configure.rs @@ -22,7 +22,7 @@ pub(in crate::http) async fn configure( State(compute): State>, request: Json, ) -> Response { - if !compute.live_config_allowed { + if !compute.params.live_config_allowed { return JsonResponse::error( StatusCode::PRECONDITION_FAILED, "live configuration is not allowed for this compute node".to_string(), diff --git a/compute_tools/src/http/routes/extension_server.rs b/compute_tools/src/http/routes/extension_server.rs index b0265d1e99..563b73ae65 100644 --- 
a/compute_tools/src/http/routes/extension_server.rs +++ b/compute_tools/src/http/routes/extension_server.rs @@ -18,11 +18,11 @@ pub(in crate::http) struct ExtensionServerParams { /// Download a remote extension. pub(in crate::http) async fn download_extension( Path(filename): Path, - params: Query, + ext_server_params: Query, State(compute): State>, ) -> Response { // Don't even try to download extensions if no remote storage is configured - if compute.ext_remote_storage.is_none() { + if compute.params.ext_remote_storage.is_none() { return JsonResponse::error( StatusCode::PRECONDITION_FAILED, "remote storage is not configured", @@ -46,9 +46,9 @@ pub(in crate::http) async fn download_extension( remote_extensions.get_ext( &filename, - params.is_library, - &compute.build_tag, - &compute.pgversion, + ext_server_params.is_library, + &compute.params.build_tag, + &compute.params.pgversion, ) }; diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs index 7283401bb5..126fa86d1c 100644 --- a/compute_tools/src/http/server.rs +++ b/compute_tools/src/http/server.rs @@ -10,48 +10,58 @@ use axum::middleware::{self, Next}; use axum::response::{IntoResponse, Response}; use axum::routing::{get, post}; use http::StatusCode; +use jsonwebtoken::jwk::JwkSet; use tokio::net::TcpListener; use tower::ServiceBuilder; -use tower_http::request_id::PropagateRequestIdLayer; -use tower_http::trace::TraceLayer; -use tracing::{Span, debug, error, info}; +use tower_http::{ + auth::AsyncRequireAuthorizationLayer, request_id::PropagateRequestIdLayer, trace::TraceLayer, +}; +use tracing::{Span, error, info}; use uuid::Uuid; -use super::routes::{ - check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions, - grants, insights, metrics, metrics_json, status, terminate, +use super::{ + headers::X_REQUEST_ID, + middleware::authorize::Authorize, + routes::{ + check_writability, configure, database_schema, dbs_and_roles, extension_server, 
extensions, + grants, insights, metrics, metrics_json, status, terminate, + }, }; use crate::compute::ComputeNode; -const X_REQUEST_ID: &str = "x-request-id"; - /// `compute_ctl` has two servers: internal and external. The internal server /// binds to the loopback interface and handles communication from clients on /// the compute. The external server is what receives communication from the /// control plane, the metrics scraper, etc. We make the distinction because /// certain routes in `compute_ctl` only need to be exposed to local processes /// like Postgres via the neon extension and local_proxy. -#[derive(Clone, Copy, Debug)] +#[derive(Clone, Debug)] pub enum Server { - Internal(u16), - External(u16), + Internal { + port: u16, + }, + External { + port: u16, + jwks: JwkSet, + compute_id: String, + }, } impl Display for Server { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - Server::Internal(_) => f.write_str("internal"), - Server::External(_) => f.write_str("external"), + Server::Internal { .. } => f.write_str("internal"), + Server::External { .. } => f.write_str("external"), } } } -impl From for Router> { - fn from(server: Server) -> Self { +impl From<&Server> for Router> { + fn from(server: &Server) -> Self { let mut router = Router::>::new(); router = match server { - Server::Internal(_) => { + Server::Internal { .. 
} => { router = router .route( "/extension_server/{*filename}", @@ -69,59 +79,71 @@ impl From for Router> { router } - Server::External(_) => router - .route("/check_writability", post(check_writability::is_writable)) - .route("/configure", post(configure::configure)) - .route("/database_schema", get(database_schema::get_schema_dump)) - .route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects)) - .route("/insights", get(insights::get_insights)) - .route("/metrics", get(metrics::get_metrics)) - .route("/metrics.json", get(metrics_json::get_metrics)) - .route("/status", get(status::get_status)) - .route("/terminate", post(terminate::terminate)), + Server::External { + jwks, compute_id, .. + } => { + let unauthenticated_router = + Router::>::new().route("/metrics", get(metrics::get_metrics)); + + let authenticated_router = Router::>::new() + .route("/check_writability", post(check_writability::is_writable)) + .route("/configure", post(configure::configure)) + .route("/database_schema", get(database_schema::get_schema_dump)) + .route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects)) + .route("/insights", get(insights::get_insights)) + .route("/metrics.json", get(metrics_json::get_metrics)) + .route("/status", get(status::get_status)) + .route("/terminate", post(terminate::terminate)) + .layer(AsyncRequireAuthorizationLayer::new(Authorize::new( + compute_id.clone(), + jwks.clone(), + ))); + + router + .merge(unauthenticated_router) + .merge(authenticated_router) + } }; - router.fallback(Server::handle_404).method_not_allowed_fallback(Server::handle_405).layer( - ServiceBuilder::new() - // Add this middleware since we assume the request ID exists - .layer(middleware::from_fn(maybe_add_request_id_header)) - .layer( - TraceLayer::new_for_http() - .on_request(|request: &http::Request<_>, _span: &Span| { - let request_id = request - .headers() - .get(X_REQUEST_ID) - .unwrap() - .to_str() - .unwrap(); - - match request.uri().path() { - "/metrics" => { - 
debug!(%request_id, "{} {}", request.method(), request.uri()) - } - _ => info!(%request_id, "{} {}", request.method(), request.uri()), - }; - }) - .on_response( - |response: &http::Response<_>, latency: Duration, _span: &Span| { - let request_id = response + router + .fallback(Server::handle_404) + .method_not_allowed_fallback(Server::handle_405) + .layer( + ServiceBuilder::new() + .layer(tower_otel::trace::HttpLayer::server(tracing::Level::INFO)) + // Add this middleware since we assume the request ID exists + .layer(middleware::from_fn(maybe_add_request_id_header)) + .layer( + TraceLayer::new_for_http() + .on_request(|request: &http::Request<_>, _span: &Span| { + let request_id = request .headers() .get(X_REQUEST_ID) .unwrap() .to_str() .unwrap(); - info!( - %request_id, - code = response.status().as_u16(), - latency = latency.as_millis() - ) - }, - ), - ) - .layer(PropagateRequestIdLayer::x_request_id()), - ) - .layer(tower_otel::trace::HttpLayer::server(tracing::Level::INFO)) + info!(%request_id, "{} {}", request.method(), request.uri()); + }) + .on_response( + |response: &http::Response<_>, latency: Duration, _span: &Span| { + let request_id = response + .headers() + .get(X_REQUEST_ID) + .unwrap() + .to_str() + .unwrap(); + + info!( + %request_id, + code = response.status().as_u16(), + latency = latency.as_millis() + ); + }, + ), + ) + .layer(PropagateRequestIdLayer::x_request_id()), + ) } } @@ -145,15 +167,15 @@ impl Server { match self { // TODO: Change this to Ipv6Addr::LOCALHOST when the GitHub runners // allow binding to localhost - Server::Internal(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED), - Server::External(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED), + Server::Internal { .. } => IpAddr::from(Ipv6Addr::UNSPECIFIED), + Server::External { .. } => IpAddr::from(Ipv6Addr::UNSPECIFIED), } } - fn port(self) -> u16 { + fn port(&self) -> u16 { match self { - Server::Internal(port) => port, - Server::External(port) => port, + Server::Internal { port, .. 
} => *port, + Server::External { port, .. } => *port, } } @@ -180,7 +202,9 @@ impl Server { ); } - let router = Router::from(self).with_state(compute); + let router = Router::from(&self) + .with_state(compute) + .into_make_service_with_connect_info::(); if let Err(e) = axum::serve(listener, router).await { error!("compute_ctl {} HTTP server error: {}", self, e); diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index b08df22134..5c78bbcd02 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -21,6 +21,7 @@ mod migration; pub mod monitor; pub mod params; pub mod pg_helpers; +pub mod rsyslog; pub mod spec; mod spec_apply; pub mod swap; diff --git a/compute_tools/src/logger.rs b/compute_tools/src/logger.rs index 3749dfc844..a65614e94e 100644 --- a/compute_tools/src/logger.rs +++ b/compute_tools/src/logger.rs @@ -1,3 +1,5 @@ +use std::collections::HashMap; +use tracing::info; use tracing_subscriber::layer::SubscriberExt; use tracing_subscriber::prelude::*; @@ -42,3 +44,50 @@ pub async fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result pub fn inlinify(s: &str) -> String { s.replace('\n', "\u{200B}") } + +pub fn startup_context_from_env() -> Option { + // Extract OpenTelemetry context for the startup actions from the + // TRACEPARENT and TRACESTATE env variables, and attach it to the current + // tracing context. + // + // This is used to propagate the context for the 'start_compute' operation + // from the neon control plane. This allows linking together the wider + // 'start_compute' operation that creates the compute container, with the + // startup actions here within the container. + // + // There is no standard for passing context in env variables, but a lot of + // tools use TRACEPARENT/TRACESTATE, so we use that convention too. 
See + // https://github.com/open-telemetry/opentelemetry-specification/issues/740 + // + // Switch to the startup context here, and exit it once the startup has + // completed and Postgres is up and running. + // + // If this pod is pre-created without binding it to any particular endpoint + // yet, this isn't the right place to enter the startup context. In that + // case, the control plane should pass the tracing context as part of the + // /configure API call. + // + // NOTE: This is supposed to only cover the *startup* actions. Once + // postgres is configured and up-and-running, we exit this span. Any other + // actions that are performed on incoming HTTP requests, for example, are + // performed in separate spans. + // + // XXX: If the pod is restarted, we perform the startup actions in the same + // context as the original startup actions, which probably doesn't make + // sense. + let mut startup_tracing_carrier: HashMap = HashMap::new(); + if let Ok(val) = std::env::var("TRACEPARENT") { + startup_tracing_carrier.insert("traceparent".to_string(), val); + } + if let Ok(val) = std::env::var("TRACESTATE") { + startup_tracing_carrier.insert("tracestate".to_string(), val); + } + if !startup_tracing_carrier.is_empty() { + use opentelemetry::propagation::TextMapPropagator; + use opentelemetry_sdk::propagation::TraceContextPropagator; + info!("got startup tracing context from env variables"); + Some(TraceContextPropagator::new().extract(&startup_tracing_carrier)) + } else { + None + } +} diff --git a/compute_tools/src/metrics.rs b/compute_tools/src/metrics.rs index bc96e5074c..dab32d5dc1 100644 --- a/compute_tools/src/metrics.rs +++ b/compute_tools/src/metrics.rs @@ -54,9 +54,7 @@ pub(crate) static REMOTE_EXT_REQUESTS_TOTAL: Lazy = Lazy::new(|| register_int_counter_vec!( "compute_ctl_remote_ext_requests_total", "Total number of requests made by compute_ctl to download extensions from S3 proxy by status", - // Do not use any labels like extension name yet. 
- // We can add them later if needed. - &["http_status"] + &["http_status", "filename"] ) .expect("failed to define a metric") }); diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index 248505e473..83318538cd 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -18,7 +18,7 @@ const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500); // should be handled gracefully. fn watch_compute_activity(compute: &ComputeNode) { // Suppose that `connstr` doesn't change - let connstr = compute.connstr.clone(); + let connstr = compute.params.connstr.clone(); let conf = compute.get_conn_conf(Some("compute_ctl:activity_monitor")); // During startup and configuration we connect to every Postgres database, diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 5a2e305e1d..dd8d8e9b8b 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -186,15 +186,40 @@ impl DatabaseExt for Database { /// Postgres SQL queries and DATABASE_URL. pub trait Escaping { fn pg_quote(&self) -> String; + fn pg_quote_dollar(&self) -> (String, String); } impl Escaping for PgIdent { /// This is intended to mimic Postgres quote_ident(), but for simplicity it /// always quotes provided string with `""` and escapes every `"`. /// **Not idempotent**, i.e. if string is already escaped it will be escaped again. + /// N.B. it's not useful for escaping identifiers that are used inside WHERE + /// clause, use `escape_literal()` instead. fn pg_quote(&self) -> String { - let result = format!("\"{}\"", self.replace('"', "\"\"")); - result + format!("\"{}\"", self.replace('"', "\"\"")) + } + + /// This helper is intended to be used for dollar-escaping strings for usage + /// inside PL/pgSQL procedures. In addition to dollar-escaping the string, + /// it also returns a tag that is intended to be used inside the outer + /// PL/pgSQL procedure. If you do not need an outer tag, just discard it. 
+ /// Here we somewhat mimic the logic of Postgres' `pg_get_functiondef()`, + /// + fn pg_quote_dollar(&self) -> (String, String) { + let mut tag: String = "".to_string(); + let mut outer_tag = "x".to_string(); + + // Find the first suitable tag that is not present in the string. + // Postgres' max role/DB name length is 63 bytes, so even in the + // worst case it won't take long. + while self.contains(&format!("${tag}$")) || self.contains(&format!("${outer_tag}$")) { + tag += "x"; + outer_tag = tag.clone() + "x"; + } + + let escaped = format!("${tag}${self}${tag}$"); + + (escaped, outer_tag) } } @@ -226,10 +251,13 @@ pub async fn get_existing_dbs_async( // invalid state. See: // https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9 let rowstream = client + // We use a subquery instead of a fancy `datdba::regrole::text AS owner`, + // because the latter automatically wraps the result in double quotes, + // if the role name contains special characters. .query_raw::( "SELECT datname AS name, - datdba::regrole::text AS owner, + (SELECT rolname FROM pg_roles WHERE oid = datdba) AS owner, NOT datallowconn AS restrict_conn, datconnlimit = - 2 AS invalid FROM diff --git a/compute_tools/src/rsyslog.rs b/compute_tools/src/rsyslog.rs new file mode 100644 index 0000000000..c8fba4fdcd --- /dev/null +++ b/compute_tools/src/rsyslog.rs @@ -0,0 +1,77 @@ +use std::process::Command; +use std::{fs::OpenOptions, io::Write}; + +use anyhow::{Context, Result}; +use tracing::info; + +fn get_rsyslog_pid() -> Option { + let output = Command::new("pgrep") + .arg("rsyslogd") + .output() + .expect("Failed to execute pgrep"); + + if !output.stdout.is_empty() { + let pid = std::str::from_utf8(&output.stdout) + .expect("Invalid UTF-8 in process output") + .trim() + .to_string(); + Some(pid) + } else { + None + } +} + +// Restart rsyslogd to apply the new configuration. +// This is necessary, because there is no other way to reload the rsyslog configuration. 
+// +// Rsyslogd shouldn't lose any messages, because of the restart, +// because it tracks the last read position in the log files +// and will continue reading from that position. +// TODO: test it properly +// +fn restart_rsyslog() -> Result<()> { + let old_pid = get_rsyslog_pid().context("rsyslogd is not running")?; + info!("rsyslogd is running with pid: {}, restart it", old_pid); + + // kill it to restart + let _ = Command::new("pkill") + .arg("rsyslogd") + .output() + .context("Failed to stop rsyslogd")?; + + Ok(()) +} + +pub fn configure_audit_rsyslog( + log_directory: &str, + tag: &str, + remote_endpoint: &str, +) -> Result<()> { + let config_content: String = format!( + include_str!("config_template/compute_audit_rsyslog_template.conf"), + log_directory = log_directory, + tag = tag, + remote_endpoint = remote_endpoint + ); + + info!("rsyslog config_content: {}", config_content); + + let rsyslog_conf_path = "/etc/rsyslog.d/compute_audit_rsyslog.conf"; + let mut file = OpenOptions::new() + .create(true) + .write(true) + .truncate(true) + .open(rsyslog_conf_path)?; + + file.write_all(config_content.as_bytes())?; + + info!( + "rsyslog configuration file {} added successfully. 
Starting rsyslogd", + rsyslog_conf_path + ); + + // start the service, using the configuration + restart_rsyslog()?; + + Ok(()) +} diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs index f9a37c5c98..e5f7aebbf8 100644 --- a/compute_tools/src/spec_apply.rs +++ b/compute_tools/src/spec_apply.rs @@ -6,21 +6,22 @@ use std::sync::Arc; use anyhow::{Context, Result}; use compute_api::responses::ComputeStatus; -use compute_api::spec::{ComputeFeature, ComputeSpec, Database, PgIdent, Role}; +use compute_api::spec::{ComputeAudit, ComputeFeature, ComputeSpec, Database, PgIdent, Role}; use futures::future::join_all; use tokio::sync::RwLock; use tokio_postgres::Client; use tokio_postgres::error::SqlState; use tracing::{Instrument, debug, error, info, info_span, instrument, warn}; -use crate::compute::{ComputeNode, ComputeState, construct_superuser_query}; +use crate::compute::{ComputeNode, ComputeState}; use crate::pg_helpers::{ - DatabaseExt, Escaping, GenericOptionsSearch, RoleExt, escape_literal, get_existing_dbs_async, + DatabaseExt, Escaping, GenericOptionsSearch, RoleExt, get_existing_dbs_async, get_existing_roles_async, }; use crate::spec_apply::ApplySpecPhase::{ - CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateSchemaNeon, - CreateSuperUser, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions, + CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateNeonSuperuser, + CreatePgauditExtension, CreatePgauditlogtofileExtension, CreateSchemaNeon, + DisablePostgresDBPgAudit, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions, HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles, RunInEachDatabase, }; @@ -187,7 +188,7 @@ impl ComputeNode { } for phase in [ - CreateSuperUser, + CreateNeonSuperuser, DropInvalidDatabases, RenameRoles, CreateAndAlterRoles, @@ -277,6 +278,19 @@ impl ComputeNode { phases.push(FinalizeDropLogicalSubscriptions); } + // 
Keep DisablePostgresDBPgAudit phase at the end, + // so that all config operations are audit logged. + match spec.audit_log_level + { + ComputeAudit::Hipaa => { + phases.push(CreatePgauditExtension); + phases.push(CreatePgauditlogtofileExtension); + phases.push(DisablePostgresDBPgAudit); + } + ComputeAudit::Log => { /* not implemented yet */ } + ComputeAudit::Disabled => {} + } + for phase in phases { debug!("Applying phase {:?}", &phase); apply_operations( @@ -455,7 +469,7 @@ pub enum PerDatabasePhase { #[derive(Clone, Debug)] pub enum ApplySpecPhase { - CreateSuperUser, + CreateNeonSuperuser, DropInvalidDatabases, RenameRoles, CreateAndAlterRoles, @@ -463,6 +477,9 @@ pub enum ApplySpecPhase { CreateAndAlterDatabases, CreateSchemaNeon, RunInEachDatabase { db: DB, subphase: PerDatabasePhase }, + CreatePgauditExtension, + CreatePgauditlogtofileExtension, + DisablePostgresDBPgAudit, HandleOtherExtensions, HandleNeonExtension, CreateAvailabilityCheck, @@ -579,14 +596,10 @@ async fn get_operations<'a>( apply_spec_phase: &'a ApplySpecPhase, ) -> Result + 'a + Send>> { match apply_spec_phase { - ApplySpecPhase::CreateSuperUser => { - let query = construct_superuser_query(spec); - - Ok(Box::new(once(Operation { - query, - comment: None, - }))) - } + ApplySpecPhase::CreateNeonSuperuser => Ok(Box::new(once(Operation { + query: include_str!("sql/create_neon_superuser.sql").to_string(), + comment: None, + }))), ApplySpecPhase::DropInvalidDatabases => { let mut ctx = ctx.write().await; let databases = &mut ctx.dbs; @@ -720,14 +733,15 @@ async fn get_operations<'a>( // We do not check whether the DB exists or not, // Postgres will take care of it for us "delete_db" => { + let (db_name, outer_tag) = op.name.pg_quote_dollar(); // In Postgres we can't drop a database if it is a template. // So we need to unset the template flag first, but it could // be a retry, so we could've already dropped the database. // Check that database exists first to make it idempotent. 
let unset_template_query: String = format!( include_str!("sql/unset_template_for_drop_dbs.sql"), - datname_str = escape_literal(&op.name), - datname = &op.name.pg_quote() + datname = db_name, + outer_tag = outer_tag, ); // Use FORCE to drop database even if there are active connections. @@ -834,6 +848,8 @@ async fn get_operations<'a>( comment: None, }, Operation { + // ALL PRIVILEGES grants CREATE, CONNECT, and TEMPORARY on the database + // (see https://www.postgresql.org/docs/current/ddl-priv.html) query: format!( "GRANT ALL PRIVILEGES ON DATABASE {} TO neon_superuser", db.name.pg_quote() @@ -893,9 +909,11 @@ async fn get_operations<'a>( PerDatabasePhase::DropLogicalSubscriptions => { match &db { DB::UserDB(db) => { + let (db_name, outer_tag) = db.name.pg_quote_dollar(); let drop_subscription_query: String = format!( include_str!("sql/drop_subscriptions.sql"), - datname_str = escape_literal(&db.name), + datname_str = db_name, + outer_tag = outer_tag, ); let operations = vec![Operation { @@ -934,6 +952,7 @@ async fn get_operations<'a>( DB::SystemDB => PgIdent::from("cloud_admin").pg_quote(), DB::UserDB(db) => db.owner.pg_quote(), }; + let (escaped_role, outer_tag) = op.name.pg_quote_dollar(); Some(vec![ // This will reassign all dependent objects to the db owner @@ -948,7 +967,9 @@ async fn get_operations<'a>( Operation { query: format!( include_str!("sql/pre_drop_role_revoke_privileges.sql"), - role_name = quoted, + // N.B. 
this has to be properly dollar-escaped with `pg_quote_dollar()` + role_name = escaped_role, + outer_tag = outer_tag, ), comment: None, }, @@ -973,12 +994,14 @@ async fn get_operations<'a>( DB::SystemDB => return Ok(Box::new(empty())), DB::UserDB(db) => db, }; + let (db_owner, outer_tag) = db.owner.pg_quote_dollar(); let operations = vec![ Operation { query: format!( include_str!("sql/set_public_schema_owner.sql"), - db_owner = db.owner.pg_quote() + db_owner = db_owner, + outer_tag = outer_tag, ), comment: None, }, @@ -1098,6 +1121,25 @@ async fn get_operations<'a>( } Ok(Box::new(empty())) } + ApplySpecPhase::CreatePgauditExtension => Ok(Box::new(once(Operation { + query: String::from("CREATE EXTENSION IF NOT EXISTS pgaudit"), + comment: Some(String::from("create pgaudit extensions")), + }))), + ApplySpecPhase::CreatePgauditlogtofileExtension => Ok(Box::new(once(Operation { + query: String::from("CREATE EXTENSION IF NOT EXISTS pgauditlogtofile"), + comment: Some(String::from("create pgauditlogtofile extensions")), + }))), + // Disable pgaudit logging for postgres database. + // Postgres is neon system database used by monitors + // and compute_ctl tuning functions and thus generates a lot of noise. + // We do not consider data stored in this database as sensitive. 
+ ApplySpecPhase::DisablePostgresDBPgAudit => { + let query = "ALTER DATABASE postgres SET pgaudit.log to 'none'"; + Ok(Box::new(once(Operation { + query: query.to_string(), + comment: Some(query.to_string()), + }))) + } ApplySpecPhase::HandleNeonExtension => { let operations = vec![ Operation { diff --git a/compute_tools/src/sql/create_neon_superuser.sql b/compute_tools/src/sql/create_neon_superuser.sql new file mode 100644 index 0000000000..300645627b --- /dev/null +++ b/compute_tools/src/sql/create_neon_superuser.sql @@ -0,0 +1,8 @@ +DO $$ + BEGIN + IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser') + THEN + CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data; + END IF; + END +$$; diff --git a/compute_tools/src/sql/drop_subscriptions.sql b/compute_tools/src/sql/drop_subscriptions.sql index 03e8e158fa..f5d9420130 100644 --- a/compute_tools/src/sql/drop_subscriptions.sql +++ b/compute_tools/src/sql/drop_subscriptions.sql @@ -1,4 +1,4 @@ -DO $$ +DO ${outer_tag}$ DECLARE subname TEXT; BEGIN @@ -9,4 +9,4 @@ BEGIN EXECUTE format('DROP SUBSCRIPTION %I;', subname); END LOOP; END; -$$; +${outer_tag}$; diff --git a/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql b/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql index cdaa7071d3..4342650591 100644 --- a/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql +++ b/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql @@ -1,6 +1,6 @@ SET SESSION ROLE neon_superuser; -DO $$ +DO ${outer_tag}$ DECLARE schema TEXT; revoke_query TEXT; @@ -16,13 +16,15 @@ BEGIN WHERE schema_name IN ('public') LOOP revoke_query := format( - 'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM {role_name} GRANTED BY neon_superuser;', - schema + 'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM %I GRANTED BY neon_superuser;', + schema, + -- N.B. 
this has to be properly dollar-escaped with `pg_quote_dollar()` + {role_name} ); EXECUTE revoke_query; END LOOP; END; -$$; +${outer_tag}$; RESET ROLE; diff --git a/compute_tools/src/sql/set_public_schema_owner.sql b/compute_tools/src/sql/set_public_schema_owner.sql index fd061a713e..dc502c6d2d 100644 --- a/compute_tools/src/sql/set_public_schema_owner.sql +++ b/compute_tools/src/sql/set_public_schema_owner.sql @@ -1,5 +1,4 @@ -DO -$$ +DO ${outer_tag}$ DECLARE schema_owner TEXT; BEGIN @@ -16,8 +15,8 @@ $$ IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin' THEN - ALTER SCHEMA public OWNER TO {db_owner}; + EXECUTE format('ALTER SCHEMA public OWNER TO %I', {db_owner}); END IF; END IF; END -$$; \ No newline at end of file +${outer_tag}$; \ No newline at end of file diff --git a/compute_tools/src/sql/unset_template_for_drop_dbs.sql b/compute_tools/src/sql/unset_template_for_drop_dbs.sql index 6c4343a589..36dc648beb 100644 --- a/compute_tools/src/sql/unset_template_for_drop_dbs.sql +++ b/compute_tools/src/sql/unset_template_for_drop_dbs.sql @@ -1,12 +1,12 @@ -DO $$ +DO ${outer_tag}$ BEGIN IF EXISTS( SELECT 1 FROM pg_catalog.pg_database - WHERE datname = {datname_str} + WHERE datname = {datname} ) THEN - ALTER DATABASE {datname} is_template false; + EXECUTE format('ALTER DATABASE %I is_template false', {datname}); END IF; END -$$; \ No newline at end of file +${outer_tag}$; diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index 4961bc293d..f2d74ff384 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -61,6 +61,23 @@ test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hoor assert_eq!(ident.pg_quote(), "\"\"\"name\"\";\\n select 1;\""); } + #[test] + fn ident_pg_quote_dollar() { + let test_cases = vec![ + ("name", ("$$name$$", "x")), + ("name$$", ("$x$name$$$x$", "xx")), + ("name$$$", ("$x$name$$$$x$", "xx")), + ("name$$$$", 
("$x$name$$$$$x$", "xx")), + ("name$x$", ("$xx$name$x$$xx$", "xxx")), + ]; + + for (input, expected) in test_cases { + let (escaped, tag) = PgIdent::from(input).pg_quote_dollar(); + assert_eq!(escaped, expected.0); + assert_eq!(tag, expected.1); + } + } + #[test] fn generic_options_search() { let generic_options: GenericOptions = Some(vec![ diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index f258025428..375b5d87d0 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -40,6 +40,7 @@ use pageserver_api::models::{ShardParameters, TimelineCreateRequest, TimelineInf use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId}; use postgres_backend::AuthType; use postgres_connection::parse_host_port; +use safekeeper_api::membership::SafekeeperGeneration; use safekeeper_api::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, @@ -596,7 +597,15 @@ struct EndpointStartCmdArgs { #[clap(long = "pageserver-id")] endpoint_pageserver_id: Option, - #[clap(long)] + #[clap( + long, + help = "Safekeepers membership generation to prefix neon.safekeepers with. Normally neon_local sets it on its own, but this option allows to override. Non zero value forces endpoint to use membership configurations." + )] + safekeepers_generation: Option, + #[clap( + long, + help = "List of safekeepers endpoint will talk to. Normally neon_local chooses them on its own, but this option allows to override." 
+ )] safekeepers: Option, #[clap( @@ -617,9 +626,9 @@ struct EndpointStartCmdArgs { )] allow_multiple: bool, - #[clap(short = 't', long, help = "timeout until we fail the command")] - #[arg(default_value = "10s")] - start_timeout: humantime::Duration, + #[clap(short = 't', long, value_parser= humantime::parse_duration, help = "timeout until we fail the command")] + #[arg(default_value = "90s")] + start_timeout: Duration, } #[derive(clap::Args)] @@ -1350,6 +1359,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res let pageserver_id = args.endpoint_pageserver_id; let remote_ext_config = &args.remote_ext_config; + let safekeepers_generation = args.safekeepers_generation.map(SafekeeperGeneration::new); // If --safekeepers argument is given, use only the listed // safekeeper nodes; otherwise all from the env. let safekeepers = if let Some(safekeepers) = parse_safekeepers(&args.safekeepers)? { @@ -1425,11 +1435,13 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res endpoint .start( &auth_token, + safekeepers_generation, safekeepers, pageservers, remote_ext_config.as_ref(), stripe_size.0 as usize, args.create_test_user, + args.start_timeout, ) .await?; } diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 50ccca36fe..b46d616827 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -42,17 +42,19 @@ use std::path::PathBuf; use std::process::Command; use std::str::FromStr; use std::sync::Arc; -use std::time::{Duration, SystemTime, UNIX_EPOCH}; +use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; use anyhow::{Context, Result, anyhow, bail}; use compute_api::requests::ConfigurationRequest; use compute_api::responses::{ComputeCtlConfig, ComputeStatus, ComputeStatusResponse}; use compute_api::spec::{ - Cluster, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent, RemoteExtSpec, Role, + Cluster, ComputeAudit, ComputeFeature, ComputeMode, 
ComputeSpec, Database, PgIdent, + RemoteExtSpec, Role, }; use nix::sys::signal::{Signal, kill}; use pageserver_api::shard::ShardStripeSize; use reqwest::header::CONTENT_TYPE; +use safekeeper_api::membership::SafekeeperGeneration; use serde::{Deserialize, Serialize}; use tracing::debug; use url::Host; @@ -576,14 +578,17 @@ impl Endpoint { Ok(safekeeper_connstrings) } + #[allow(clippy::too_many_arguments)] pub async fn start( &self, auth_token: &Option, + safekeepers_generation: Option, safekeepers: Vec, pageservers: Vec<(Host, u16)>, remote_ext_config: Option<&String>, shard_stripe_size: usize, create_test_user: bool, + start_timeout: Duration, ) -> Result<()> { if self.status() == EndpointStatus::Running { anyhow::bail!("The endpoint is already running"); @@ -655,6 +660,7 @@ impl Endpoint { timeline_id: Some(self.timeline_id), mode: self.mode, pageserver_connstring: Some(pageserver_connstring), + safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()), safekeeper_connstrings, storage_auth_token: auth_token.clone(), remote_extensions, @@ -663,6 +669,7 @@ impl Endpoint { local_proxy_config: None, reconfigure_concurrency: self.reconfigure_concurrency, drop_subscriptions_before_start: self.drop_subscriptions_before_start, + audit_log_level: ComputeAudit::Disabled, }; // this strange code is needed to support respec() in tests @@ -770,17 +777,18 @@ impl Endpoint { std::fs::write(pidfile_path, pid.to_string())?; // Wait for it to start - let mut attempt = 0; const ATTEMPT_INTERVAL: Duration = Duration::from_millis(100); - const MAX_ATTEMPTS: u32 = 10 * 90; // Wait up to 1.5 min + let start_at = Instant::now(); loop { - attempt += 1; match self.get_status().await { Ok(state) => { match state.status { ComputeStatus::Init => { - if attempt == MAX_ATTEMPTS { - bail!("compute startup timed out; still in Init state"); + if Instant::now().duration_since(start_at) > start_timeout { + bail!( + "compute startup timed out {:?}; still in Init state", + start_timeout + 
); } // keep retrying } @@ -807,8 +815,11 @@ impl Endpoint { } } Err(e) => { - if attempt == MAX_ATTEMPTS { - return Err(e).context("timed out waiting to connect to compute_ctl HTTP"); + if Instant::now().duration_since(start_at) > start_timeout { + return Err(e).context(format!( + "timed out {:?} waiting to connect to compute_ctl HTTP", + start_timeout, + )); } } } diff --git a/docker-compose/test_extensions_upgrade.sh b/docker-compose/test_extensions_upgrade.sh index 6e6c41538d..51d1e40802 100755 --- a/docker-compose/test_extensions_upgrade.sh +++ b/docker-compose/test_extensions_upgrade.sh @@ -6,8 +6,11 @@ generate_id() { local -n resvar=$1 printf -v resvar '%08x%08x%08x%08x' $SRANDOM $SRANDOM $SRANDOM $SRANDOM } -if [ -z ${OLD_COMPUTE_TAG+x} ] || [ -z ${NEW_COMPUTE_TAG+x} ] || [ -z "${OLD_COMPUTE_TAG}" ] || [ -z "${NEW_COMPUTE_TAG}" ]; then - echo OLD_COMPUTE_TAG and NEW_COMPUTE_TAG must be defined +echo "${OLD_COMPUTE_TAG}" +echo "${NEW_COMPUTE_TAG}" +echo "${TEST_EXTENSIONS_TAG}" +if [ -z "${OLD_COMPUTE_TAG:-}" ] || [ -z "${NEW_COMPUTE_TAG:-}" ] || [ -z "${TEST_EXTENSIONS_TAG:-}" ]; then + echo OLD_COMPUTE_TAG, NEW_COMPUTE_TAG and TEST_EXTENSIONS_TAG must be set exit 1 fi export PG_VERSION=${PG_VERSION:-16} @@ -58,7 +61,7 @@ function check_timeline() { # Accepts the tag for the compute node and the timeline as parameters. 
function restart_compute() { docker compose down compute compute_is_ready - COMPUTE_TAG=${1} TAG=${OLD_COMPUTE_TAG} TENANT_ID=${tenant_id} TIMELINE_ID=${2} docker compose up --quiet-pull -d --build compute compute_is_ready + COMPUTE_TAG=${1} TENANT_ID=${tenant_id} TIMELINE_ID=${2} docker compose up --quiet-pull -d --build compute compute_is_ready wait_for_ready check_timeline ${2} } @@ -82,7 +85,7 @@ EXTENSIONS='[ {"extname": "pg_repack", "extdir": "pg_repack-src"} ]' EXTNAMES=$(echo ${EXTENSIONS} | jq -r '.[].extname' | paste -sd ' ' -) -TAG=${NEW_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d +COMPUTE_TAG=${NEW_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d wait_for_ready docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression" docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression" @@ -90,7 +93,7 @@ create_extensions "${EXTNAMES}" query="select json_object_agg(extname,extversion) from pg_extension where extname in ('${EXTNAMES// /\',\'}')" new_vers=$(docker compose exec neon-test-extensions psql -Aqt -d contrib_regression -c "$query") docker compose --profile test-extensions down -TAG=${OLD_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d --force-recreate +COMPUTE_TAG=${OLD_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d --force-recreate wait_for_ready docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression" docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression" diff --git a/docs/rfcs/041-rel-sparse-keyspace.md b/docs/rfcs/041-rel-sparse-keyspace.md new file mode 100644 index 0000000000..03e68bd5c1 --- /dev/null +++ b/docs/rfcs/041-rel-sparse-keyspace.md @@ -0,0 +1,201 @@ +# Sparse Keyspace for Relation Directories + +## Summary + +This is an RFC describing a new storage strategy for storing 
relation directories. + +## Motivation + +Postgres maintains a directory structure for databases and relations. In Neon, we store this information +by serializing the directory data in a single key (see `pgdatadir_mapping.rs`). + +```rust +// DbDir: +// 00 00000000 00000000 00000000 00 00000000 + +// RelDir: +// 00 SPCNODE DBNODE 00000000 00 00000001 (Postgres never uses relfilenode 0) +``` + +We have a dedicated structure on the ingestion path to serialize the relation directory into this single key. + +```rust +#[derive(Debug, Serialize, Deserialize, Default)] +pub(crate) struct RelDirectory { + // Set of relations that exist. (relfilenode, forknum) + // + // TODO: Store it as a btree or radix tree or something else that spans multiple + // key-value pairs, if you have a lot of relations + pub(crate) rels: HashSet<(Oid, u8)>, +} +``` + +The current codebase has the following three access patterns for the relation directory. + +1. Check if a relation exists. +2. List all relations. +3. Create/drop a relation. + +For (1), we currently have to get the reldir key, deserialize it, and check whether the relation exists in the +hash set. For (2), we get the reldir key and the hash set. For (3), we need first to get +and deserialize the key, add the new relation record to the hash set, and then serialize it and write it back. + +If we have 100k relations in a database, we would have a 100k-large hash set. Then, every +relation created and dropped would have deserialized and serialized this 100k-large hash set. This makes the +relation create/drop process quadratic. When we check if a relation exists in the ingestion path, +we would have to deserialize this super big 100k-large key before checking if a single relation exists. + +In this RFC, we will propose a new way to store the reldir data in the sparse keyspace and propose how +to seamlessly migrate users to use the new keyspace.
+ +The PoC patch is implemented in [PR10316](https://github.com/neondatabase/neon/pull/10316). + +## Key Mapping + +We will use the recently introduced sparse keyspace to store actual data. Sparse keyspace was proposed in +[038-aux-file-v2.md](038-aux-file-v2.md). The original reldir has one single value of `HashSet<(Oid, u8)>` +for each of the databases (identified as `spcnode, dbnode`). We encode the `Oid` (`relnode, forknum`) +into the key. + +```plain +(REL_DIR_KEY_PREFIX, spcnode, dbnode, relnode, forknum, 1) -> deleted +(REL_DIR_KEY_PREFIX, spcnode, dbnode, relnode, forknum, 1) -> exists +``` + +Assume all reldir data are stored in this new keyspace; the 3 reldir operations we mentioned before can be +implemented as follows. + +1. Check if a relation exists: check if the key maps to "exists". +2. List all relations: scan the sparse keyspace over the `rel_dir_key_prefix`. Extract relnode and forknum from the key. +3. Create/drop a relation: write "exists" or "deleted" to the corresponding key of the relation. The delete tombstone will + be removed during image layer generation upon compaction. + +Note that "exists" and "deleted" will be encoded as a single byte as two variants of an enum. +The mapping is implemented as `rel_tag_sparse_key` in the PoC patch. + +## Changes to Sparse Keyspace + +Previously, we only used sparse keyspaces for the aux files, which did not carry over when branching. The reldir +information needs to be preserved from the parent branch to the child branch. Therefore, the read path needs +to be updated accordingly to accommodate such "inherited sparse keys". This is done in +[PR#10313](https://github.com/neondatabase/neon/pull/10313). + +## Coexistence of the Old and New Keyspaces + +Migrating to the new keyspace will be done gradually: when we flip a config item to enable the new reldir keyspace, the +ingestion path will start to write to the new keyspace and the old reldir data will be kept in the old one.
The read +path needs to combine the data from both keyspaces. + +Theoretically, we could do a rewrite at the startup time that scans all relation directories and copies that data into the +new keyspace. However, this could take a long time, especially if we have thousands of tenants doing the migration +process simultaneously after the pageserver restarts. Therefore, we propose the coexistence strategy so that the +migration can happen seamlessly and imposes no potential downtime for the user. + +With the coexistence assumption, the 3 reldir operations will be implemented as follows: + +1. Check if a relation exists + - Check the new keyspace if the key maps to any value. If it maps to "exists" or "deleted", directly + return it to the user. + - Otherwise, deserialize the old reldir key and get the result. +2. List all relations: scan the sparse keyspace over the `rel_dir_key_prefix` and deserialize the old reldir key. + Combine them to obtain the final result. +3. Create/drop a relation: write "exists" or "deleted" to the corresponding key of the relation into the new keyspace. + - We assume no overwrite of relations will happen (i.e., the user won't create a relation at the same Oid). This will be implemented as a runtime check. + - For relation creation, we add `sparse_reldir_tableX -> exists` to the keyspace. + - For relation drop, we first check if the relation is recorded in the old keyspace. If yes, we deserialize the old reldir key, + remove the relation, and then write it back. Otherwise, we put `sparse_reldir_tableX -> deleted` to the keyspace. + - The delete tombstone will be removed during image layer generation upon compaction. + +This process ensures that the transition will not introduce any downtime and all new updates are written to the new keyspace. The total +amount of data in the storage would be `O(relations_modifications)` and we can guarantee `O(current_relations)` after compaction. 
+There could be some relations that exist in the old reldir key for a long time. Refer to the "Full Migration" section on how to deal +with them. Plus, for relation modifications, it will have `O(old_relations)` complexity until we do the full migration, which gives +us `O(1)` complexity after fully opting in to the sparse keyspace. + +The process also implies that a relation will only exist either in the old reldir key or in the new sparse keyspace. It is not possible +for a table to be recorded in the old reldir key while later having a delete tombstone for it in the sparse keyspace at any LSN. + +We will introduce a config item and an index_part record to record the current status of the migration process. + +- Config item `enable_reldir_v2`: controls whether the ingestion path writes the reldir info into the new keyspace. +- `index_part.json` field `reldir_v2_status`: whether the timeline has written any key into the new reldir keyspace. + +If `enable_reldir_v2` is set to `true` and the timeline ingests the first key into the new reldir keyspace, it will update +`index_part.json` to set `reldir_v2_status` to `Status::Migrating`. Even if `enable_reldir_v2` gets flipped back to +`false` (i.e., when the pageserver restarts and such config isn't persisted), the read/write path will still +read/write to the new keyspace to avoid data inconsistency. This also indicates that the migration is one-way only: +once v2 is enabled, the user cannot go back to v1. + +## Next Steps + +### Full Migration + +This won't be implemented in the project's first phase but might be implemented in the future. Having both v1 and +v2 existing in the system would force us to keep the code to deserialize the old reldir key forever. To entirely deprecate this +code path, we must ensure the timeline has no old reldir data. + +We can trigger a special image layer generation process at the gc-horizon.
The generated image layers will cover several keyspaces: +the old reldir key in each of the databases, and the new reldir sparse keyspace. It will remove the old reldir key while +copying them into the corresponding keys in the sparse keyspace in the resulting image. This special process happens in +the background during compaction. For example, assume this special process is triggered at LSN 0/180. The `create_image_layers` +process discovers the following keys at this LSN. + +```plain +db1/reldir_key -> (table 1, table 2, table 3) +...db1 rel keys +db2/reldir_key -> (table 4, table 5, table 6) +...db2 rel keys +sparse_reldir_db2_table7 -> exists +sparse_reldir_db1_table8 -> deleted +``` + +It will generate the following keys: + +```plain +db1/reldir_key -> () # we have to keep the key because it is part of `collect_keyspace`. +...db1 rel keys +db2/reldir_key -> () +...db2 rel keys + +-- start image layer for the sparse keyspace at sparse_reldir_prefix at LSN 0/180 +sparse_reldir_db1_table1 -> exists +sparse_reldir_db1_table2 -> exists +sparse_reldir_db1_table3 -> exists +sparse_reldir_db2_table4 -> exists +sparse_reldir_db2_table5 -> exists +sparse_reldir_db2_table6 -> exists +sparse_reldir_db2_table7 -> exists +-- end image layer for the sparse keyspace at sparse_reldir_prefix+1 + +# The `sparse_reldir_db1_table8` key gets dropped as part of the image layer generation code for the sparse keyspace. +# Note that the read path will stop reading if a key is not found in the image layer covering the key range so there +# is no correctness issue. +``` + +We must verify that no pending modifications to the old reldir exist in the delta/image layers above the gc-horizon before +we start this process (we can do a vectored read to get the full key history of the old reldir key and ensure there are no more images +above the gc-horizon). Otherwise, it will violate the property that "a relation will only exist either in the old reldir key or +in the new sparse keyspace".
After we run this migration process, we can mark `reldir_v2_status` in the `index_part.json` to +`Status::Migrated`, and the read path won't need to read from the old reldir anymore. Once the status is set to `Migrated`, we +don't need to add the key into `collect_keyspace` and therefore all of them will be removed from all future image layers. + +The migration process can be proactively triggered across all attached/detached tenants to help us fully remove the old reldir code. + +### Consolidate Relation Size Keys + +We have relsize at the end of all relation nodes. + +```plain +// RelSize: +// 00 SPCNODE DBNODE RELNODE FORK FFFFFFFF +``` + +This means that computing logical size requires us to do several single-key gets across the keyspace, +potentially requiring downloading many layer files. We could consolidate them into a single +keyspace, improving logical size calculation performance. + +### Migrate DBDir Keys + +We assume the number of databases created by the users will be small, and therefore, the current way +of storing the database directory would be acceptable. In the future, we could also migrate DBDir keys into +the sparse keyspace to support large amount of databases. diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index 35c580bd37..3300fbf7dd 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -134,8 +134,10 @@ pub struct CatalogObjects { pub databases: Vec, } -#[derive(Debug, Deserialize, Serialize)] +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct ComputeCtlConfig { + /// Set of JSON web keys that the compute can use to authenticate + /// communication from the control plane. 
pub jwks: JwkSet, } diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index d02bfd6814..77f2e1e631 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -101,6 +101,17 @@ pub struct ComputeSpec { pub timeline_id: Option, pub pageserver_connstring: Option, + /// Safekeeper membership config generation. It is put in + /// neon.safekeepers GUC and serves two purposes: + /// 1) Non zero value forces walproposer to use membership configurations. + /// 2) If walproposer wants to update list of safekeepers to connect to + /// taking them from some safekeeper mconf, it should check what value + /// is newer by comparing the generation. + /// + /// Note: it could be SafekeeperGeneration, but this needs linking + /// compute_ctl with postgres_ffi. + #[serde(default)] + pub safekeepers_generation: Option, #[serde(default)] pub safekeeper_connstrings: Vec, @@ -144,6 +155,16 @@ pub struct ComputeSpec { /// over the same replication content from publisher. #[serde(default)] // Default false pub drop_subscriptions_before_start: bool, + + /// Log level for audit logging: + /// + /// Disabled - no audit logging. This is the default. + /// log - log masked statements to the postgres log using pgaudit extension + /// hipaa - log unmasked statements to the file using pgaudit and pgauditlogtofile extension + /// + /// Extensions should be present in shared_preload_libraries + #[serde(default)] + pub audit_log_level: ComputeAudit, } /// Feature flag to signal `compute_ctl` to enable certain experimental functionality. 
@@ -251,6 +272,17 @@ pub enum ComputeMode { Replica, } +/// Log level for audit logging +/// Disabled, log, hipaa +/// Default is Disabled +#[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)] +pub enum ComputeAudit { + #[default] + Disabled, + Log, + Hipaa, +} + #[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)] pub struct Cluster { pub cluster_id: Option, diff --git a/libs/http-utils/Cargo.toml b/libs/http-utils/Cargo.toml index d72e4bd012..d16dac7876 100644 --- a/libs/http-utils/Cargo.toml +++ b/libs/http-utils/Cargo.toml @@ -6,11 +6,8 @@ license.workspace = true [dependencies] anyhow.workspace = true -backtrace.workspace = true bytes.workspace = true -inferno.workspace = true fail.workspace = true -flate2.workspace = true hyper0.workspace = true itertools.workspace = true jemalloc_pprof.workspace = true diff --git a/libs/http-utils/src/endpoint.rs b/libs/http-utils/src/endpoint.rs index 6128113580..f4f93df62f 100644 --- a/libs/http-utils/src/endpoint.rs +++ b/libs/http-utils/src/endpoint.rs @@ -3,8 +3,6 @@ use std::io::Write as _; use std::str::FromStr; use std::time::Duration; -use ::pprof::ProfilerGuardBuilder; -use ::pprof::protos::Message as _; use anyhow::{Context, anyhow}; use bytes::{Bytes, BytesMut}; use hyper::header::{AUTHORIZATION, CONTENT_DISPOSITION, CONTENT_TYPE, HeaderName}; @@ -12,7 +10,8 @@ use hyper::http::HeaderValue; use hyper::{Body, Method, Request, Response}; use metrics::{Encoder, IntCounter, TextEncoder, register_int_counter}; use once_cell::sync::Lazy; -use regex::Regex; +use pprof::ProfilerGuardBuilder; +use pprof::protos::Message as _; use routerify::ext::RequestExt; use routerify::{Middleware, RequestInfo, Router, RouterBuilder}; use tokio::sync::{Mutex, Notify, mpsc}; @@ -22,7 +21,6 @@ use tracing::{Instrument, debug, info, info_span, warn}; use utils::auth::{AuthError, Claims, SwappableJwtAuth}; use crate::error::{ApiError, api_error_handler, route_error_handler}; -use crate::pprof; use 
crate::request::{get_query_param, parse_query_param}; static SERVE_METRICS_COUNT: Lazy = Lazy::new(|| { @@ -449,20 +447,6 @@ pub async fn profile_heap_handler(req: Request) -> Result, Some(format) => return Err(ApiError::BadRequest(anyhow!("invalid format {format}"))), }; - // Functions and mappings to strip when symbolizing pprof profiles. If true, - // also remove child frames. - static STRIP_FUNCTIONS: Lazy> = Lazy::new(|| { - vec![ - (Regex::new("^__rust").unwrap(), false), - (Regex::new("^_start$").unwrap(), false), - (Regex::new("^irallocx_prof").unwrap(), true), - (Regex::new("^prof_alloc_prep").unwrap(), true), - (Regex::new("^std::rt::lang_start").unwrap(), false), - (Regex::new("^std::sys::backtrace::__rust").unwrap(), false), - ] - }); - const STRIP_MAPPINGS: &[&str] = &["libc", "libgcc", "pthread", "vdso"]; - // Obtain profiler handle. let mut prof_ctl = jemalloc_pprof::PROF_CTL .as_ref() @@ -495,45 +479,27 @@ pub async fn profile_heap_handler(req: Request) -> Result, } Format::Pprof => { - let data = tokio::task::spawn_blocking(move || { - let bytes = prof_ctl.dump_pprof()?; - // Symbolize the profile. - // TODO: consider moving this upstream to jemalloc_pprof and avoiding the - // serialization roundtrip. - let profile = pprof::decode(&bytes)?; - let profile = pprof::symbolize(profile)?; - let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS); - pprof::encode(&profile) - }) - .await - .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? - .map_err(ApiError::InternalServerError)?; + let data = tokio::task::spawn_blocking(move || prof_ctl.dump_pprof()) + .await + .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? 
+ .map_err(ApiError::InternalServerError)?; Response::builder() .status(200) .header(CONTENT_TYPE, "application/octet-stream") - .header(CONTENT_DISPOSITION, "attachment; filename=\"heap.pb\"") + .header(CONTENT_DISPOSITION, "attachment; filename=\"heap.pb.gz\"") .body(Body::from(data)) .map_err(|err| ApiError::InternalServerError(err.into())) } Format::Svg => { - let body = tokio::task::spawn_blocking(move || { - let bytes = prof_ctl.dump_pprof()?; - let profile = pprof::decode(&bytes)?; - let profile = pprof::symbolize(profile)?; - let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS); - let mut opts = inferno::flamegraph::Options::default(); - opts.title = "Heap inuse".to_string(); - opts.count_name = "bytes".to_string(); - pprof::flamegraph(profile, &mut opts) - }) - .await - .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? - .map_err(ApiError::InternalServerError)?; + let svg = tokio::task::spawn_blocking(move || prof_ctl.dump_flamegraph()) + .await + .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? 
+ .map_err(ApiError::InternalServerError)?; Response::builder() .status(200) .header(CONTENT_TYPE, "image/svg+xml") - .body(Body::from(body)) + .body(Body::from(svg)) .map_err(|err| ApiError::InternalServerError(err.into())) } } diff --git a/libs/http-utils/src/lib.rs b/libs/http-utils/src/lib.rs index c692a54257..1e9b3c761a 100644 --- a/libs/http-utils/src/lib.rs +++ b/libs/http-utils/src/lib.rs @@ -2,7 +2,6 @@ pub mod endpoint; pub mod error; pub mod failpoints; pub mod json; -pub mod pprof; pub mod request; extern crate hyper0 as hyper; diff --git a/libs/http-utils/src/pprof.rs b/libs/http-utils/src/pprof.rs deleted file mode 100644 index 529017f350..0000000000 --- a/libs/http-utils/src/pprof.rs +++ /dev/null @@ -1,238 +0,0 @@ -use std::borrow::Cow; -use std::collections::{HashMap, HashSet}; -use std::ffi::c_void; -use std::io::Write as _; - -use anyhow::bail; -use flate2::Compression; -use flate2::write::{GzDecoder, GzEncoder}; -use itertools::Itertools as _; -use pprof::protos::{Function, Line, Location, Message as _, Profile}; -use regex::Regex; - -/// Decodes a gzip-compressed Protobuf-encoded pprof profile. -pub fn decode(bytes: &[u8]) -> anyhow::Result { - let mut gz = GzDecoder::new(Vec::new()); - gz.write_all(bytes)?; - Ok(Profile::parse_from_bytes(&gz.finish()?)?) -} - -/// Encodes a pprof profile as gzip-compressed Protobuf. -pub fn encode(profile: &Profile) -> anyhow::Result> { - let mut gz = GzEncoder::new(Vec::new(), Compression::default()); - profile.write_to_writer(&mut gz)?; - Ok(gz.finish()?) -} - -/// Symbolizes a pprof profile using the current binary. -pub fn symbolize(mut profile: Profile) -> anyhow::Result { - if !profile.function.is_empty() { - return Ok(profile); // already symbolized - } - - // Collect function names. - let mut functions: HashMap = HashMap::new(); - let mut strings: HashMap = profile - .string_table - .into_iter() - .enumerate() - .map(|(i, s)| (s, i as i64)) - .collect(); - - // Helper to look up or register a string. 
- let mut string_id = |s: &str| -> i64 { - // Don't use .entry() to avoid unnecessary allocations. - if let Some(id) = strings.get(s) { - return *id; - } - let id = strings.len() as i64; - strings.insert(s.to_string(), id); - id - }; - - for loc in &mut profile.location { - if !loc.line.is_empty() { - continue; - } - - // Resolve the line and function for each location. - backtrace::resolve(loc.address as *mut c_void, |symbol| { - let Some(symbol_name) = symbol.name() else { - return; - }; - - let function_name = format!("{symbol_name:#}"); - let functions_len = functions.len(); - let function_id = functions - .entry(function_name) - .or_insert_with_key(|function_name| { - let function_id = functions_len as u64 + 1; - let system_name = String::from_utf8_lossy(symbol_name.as_bytes()); - let filename = symbol - .filename() - .map(|path| path.to_string_lossy()) - .unwrap_or(Cow::Borrowed("")); - Function { - id: function_id, - name: string_id(function_name), - system_name: string_id(&system_name), - filename: string_id(&filename), - ..Default::default() - } - }) - .id; - loc.line.push(Line { - function_id, - line: symbol.lineno().unwrap_or(0) as i64, - ..Default::default() - }); - }); - } - - // Store the resolved functions, and mark the mapping as resolved. - profile.function = functions.into_values().sorted_by_key(|f| f.id).collect(); - profile.string_table = strings - .into_iter() - .sorted_by_key(|(_, i)| *i) - .map(|(s, _)| s) - .collect(); - - for mapping in &mut profile.mapping { - mapping.has_functions = true; - mapping.has_filenames = true; - } - - Ok(profile) -} - -/// Strips locations (stack frames) matching the given mappings (substring) or function names -/// (regex). The function bool specifies whether child frames should be stripped as well. -/// -/// The string definitions are left behind in the profile for simplicity, to avoid rewriting all -/// string references. 
-pub fn strip_locations( - mut profile: Profile, - mappings: &[&str], - functions: &[(Regex, bool)], -) -> Profile { - // Strip mappings. - let mut strip_mappings: HashSet = HashSet::new(); - - profile.mapping.retain(|mapping| { - let Some(name) = profile.string_table.get(mapping.filename as usize) else { - return true; - }; - if mappings.iter().any(|substr| name.contains(substr)) { - strip_mappings.insert(mapping.id); - return false; - } - true - }); - - // Strip functions. - let mut strip_functions: HashMap = HashMap::new(); - - profile.function.retain(|function| { - let Some(name) = profile.string_table.get(function.name as usize) else { - return true; - }; - for (regex, strip_children) in functions { - if regex.is_match(name) { - strip_functions.insert(function.id, *strip_children); - return false; - } - } - true - }); - - // Strip locations. The bool specifies whether child frames should be stripped too. - let mut strip_locations: HashMap = HashMap::new(); - - profile.location.retain(|location| { - for line in &location.line { - if let Some(strip_children) = strip_functions.get(&line.function_id) { - strip_locations.insert(location.id, *strip_children); - return false; - } - } - if strip_mappings.contains(&location.mapping_id) { - strip_locations.insert(location.id, false); - return false; - } - true - }); - - // Strip sample locations. - for sample in &mut profile.sample { - // First, find the uppermost function with child removal and truncate the stack. - if let Some(truncate) = sample - .location_id - .iter() - .rposition(|id| strip_locations.get(id) == Some(&true)) - { - sample.location_id.drain(..=truncate); - } - // Next, strip any individual frames without child removal. - sample - .location_id - .retain(|id| !strip_locations.contains_key(id)); - } - - profile -} - -/// Generates an SVG flamegraph from a symbolized pprof profile. 
-pub fn flamegraph( - profile: Profile, - opts: &mut inferno::flamegraph::Options, -) -> anyhow::Result> { - if profile.mapping.iter().any(|m| !m.has_functions) { - bail!("profile not symbolized"); - } - - // Index locations, functions, and strings. - let locations: HashMap = - profile.location.into_iter().map(|l| (l.id, l)).collect(); - let functions: HashMap = - profile.function.into_iter().map(|f| (f.id, f)).collect(); - let strings = profile.string_table; - - // Resolve stacks as function names, and sum sample values per stack. Also reverse the stack, - // since inferno expects it bottom-up. - let mut stacks: HashMap, i64> = HashMap::new(); - for sample in profile.sample { - let mut stack = Vec::with_capacity(sample.location_id.len()); - for location in sample.location_id.into_iter().rev() { - let Some(location) = locations.get(&location) else { - bail!("missing location {location}"); - }; - for line in location.line.iter().rev() { - let Some(function) = functions.get(&line.function_id) else { - bail!("missing function {}", line.function_id); - }; - let Some(name) = strings.get(function.name as usize) else { - bail!("missing string {}", function.name); - }; - stack.push(name.as_str()); - } - } - let Some(&value) = sample.value.first() else { - bail!("missing value"); - }; - *stacks.entry(stack).or_default() += value; - } - - // Construct stack lines for inferno. - let lines = stacks - .into_iter() - .map(|(stack, value)| (stack.into_iter().join(";"), value)) - .map(|(stack, value)| format!("{stack} {value}")) - .sorted() - .collect_vec(); - - // Construct the flamegraph. 
- let mut bytes = Vec::new(); - let lines = lines.iter().map(|line| line.as_str()); - inferno::flamegraph::from_lines(opts, lines, &mut bytes)?; - Ok(bytes) -} diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index ea565e7769..749a8acc4e 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1146,6 +1146,15 @@ pub struct TimelineArchivalConfigRequest { pub state: TimelineArchivalState, } +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)] +pub struct TimelinePatchIndexPartRequest { + pub rel_size_migration: Option, + pub gc_compaction_last_completed_lsn: Option, + pub applied_gc_cutoff_lsn: Option, + #[serde(default)] + pub force_index_update: bool, +} + #[derive(Debug, Serialize, Deserialize, Clone)] pub struct TimelinesInfoAndOffloaded { pub timelines: Vec, @@ -1165,6 +1174,21 @@ pub struct OffloadedTimelineInfo { pub archived_at: chrono::DateTime, } +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub enum RelSizeMigration { + /// The tenant is using the old rel_size format. + /// Note that this enum is persisted as `Option` in the index part, so + /// `None` is the same as `Some(RelSizeMigration::Legacy)`. + Legacy, + /// The tenant is migrating to the new rel_size format. Both old and new rel_size format are + /// persisted in the index part. The read path will read both formats and merge them. + Migrating, + /// The tenant has migrated to the new rel_size format. Only the new rel_size format is persisted + /// in the index part, and the read path will not read the old format. + Migrated, +} + /// This represents the output of the "timeline_detail" and "timeline_list" API calls. #[derive(Debug, Serialize, Deserialize, Clone)] pub struct TimelineInfo { @@ -1243,7 +1267,11 @@ pub struct TimelineInfo { // Forward compatibility: a previous version of the pageserver will receive a JSON. 
serde::Deserialize does // not deny unknown fields by default so it's safe to set the field to some value, though it won't be // read. + /// Whether the timeline is archived. pub is_archived: Option, + + /// The status of the rel_size migration. + pub rel_size_migration: Option, } #[derive(Debug, Clone, Serialize, Deserialize)] diff --git a/libs/proxy/tokio-postgres2/src/cancel_query.rs b/libs/proxy/tokio-postgres2/src/cancel_query.rs index b65fb571e6..0bdad0b554 100644 --- a/libs/proxy/tokio-postgres2/src/cancel_query.rs +++ b/libs/proxy/tokio-postgres2/src/cancel_query.rs @@ -34,8 +34,13 @@ where .make_tls_connect(hostname) .map_err(|e| Error::tls(e.into()))?; - let socket = - connect_socket::connect_socket(&config.host, config.port, config.connect_timeout).await?; + let socket = connect_socket::connect_socket( + config.host_addr, + &config.host, + config.port, + config.connect_timeout, + ) + .await?; cancel_query_raw::cancel_query_raw(socket, ssl_mode, tls, process_id, secret_key).await } diff --git a/libs/proxy/tokio-postgres2/src/client.rs b/libs/proxy/tokio-postgres2/src/client.rs index 39b1db75da..c70cb598de 100644 --- a/libs/proxy/tokio-postgres2/src/client.rs +++ b/libs/proxy/tokio-postgres2/src/client.rs @@ -1,5 +1,6 @@ use std::collections::HashMap; use std::fmt; +use std::net::IpAddr; use std::sync::Arc; use std::task::{Context, Poll}; use std::time::Duration; @@ -137,6 +138,7 @@ impl InnerClient { #[derive(Clone, Serialize, Deserialize)] pub struct SocketConfig { + pub host_addr: Option, pub host: Host, pub port: u16, pub connect_timeout: Option, diff --git a/libs/proxy/tokio-postgres2/src/config.rs b/libs/proxy/tokio-postgres2/src/config.rs index 4c25491b67..978d348741 100644 --- a/libs/proxy/tokio-postgres2/src/config.rs +++ b/libs/proxy/tokio-postgres2/src/config.rs @@ -1,5 +1,6 @@ //! Connection configuration. +use std::net::IpAddr; use std::time::Duration; use std::{fmt, str}; @@ -65,6 +66,7 @@ pub enum AuthKeys { /// Connection configuration. 
#[derive(Clone, PartialEq, Eq)] pub struct Config { + pub(crate) host_addr: Option, pub(crate) host: Host, pub(crate) port: u16, @@ -83,6 +85,7 @@ impl Config { /// Creates a new configuration. pub fn new(host: String, port: u16) -> Config { Config { + host_addr: None, host: Host::Tcp(host), port, password: None, @@ -163,6 +166,15 @@ impl Config { self } + pub fn set_host_addr(&mut self, addr: IpAddr) -> &mut Config { + self.host_addr = Some(addr); + self + } + + pub fn get_host_addr(&self) -> Option { + self.host_addr + } + /// Sets the SSL configuration. /// /// Defaults to `prefer`. diff --git a/libs/proxy/tokio-postgres2/src/connect.rs b/libs/proxy/tokio-postgres2/src/connect.rs index d2bd0dfbcd..7c3a358bba 100644 --- a/libs/proxy/tokio-postgres2/src/connect.rs +++ b/libs/proxy/tokio-postgres2/src/connect.rs @@ -1,3 +1,5 @@ +use std::net::IpAddr; + use postgres_protocol2::message::backend::Message; use tokio::net::TcpStream; use tokio::sync::mpsc; @@ -25,13 +27,14 @@ where .make_tls_connect(hostname) .map_err(|e| Error::tls(e.into()))?; - match connect_once(&config.host, config.port, tls, config).await { + match connect_once(config.host_addr, &config.host, config.port, tls, config).await { Ok((client, connection)) => Ok((client, connection)), Err(e) => Err(e), } } async fn connect_once( + host_addr: Option, host: &Host, port: u16, tls: T, @@ -40,7 +43,7 @@ async fn connect_once( where T: TlsConnect, { - let socket = connect_socket(host, port, config.connect_timeout).await?; + let socket = connect_socket(host_addr, host, port, config.connect_timeout).await?; let RawConnection { stream, parameters, @@ -50,6 +53,7 @@ where } = connect_raw(socket, tls, config).await?; let socket_config = SocketConfig { + host_addr, host: host.clone(), port, connect_timeout: config.connect_timeout, diff --git a/libs/proxy/tokio-postgres2/src/connect_socket.rs b/libs/proxy/tokio-postgres2/src/connect_socket.rs index 15411f7ef3..8c7d300451 100644 --- 
a/libs/proxy/tokio-postgres2/src/connect_socket.rs +++ b/libs/proxy/tokio-postgres2/src/connect_socket.rs @@ -1,5 +1,6 @@ use std::future::Future; use std::io; +use std::net::{IpAddr, SocketAddr}; use std::time::Duration; use tokio::net::{self, TcpStream}; @@ -9,15 +10,20 @@ use crate::Error; use crate::config::Host; pub(crate) async fn connect_socket( + host_addr: Option, host: &Host, port: u16, connect_timeout: Option, ) -> Result { match host { Host::Tcp(host) => { - let addrs = net::lookup_host((&**host, port)) - .await - .map_err(Error::connect)?; + let addrs = match host_addr { + Some(addr) => vec![SocketAddr::new(addr, port)], + None => net::lookup_host((&**host, port)) + .await + .map_err(Error::connect)? + .collect(), + }; let mut last_err = None; diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 5020d82adf..ac44300a51 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -15,7 +15,6 @@ arc-swap.workspace = true sentry.workspace = true async-compression.workspace = true anyhow.workspace = true -backtrace.workspace = true bincode.workspace = true bytes.workspace = true camino.workspace = true diff --git a/libs/utils/src/sentry_init.rs b/libs/utils/src/sentry_init.rs index d77dbba087..72d192a591 100644 --- a/libs/utils/src/sentry_init.rs +++ b/libs/utils/src/sentry_init.rs @@ -3,20 +3,24 @@ use std::env; use sentry::ClientInitGuard; pub use sentry::release_name; +use tracing::{error, info}; #[must_use] pub fn init_sentry( release_name: Option>, extra_options: &[(&str, &str)], ) -> Option { - let dsn = env::var("SENTRY_DSN").ok()?; + let Ok(dsn) = env::var("SENTRY_DSN") else { + info!("not initializing Sentry, no SENTRY_DSN given"); + return None; + }; let environment = env::var("SENTRY_ENVIRONMENT").unwrap_or_else(|_| "development".into()); let guard = sentry::init(( dsn, sentry::ClientOptions { - release: release_name, - environment: Some(environment.into()), + release: release_name.clone(), + environment: 
Some(environment.clone().into()), ..Default::default() }, )); @@ -25,5 +29,19 @@ pub fn init_sentry( scope.set_extra(key, value.into()); } }); + + if let Some(dsn) = guard.dsn() { + info!( + "initialized Sentry for project {}, environment {}, release {} (using API {})", + dsn.project_id(), + environment, + release_name.unwrap_or(Cow::Borrowed("None")), + dsn.envelope_api_url(), + ); + } else { + // This should panic during sentry::init(), but we may as well cover it. + error!("failed to initialize Sentry, invalid DSN"); + } + Some(guard) } diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 7330856be4..fa16090170 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -98,6 +98,7 @@ criterion.workspace = true hex-literal.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] } indoc.workspace = true +uuid.workspace = true [[bench]] name = "bench_layer_map" diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index e11af49449..e1444778b8 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -7,7 +7,6 @@ use std::time::Instant; use criterion::measurement::WallTime; use criterion::{BenchmarkGroup, Criterion, black_box, criterion_group, criterion_main}; -use pageserver::keyspace::{KeyPartitioning, KeySpace}; use pageserver::tenant::layer_map::LayerMap; use pageserver::tenant::storage_layer::{LayerName, PersistentLayerDesc}; use pageserver_api::key::Key; @@ -72,41 +71,6 @@ fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> { .collect() } -// Construct a partitioning for testing get_difficulty map when we -// don't have an exact result of `collect_keyspace` to work with. -fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning { - let mut parts = Vec::new(); - - // We add a partition boundary at the start of each image layer, - // no matter what lsn range it covers. 
This is just the easiest - // thing to do. A better thing to do would be to get a real - // partitioning from some database. Even better, remove the need - // for key partitions by deciding where to create image layers - // directly based on a coverage-based difficulty map. - let mut keys: Vec<_> = layer_map - .iter_historic_layers() - .filter_map(|l| { - if l.is_incremental() { - None - } else { - let kr = l.get_key_range(); - Some(kr.start.next()) - } - }) - .collect(); - keys.sort(); - - let mut current_key = Key::from_hex("000000000000000000000000000000000000").unwrap(); - for key in keys { - parts.push(KeySpace { - ranges: vec![current_key..key], - }); - current_key = key; - } - - KeyPartitioning { parts } -} - // Benchmark using metadata extracted from our performance test environment, from // a project where we have run pgbench many timmes. The pgbench database was initialized // between each test run. @@ -148,41 +112,6 @@ fn bench_from_real_project(c: &mut Criterion) { // Choose uniformly distributed queries let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map); - // Choose inputs for get_difficulty_map - let latest_lsn = layer_map - .iter_historic_layers() - .map(|l| l.get_lsn_range().end) - .max() - .unwrap(); - let partitioning = uniform_key_partitioning(&layer_map, latest_lsn); - - // Check correctness of get_difficulty_map - // TODO put this in a dedicated test outside of this mod - { - println!("running correctness check"); - - let now = Instant::now(); - let result_bruteforce = layer_map.get_difficulty_map_bruteforce(latest_lsn, &partitioning); - assert!(result_bruteforce.len() == partitioning.parts.len()); - println!("Finished bruteforce in {:?}", now.elapsed()); - - let now = Instant::now(); - let result_fast = layer_map.get_difficulty_map(latest_lsn, &partitioning, None); - assert!(result_fast.len() == partitioning.parts.len()); - println!("Finished fast in {:?}", now.elapsed()); - - // Assert results are equal. 
Manually iterate for easier debugging. - let zip = std::iter::zip( - &partitioning.parts, - std::iter::zip(result_bruteforce, result_fast), - ); - for (_part, (bruteforce, fast)) in zip { - assert_eq!(bruteforce, fast); - } - - println!("No issues found"); - } - // Define and name the benchmark function let mut group = c.benchmark_group("real_map"); group.bench_function("uniform_queries", |b| { @@ -192,11 +121,6 @@ fn bench_from_real_project(c: &mut Criterion) { } }); }); - group.bench_function("get_difficulty_map", |b| { - b.iter(|| { - layer_map.get_difficulty_map(latest_lsn, &partitioning, Some(3)); - }); - }); group.finish(); } diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index f19b4e964d..37c914c4e9 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -480,6 +480,7 @@ impl Client { tenant_shard_id: TenantShardId, timeline_id: TimelineId, concurrency: Option, + recurse: bool, ) -> Result<()> { let mut path = reqwest::Url::parse(&format!( "{}/v1/tenant/{}/timeline/{}/download_heatmap_layers", @@ -487,6 +488,9 @@ impl Client { )) .expect("Cannot build URL"); + path.query_pairs_mut() + .append_pair("recurse", &format!("{}", recurse)); + if let Some(concurrency) = concurrency { path.query_pairs_mut() .append_pair("concurrency", &format!("{}", concurrency)); diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index ce54bd9c1c..de527e307b 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -33,8 +33,9 @@ use utils::lsn::Lsn; use crate::context::RequestContext; use crate::pgdatadir_mapping::Version; -use crate::tenant::Timeline; use crate::tenant::storage_layer::IoConcurrency; +use crate::tenant::timeline::GetVectoredError; +use crate::tenant::{PageReconstructError, Timeline}; #[derive(Debug, thiserror::Error)] pub enum BasebackupError { @@ -42,6 +43,26 @@ pub enum BasebackupError { Server(#[from] anyhow::Error), #[error("basebackup client error 
{0:#} when {1}")] Client(#[source] io::Error, &'static str), + #[error("basebackup during shutdown")] + Shutdown, +} + +impl From for BasebackupError { + fn from(value: PageReconstructError) -> Self { + match value { + PageReconstructError::Cancelled => BasebackupError::Shutdown, + err => BasebackupError::Server(err.into()), + } + } +} + +impl From for BasebackupError { + fn from(value: GetVectoredError) -> Self { + match value { + GetVectoredError::Cancelled => BasebackupError::Shutdown, + err => BasebackupError::Server(err.into()), + } + } } /// Create basebackup with non-rel data in it. @@ -127,7 +148,7 @@ where timeline .gate .enter() - .map_err(|e| BasebackupError::Server(e.into()))?, + .map_err(|_| BasebackupError::Shutdown)?, ), }; basebackup @@ -323,8 +344,7 @@ where let slru_partitions = self .timeline .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))? + .await? .partition( self.timeline.get_shard_identity(), Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64, @@ -336,11 +356,10 @@ where let blocks = self .timeline .get_vectored(part, self.lsn, self.io_concurrency.clone(), self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))?; + .await?; for (key, block) in blocks { - let block = block.map_err(|e| BasebackupError::Server(e.into()))?; + let block = block?; slru_builder.add_block(&key, block).await?; } } @@ -349,11 +368,8 @@ where let mut min_restart_lsn: Lsn = Lsn::MAX; // Create tablespace directories - for ((spcnode, dbnode), has_relmap_file) in self - .timeline - .list_dbdirs(self.lsn, self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))? + for ((spcnode, dbnode), has_relmap_file) in + self.timeline.list_dbdirs(self.lsn, self.ctx).await? 
{ self.add_dbdir(spcnode, dbnode, has_relmap_file).await?; @@ -362,8 +378,7 @@ where let rels = self .timeline .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))?; + .await?; for &rel in rels.iter() { // Send init fork as main fork to provide well formed empty // contents of UNLOGGED relations. Postgres copies it in @@ -391,8 +406,7 @@ where let aux_files = self .timeline .list_aux_files(self.lsn, self.ctx, self.io_concurrency.clone()) - .await - .map_err(|e| BasebackupError::Server(e.into()))?; + .await?; let aux_scan_time = start_time.elapsed(); let aux_estimated_size = aux_files .values() @@ -451,16 +465,14 @@ where for xid in self .timeline .list_twophase_files(self.lsn, self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))? + .await? { self.add_twophase_file(xid).await?; } let repl_origins = self .timeline .get_replorigins(self.lsn, self.ctx, self.io_concurrency.clone()) - .await - .map_err(|e| BasebackupError::Server(e.into()))?; + .await?; let n_origins = repl_origins.len(); if n_origins != 0 { // @@ -505,8 +517,7 @@ where let nblocks = self .timeline .get_rel_size(src, Version::Lsn(self.lsn), self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))?; + .await?; // If the relation is empty, create an empty file if nblocks == 0 { @@ -532,8 +543,7 @@ where // TODO: investigate using get_vectored for the entire startblk..endblk range. // But this code path is not on the critical path for most basebackups (?). 
.get(rel_block_to_key(src, blknum), self.lsn, self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))?; + .await?; segment_data.extend_from_slice(&img[..]); } @@ -567,8 +577,7 @@ where let img = self .timeline .get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))?; + .await?; if img.len() != dispatch_pgversion!(self.timeline.pg_version, pgv::bindings::SIZEOF_RELMAPFILE) @@ -622,8 +631,7 @@ where && self .timeline .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))? + .await? .is_empty() { return Ok(()); @@ -674,8 +682,7 @@ where let img = self .timeline .get_twophase_file(xid, self.lsn, self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))?; + .await?; let mut buf = BytesMut::new(); buf.extend_from_slice(&img[..]); diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 33ae8c4790..06be873160 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -456,8 +456,8 @@ impl PageServerConf { no_sync: no_sync.unwrap_or(false), enable_read_path_debugging: enable_read_path_debugging.unwrap_or(false), validate_wal_contiguity: validate_wal_contiguity.unwrap_or(false), - load_previous_heatmap: load_previous_heatmap.unwrap_or(false), - generate_unarchival_heatmap: generate_unarchival_heatmap.unwrap_or(false), + load_previous_heatmap: load_previous_heatmap.unwrap_or(true), + generate_unarchival_heatmap: generate_unarchival_heatmap.unwrap_or(true), }; // ------------------------------------------------------------ @@ -491,7 +491,9 @@ impl PageServerConf { #[cfg(test)] pub fn test_repo_dir(test_name: &str) -> Utf8PathBuf { let test_output_dir = std::env::var("TEST_OUTPUT").unwrap_or("../tmp_check".into()); - Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}")) + + let test_id = uuid::Uuid::new_v4(); + 
Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}_{test_id}")) } pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self { diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 12252739fd..0fb9a240d5 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -842,6 +842,12 @@ paths: required: false schema: type: integer + - name: recurse + description: When set, will recurse with the downloads into ancestor timelines + in: query + required: false + schema: + type: boolean post: description: | Download all layers in the specified timeline's heatmap. The `tenant_shard_id` parameter diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index b738d22740..3c0c23a56d 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -37,7 +37,8 @@ use pageserver_api::models::{ TenantShardSplitResponse, TenantSorting, TenantState, TenantWaitLsnRequest, TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineCreateRequestMode, TimelineCreateRequestModeImportPgdata, TimelineGcRequest, TimelineInfo, - TimelinesInfoAndOffloaded, TopTenantShardItem, TopTenantShardsRequest, TopTenantShardsResponse, + TimelinePatchIndexPartRequest, TimelinesInfoAndOffloaded, TopTenantShardItem, + TopTenantShardsRequest, TopTenantShardsResponse, }; use pageserver_api::shard::{ShardCount, TenantShardId}; use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError}; @@ -63,6 +64,7 @@ use crate::tenant::mgr::{ GetActiveTenantError, GetTenantError, TenantManager, TenantMapError, TenantMapInsertError, TenantSlot, TenantSlotError, TenantSlotUpsertError, TenantStateError, UpsertLocationError, }; +use crate::tenant::remote_timeline_client::index::GcCompactionState; use crate::tenant::remote_timeline_client::{ download_index_part, list_remote_tenant_shards, list_remote_timelines, }; @@ -481,6 +483,7 @@ async fn build_timeline_info_common( state, is_archived: 
Some(is_archived), + rel_size_migration: Some(timeline.get_rel_size_v2_status()), walreceiver_status, }; @@ -857,6 +860,75 @@ async fn timeline_archival_config_handler( json_response(StatusCode::OK, ()) } +/// This API is used to patch the index part of a timeline. You must ensure such patches are safe to apply. Use this API as an emergency +/// measure only. +/// +/// Some examples of safe patches: +/// - Increase the gc_cutoff and gc_compaction_cutoff to a larger value in case of a bug that didn't bump the cutoff and cause read errors. +/// - Force set the index part to use reldir v2 (migrating/migrated). +/// +/// Some examples of unsafe patches: +/// - Force set the index part from v2 to v1 (legacy). This will cause the code path to ignore anything written to the new keyspace and cause +/// errors. +/// - Decrease the gc_cutoff without validating the data really exists. It will cause read errors in the background. +async fn timeline_patch_index_part_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + + let request_data: TimelinePatchIndexPartRequest = json_request(&mut request).await?; + check_permission(&request, None)?; // require global permission for this request + let state = get_state(&request); + + async { + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + + if let Some(rel_size_migration) = request_data.rel_size_migration { + timeline + .update_rel_size_v2_status(rel_size_migration) + .map_err(ApiError::InternalServerError)?; + } + + if let Some(gc_compaction_last_completed_lsn) = + request_data.gc_compaction_last_completed_lsn + { + timeline + .update_gc_compaction_state(GcCompactionState { + last_completed_lsn: gc_compaction_last_completed_lsn, + }) + 
.map_err(ApiError::InternalServerError)?; + } + + if let Some(applied_gc_cutoff_lsn) = request_data.applied_gc_cutoff_lsn { + { + let guard = timeline.applied_gc_cutoff_lsn.lock_for_write(); + guard.store_and_unlock(applied_gc_cutoff_lsn); + } + } + + if request_data.force_index_update { + timeline + .remote_client + .force_schedule_index_upload() + .context("force schedule index upload") + .map_err(ApiError::InternalServerError)?; + } + + Ok::<_, ApiError>(()) + } + .instrument(info_span!("timeline_patch_index_part", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug(), + %timeline_id)) + .await?; + + json_response(StatusCode::OK, ()) +} + async fn timeline_detail_handler( request: Request, _cancel: CancellationToken, @@ -1435,6 +1507,7 @@ async fn timeline_download_heatmap_layers_handler( let desired_concurrency = parse_query_param(&request, "concurrency")?.unwrap_or(DEFAULT_CONCURRENCY); + let recurse = parse_query_param(&request, "recurse")?.unwrap_or(false); check_permission(&request, Some(tenant_shard_id.tenant_id))?; @@ -1451,9 +1524,7 @@ async fn timeline_download_heatmap_layers_handler( .unwrap_or(DEFAULT_MAX_CONCURRENCY); let concurrency = std::cmp::min(max_concurrency, desired_concurrency); - timeline - .start_heatmap_layers_download(concurrency, &ctx) - .await?; + timeline.start_heatmap_layers_download(concurrency, recurse, &ctx)?; json_response(StatusCode::ACCEPTED, ()) } @@ -3629,6 +3700,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn", |r| api_handler(r, get_timestamp_of_lsn_handler), ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/patch_index_part", + |r| api_handler(r, timeline_patch_index_part_handler), + ) .post( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/lsn_lease", |r| api_handler(r, lsn_lease_handler), diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index eb8a9b8e24..b5b4e5c91f 100644 --- a/pageserver/src/metrics.rs +++ 
b/pageserver/src/metrics.rs @@ -143,6 +143,29 @@ pub(crate) static LAYERS_PER_READ_GLOBAL: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +pub(crate) static LAYERS_PER_READ_BATCH_GLOBAL: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_layers_per_read_batch_global", + "Layers visited to serve a single read batch (read amplification), regardless of number of reads.", + vec![ + 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0 + ], + ) + .expect("failed to define a metric") +}); + +pub(crate) static LAYERS_PER_READ_AMORTIZED_GLOBAL: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_layers_per_read_amortized_global", + "Layers visited to serve a single read (read amplification). Amortized across a batch: \ + all visited layers are divided by number of reads.", + vec![ + 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0 + ], + ) + .expect("failed to define a metric") +}); + pub(crate) static DELTAS_PER_READ_GLOBAL: Lazy = Lazy::new(|| { // We expect this to be low because of Postgres checkpoints. Let's see if that holds. 
register_histogram!( @@ -4074,6 +4097,8 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) { // histograms [ &LAYERS_PER_READ_GLOBAL, + &LAYERS_PER_READ_BATCH_GLOBAL, + &LAYERS_PER_READ_AMORTIZED_GLOBAL, &DELTAS_PER_READ_GLOBAL, &WAIT_LSN_TIME, &WAL_REDO_TIME, diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 8972515163..ba2ed9dc81 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -392,10 +392,6 @@ impl TimelineHandles { .await .map_err(|e| match e { timeline::handle::GetError::TenantManager(e) => e, - timeline::handle::GetError::TimelineGateClosed => { - trace!("timeline gate closed"); - GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown) - } timeline::handle::GetError::PerTimelineStateShutDown => { trace!("per-timeline state shut down"); GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown) @@ -422,24 +418,33 @@ pub(crate) struct TenantManagerTypes; impl timeline::handle::Types for TenantManagerTypes { type TenantManagerError = GetActiveTimelineError; type TenantManager = TenantManagerWrapper; - type Timeline = Arc; + type Timeline = TenantManagerCacheItem; } -impl timeline::handle::ArcTimeline for Arc { - fn gate(&self) -> &utils::sync::gate::Gate { - &self.gate - } +pub(crate) struct TenantManagerCacheItem { + pub(crate) timeline: Arc, + #[allow(dead_code)] // we store it to keep the gate open + pub(crate) gate_guard: GateGuard, +} +impl std::ops::Deref for TenantManagerCacheItem { + type Target = Arc; + fn deref(&self) -> &Self::Target { + &self.timeline + } +} + +impl timeline::handle::Timeline for TenantManagerCacheItem { fn shard_timeline_id(&self) -> timeline::handle::ShardTimelineId { - Timeline::shard_timeline_id(self) + Timeline::shard_timeline_id(&self.timeline) } fn per_timeline_state(&self) -> &timeline::handle::PerTimelineState { - &self.handles + &self.timeline.handles } fn get_shard_identity(&self) -> &pageserver_api::shard::ShardIdentity { - 
Timeline::get_shard_identity(self) + Timeline::get_shard_identity(&self.timeline) } } @@ -448,7 +453,7 @@ impl timeline::handle::TenantManager for TenantManagerWrappe &self, timeline_id: TimelineId, shard_selector: ShardSelector, - ) -> Result, GetActiveTimelineError> { + ) -> Result { let tenant_id = self.tenant_id.get().expect("we set this in get()"); let timeout = ACTIVE_TENANT_TIMEOUT; let wait_start = Instant::now(); @@ -491,7 +496,20 @@ impl timeline::handle::TenantManager for TenantManagerWrappe let timeline = tenant_shard .get_timeline(timeline_id, true) .map_err(GetActiveTimelineError::Timeline)?; - Ok(timeline) + + let gate_guard = match timeline.gate.enter() { + Ok(guard) => guard, + Err(_) => { + return Err(GetActiveTimelineError::Timeline( + GetTimelineError::ShuttingDown, + )); + } + }; + + Ok(TenantManagerCacheItem { + timeline, + gate_guard, + }) } } @@ -2095,6 +2113,7 @@ impl PageServerHandler { // TODO: passthrough the error site to the final error message? BasebackupError::Client(e, _) => QueryError::Disconnected(ConnectionError::Io(e)), BasebackupError::Server(e) => QueryError::Other(e), + BasebackupError::Shutdown => QueryError::Shutdown, } } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 787b1b895c..8bcc6d58ec 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -21,6 +21,7 @@ use pageserver_api::key::{ slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range, }; use pageserver_api::keyspace::SparseKeySpace; +use pageserver_api::models::RelSizeMigration; use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use pageserver_api::shard::ShardIdentity; @@ -492,7 +493,9 @@ impl Timeline { // Otherwise, read the old reldir keyspace. // TODO: if IndexPart::rel_size_migration is `Migrated`, we only need to read from v2. 
- if self.get_rel_size_v2_enabled() { + if let RelSizeMigration::Migrated | RelSizeMigration::Migrating = + self.get_rel_size_v2_status() + { // fetch directory listing (new) let key = rel_tag_sparse_key(tag.spcnode, tag.dbnode, tag.relnode, tag.forknum); let buf = RelDirExists::decode_option(version.sparse_get(self, key, ctx).await?) @@ -544,7 +547,7 @@ impl Timeline { forknum: *forknum, })); - if !self.get_rel_size_v2_enabled() { + if let RelSizeMigration::Legacy = self.get_rel_size_v2_status() { return Ok(rels_v1); } @@ -599,28 +602,36 @@ impl Timeline { let n_blocks = self .get_slru_segment_size(kind, segno, Version::Lsn(lsn), ctx) .await?; - let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize); - for blkno in 0..n_blocks { - let block = self - .get_slru_page_at_lsn(kind, segno, blkno, lsn, ctx) - .await?; - segment.extend_from_slice(&block[..BLCKSZ as usize]); - } - Ok(segment.freeze()) - } - /// Look up given SLRU page version. - pub(crate) async fn get_slru_page_at_lsn( - &self, - kind: SlruKind, - segno: u32, - blknum: BlockNumber, - lsn: Lsn, - ctx: &RequestContext, - ) -> Result { - assert!(self.tenant_shard_id.is_shard_zero()); - let key = slru_block_to_key(kind, segno, blknum); - self.get(key, lsn, ctx).await + let keyspace = KeySpace::single( + slru_block_to_key(kind, segno, 0)..slru_block_to_key(kind, segno, n_blocks), + ); + + let batches = keyspace.partition( + self.get_shard_identity(), + Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64, + ); + + let io_concurrency = IoConcurrency::spawn_from_conf( + self.conf, + self.gate + .enter() + .map_err(|_| PageReconstructError::Cancelled)?, + ); + + let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize); + for batch in batches.parts { + let blocks = self + .get_vectored(batch, lsn, io_concurrency.clone(), ctx) + .await?; + + for (_key, block) in blocks { + let block = block?; + segment.extend_from_slice(&block[..BLCKSZ as usize]); + } + } + + 
Ok(segment.freeze()) } /// Get size of an SLRU segment @@ -829,19 +840,41 @@ impl Timeline { let nblocks = self .get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx) .await?; - for blknum in (0..nblocks).rev() { - let clog_page = self - .get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn, ctx) + + let keyspace = KeySpace::single( + slru_block_to_key(SlruKind::Clog, segno, 0) + ..slru_block_to_key(SlruKind::Clog, segno, nblocks), + ); + + let batches = keyspace.partition( + self.get_shard_identity(), + Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64, + ); + + let io_concurrency = IoConcurrency::spawn_from_conf( + self.conf, + self.gate + .enter() + .map_err(|_| PageReconstructError::Cancelled)?, + ); + + for batch in batches.parts.into_iter().rev() { + let blocks = self + .get_vectored(batch, probe_lsn, io_concurrency.clone(), ctx) .await?; - if clog_page.len() == BLCKSZ as usize + 8 { - let mut timestamp_bytes = [0u8; 8]; - timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ as usize..]); - let timestamp = TimestampTz::from_be_bytes(timestamp_bytes); + for (_key, clog_page) in blocks.into_iter().rev() { + let clog_page = clog_page?; - match f(timestamp) { - ControlFlow::Break(b) => return Ok(b), - ControlFlow::Continue(()) => (), + if clog_page.len() == BLCKSZ as usize + 8 { + let mut timestamp_bytes = [0u8; 8]; + timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ as usize..]); + let timestamp = TimestampTz::from_be_bytes(timestamp_bytes); + + match f(timestamp) { + ControlFlow::Break(b) => return Ok(b), + ControlFlow::Continue(()) => (), + } } } } @@ -1052,6 +1085,8 @@ impl Timeline { ) -> Result { debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); + fail::fail_point!("skip-logical-size-calculation", |_| { Ok(0) }); + // Fetch list of database dirs and iterate them let buf = self.get(DBDIR_KEY, lsn, ctx).await?; let dbdir = DbDirectory::des(&buf)?; @@ -1718,6 +1753,35 @@ impl DatadirModification<'_> { Ok(()) } + /// 
Returns `true` if the rel_size_v2 write path is enabled. If it is the first time that + /// we enable it, we also need to persist it in `index_part.json`. + pub fn maybe_enable_rel_size_v2(&mut self) -> anyhow::Result { + let status = self.tline.get_rel_size_v2_status(); + let config = self.tline.get_rel_size_v2_enabled(); + match (config, status) { + (false, RelSizeMigration::Legacy) => { + // tenant config didn't enable it and we didn't write any reldir_v2 key yet + Ok(false) + } + (false, RelSizeMigration::Migrating | RelSizeMigration::Migrated) => { + // index_part already persisted that the timeline has enabled rel_size_v2 + Ok(true) + } + (true, RelSizeMigration::Legacy) => { + // The first time we enable it, we need to persist it in `index_part.json` + self.tline + .update_rel_size_v2_status(RelSizeMigration::Migrating)?; + tracing::info!("enabled rel_size_v2"); + Ok(true) + } + (true, RelSizeMigration::Migrating | RelSizeMigration::Migrated) => { + // index_part already persisted that the timeline has enabled rel_size_v2 + // and we don't need to do anything + Ok(true) + } + } + } + /// Store a relmapper file (pg_filenode.map) in the repository pub async fn put_relmap_file( &mut self, @@ -1726,6 +1790,8 @@ impl DatadirModification<'_> { img: Bytes, ctx: &RequestContext, ) -> anyhow::Result<()> { + let v2_enabled = self.maybe_enable_rel_size_v2()?; + // Add it to the directory (if it doesn't exist already) let buf = self.get(DBDIR_KEY, ctx).await?; let mut dbdir = DbDirectory::des(&buf)?; @@ -1746,7 +1812,7 @@ impl DatadirModification<'_> { })?; self.pending_directory_entries .push((DirectoryKind::Rel, MetricsUpdate::Set(0))); - if self.tline.get_rel_size_v2_enabled() { + if v2_enabled { self.pending_directory_entries .push((DirectoryKind::RelV2, MetricsUpdate::Set(0))); } @@ -1898,12 +1964,12 @@ impl DatadirModification<'_> { .context("deserialize db")? 
}; - // Add the new relation to the rel directory entry, and write it back - if !rel_dir.rels.insert((rel.relnode, rel.forknum)) { - return Err(RelationError::AlreadyExists); - } + let v2_enabled = self.maybe_enable_rel_size_v2()?; - if self.tline.get_rel_size_v2_enabled() { + if v2_enabled { + if rel_dir.rels.contains(&(rel.relnode, rel.forknum)) { + return Err(RelationError::AlreadyExists); + } let sparse_rel_dir_key = rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum); // check if the rel_dir_key exists in v2 @@ -1938,6 +2004,10 @@ impl DatadirModification<'_> { self.pending_directory_entries .push((DirectoryKind::RelV2, MetricsUpdate::Add(1))); } else { + // Add the new relation to the rel directory entry, and write it back + if !rel_dir.rels.insert((rel.relnode, rel.forknum)) { + return Err(RelationError::AlreadyExists); + } if !dbdir_exists { self.pending_directory_entries .push((DirectoryKind::Rel, MetricsUpdate::Set(0))) @@ -1951,6 +2021,7 @@ impl DatadirModification<'_> { )), ); } + // Put size let size_key = rel_size_to_key(rel); let buf = nblocks.to_le_bytes(); @@ -2029,6 +2100,7 @@ impl DatadirModification<'_> { drop_relations: HashMap<(u32, u32), Vec>, ctx: &RequestContext, ) -> anyhow::Result<()> { + let v2_enabled = self.maybe_enable_rel_size_v2()?; for ((spc_node, db_node), rel_tags) in drop_relations { let dir_key = rel_dir_to_key(spc_node, db_node); let buf = self.get(dir_key, ctx).await?; @@ -2041,7 +2113,7 @@ impl DatadirModification<'_> { .push((DirectoryKind::Rel, MetricsUpdate::Sub(1))); dirty = true; true - } else if self.tline.get_rel_size_v2_enabled() { + } else if v2_enabled { // The rel is not found in the old reldir key, so we need to check the new sparse keyspace. // Note that a relation can only exist in one of the two keyspaces (guaranteed by the ingestion // logic). 
@@ -2072,7 +2144,7 @@ impl DatadirModification<'_> { // Remove entry from relation size cache self.tline.remove_cached_rel_size(&rel_tag); - // Delete size entry, as well as all blocks + // Delete size entry, as well as all blocks; this is currently a no-op because we haven't implemented tombstones in storage. self.delete(rel_key_range(rel_tag)); } } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index ba1c814c4e..c78d15c9b5 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -31,8 +31,8 @@ use futures::StreamExt; use futures::stream::FuturesUnordered; use itertools::Itertools as _; use once_cell::sync::Lazy; -use pageserver_api::models; pub use pageserver_api::models::TenantState; +use pageserver_api::models::{self, RelSizeMigration}; use pageserver_api::models::{ CompactInfoResponse, LsnLease, TimelineArchivalState, TimelineState, TopTenantShardItem, WalRedoManagerStatus, @@ -1123,6 +1123,7 @@ impl Tenant { CreateTimelineCause::Load, idempotency.clone(), index_part.gc_compaction.clone(), + index_part.rel_size_migration.clone(), )?; let disk_consistent_lsn = timeline.get_disk_consistent_lsn(); anyhow::ensure!( @@ -1153,12 +1154,15 @@ impl Tenant { let mut tline_ending_at = Some((&timeline, timeline.get_last_record_lsn())); while let Some((tline, end_lsn)) = tline_ending_at { let unarchival_heatmap = tline.generate_unarchival_heatmap(end_lsn).await; - if !tline.is_previous_heatmap_active() { + // Another unearchived timeline might have generated a heatmap for this ancestor. + // If the current branch point greater than the previous one use the the heatmap + // we just generated - it should include more layers. + if !tline.should_keep_previous_heatmap(end_lsn) { tline .previous_heatmap .store(Some(Arc::new(unarchival_heatmap))); } else { - tracing::info!("Previous heatmap still active. Dropping unarchival heatmap.") + tracing::info!("Previous heatmap preferred. 
Dropping unarchival heatmap.") } match tline.ancestor_timeline() { @@ -1943,6 +1947,7 @@ impl Tenant { hs.0.remove(&timeline_id).map(|h| PreviousHeatmap::Active { heatmap: h, read_at: hs.1, + end_lsn: None, }) }); part_downloads.spawn( @@ -2446,6 +2451,7 @@ impl Tenant { create_guard, initdb_lsn, None, + None, ) .await } @@ -2501,6 +2507,7 @@ impl Tenant { initdb_lsn: Lsn, pg_version: u32, ctx: &RequestContext, + in_memory_layer_desc: Vec, delta_layer_desc: Vec, image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>, end_lsn: Lsn, @@ -2522,6 +2529,11 @@ impl Tenant { .force_create_image_layer(lsn, images, Some(initdb_lsn), ctx) .await?; } + for in_memory in in_memory_layer_desc { + tline + .force_create_in_memory_layer(in_memory, Some(initdb_lsn), ctx) + .await?; + } let layer_names = tline .layers .read() @@ -2771,6 +2783,7 @@ impl Tenant { timeline_create_guard, initdb_lsn, None, + None, ) .await } @@ -4122,6 +4135,7 @@ impl Tenant { cause: CreateTimelineCause, create_idempotency: CreateTimelineIdempotency, gc_compaction_state: Option, + rel_size_v2_status: Option, ) -> anyhow::Result> { let state = match cause { CreateTimelineCause::Load => { @@ -4154,6 +4168,7 @@ impl Tenant { self.attach_wal_lag_cooldown.clone(), create_idempotency, gc_compaction_state, + rel_size_v2_status, self.cancel.child_token(), ); @@ -4856,6 +4871,7 @@ impl Tenant { timeline_create_guard, start_lsn + 1, Some(Arc::clone(src_timeline)), + Some(src_timeline.get_rel_size_v2_status()), ) .await?; @@ -5129,6 +5145,7 @@ impl Tenant { timeline_create_guard, pgdata_lsn, None, + None, ) .await?; @@ -5207,13 +5224,14 @@ impl Tenant { create_guard: TimelineCreateGuard, start_lsn: Lsn, ancestor: Option>, + rel_size_v2_status: Option, ) -> anyhow::Result> { let tenant_shard_id = self.tenant_shard_id; let resources = self.build_timeline_resources(new_timeline_id); resources .remote_client - .init_upload_queue_for_empty_remote(new_metadata)?; + 
.init_upload_queue_for_empty_remote(new_metadata, rel_size_v2_status.clone())?; let timeline_struct = self .create_timeline_struct( @@ -5225,6 +5243,7 @@ impl Tenant { CreateTimelineCause::Load, create_guard.idempotency.clone(), None, + rel_size_v2_status, ) .context("Failed to create timeline data structure")?; @@ -5913,6 +5932,8 @@ mod tests { #[cfg(feature = "testing")] use timeline::GcInfo; #[cfg(feature = "testing")] + use timeline::InMemoryLayerTestDesc; + #[cfg(feature = "testing")] use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn}; use timeline::{CompactOptions, DeltaLayerTestDesc}; use utils::id::TenantId; @@ -7925,6 +7946,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers Vec::new(), // delta layers vec![(Lsn(0x20), vec![(base_key, test_img("data key 1"))])], // image layers Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN @@ -8012,6 +8034,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers Vec::new(), // delta layers vec![( Lsn(0x20), @@ -8227,6 +8250,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers // delta layers vec![ DeltaLayerTestDesc::new_with_inferred_key_range( @@ -8307,6 +8331,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers // delta layers vec![ DeltaLayerTestDesc::new_with_inferred_key_range( @@ -8380,6 +8405,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers // delta layers vec![ DeltaLayerTestDesc::new_with_inferred_key_range( @@ -8512,6 +8538,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers vec![ DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1), DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2), @@ -8705,6 +8732,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + 
Vec::new(), // in-memory layers vec![DeltaLayerTestDesc::new_with_inferred_key_range( Lsn(0x10)..Lsn(0x40), delta1, @@ -8761,6 +8789,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers Vec::new(), image_layers, end_lsn, @@ -8967,6 +8996,7 @@ mod tests { Lsn(0x08), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers vec![ DeltaLayerTestDesc::new_with_inferred_key_range( Lsn(0x08)..Lsn(0x10), @@ -8985,7 +9015,7 @@ mod tests { delta3, ), ], // delta layers - vec![], // image layers + vec![], // image layers Lsn(0x50), ) .await? @@ -8996,6 +9026,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers vec![ DeltaLayerTestDesc::new_with_inferred_key_range( Lsn(0x10)..Lsn(0x48), @@ -9546,6 +9577,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers vec![ DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1), DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2), @@ -9793,6 +9825,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers vec![ // delta1 and delta 2 only contain a single key but multiple updates DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x30), delta1), @@ -10028,6 +10061,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + vec![], // in-memory layers vec![], // delta layers vec![(Lsn(0x18), img_layer)], // image layers Lsn(0x18), @@ -10274,6 +10308,7 @@ mod tests { baseline_image_layer_lsn, DEFAULT_PG_VERSION, &ctx, + vec![], // in-memory layers vec![DeltaLayerTestDesc::new_with_inferred_key_range( delta_layer_start_lsn..delta_layer_end_lsn, delta_layer_spec, @@ -10305,6 +10340,158 @@ mod tests { Ok(()) } + #[cfg(feature = "testing")] + #[tokio::test] + async fn test_vectored_read_with_image_layer_inside_inmem() -> anyhow::Result<()> { + let harness = + TenantHarness::create("test_vectored_read_with_image_layer_inside_inmem").await?; + let (tenant, 
ctx) = harness.load().await; + + let will_init_keys = [2, 6]; + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("110000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + let mut expected_key_values = HashMap::new(); + + let baseline_image_layer_lsn = Lsn(0x10); + let mut baseline_img_layer = Vec::new(); + for i in 0..5 { + let key = get_key(i); + let value = format!("value {i}@{baseline_image_layer_lsn}"); + + let removed = expected_key_values.insert(key, value.clone()); + assert!(removed.is_none()); + + baseline_img_layer.push((key, Bytes::from(value))); + } + + let nested_image_layer_lsn = Lsn(0x50); + let mut nested_img_layer = Vec::new(); + for i in 5..10 { + let key = get_key(i); + let value = format!("value {i}@{nested_image_layer_lsn}"); + + let removed = expected_key_values.insert(key, value.clone()); + assert!(removed.is_none()); + + nested_img_layer.push((key, Bytes::from(value))); + } + + let frozen_layer = { + let lsn_range = Lsn(0x40)..Lsn(0x60); + let mut data = Vec::new(); + for i in 0..10 { + let key = get_key(i); + let key_in_nested = nested_img_layer + .iter() + .any(|(key_with_img, _)| *key_with_img == key); + let lsn = { + if key_in_nested { + Lsn(nested_image_layer_lsn.0 + 5) + } else { + lsn_range.start + } + }; + + let will_init = will_init_keys.contains(&i); + if will_init { + data.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init("")))); + + expected_key_values.insert(key, "".to_string()); + } else { + let delta = format!("@{lsn}"); + data.push(( + key, + lsn, + Value::WalRecord(NeonWalRecord::wal_append(&delta)), + )); + + expected_key_values + .get_mut(&key) + .expect("An image exists for each key") + .push_str(delta.as_str()); + } + } + + InMemoryLayerTestDesc { + lsn_range, + is_open: false, + data, + } + }; + + let (open_layer, last_record_lsn) = { + let start_lsn = Lsn(0x70); + let mut data = Vec::new(); + let mut end_lsn = Lsn(0); + for i in 0..10 { + let key = get_key(i); + let lsn = 
Lsn(start_lsn.0 + i as u64); + let delta = format!("@{lsn}"); + data.push(( + key, + lsn, + Value::WalRecord(NeonWalRecord::wal_append(&delta)), + )); + + expected_key_values + .get_mut(&key) + .expect("An image exists for each key") + .push_str(delta.as_str()); + + end_lsn = std::cmp::max(end_lsn, lsn); + } + + ( + InMemoryLayerTestDesc { + lsn_range: start_lsn..Lsn::MAX, + is_open: true, + data, + }, + end_lsn, + ) + }; + + assert!( + nested_image_layer_lsn > frozen_layer.lsn_range.start + && nested_image_layer_lsn < frozen_layer.lsn_range.end + ); + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + baseline_image_layer_lsn, + DEFAULT_PG_VERSION, + &ctx, + vec![open_layer, frozen_layer], // in-memory layers + Vec::new(), // delta layers + vec![ + (baseline_image_layer_lsn, baseline_img_layer), + (nested_image_layer_lsn, nested_img_layer), + ], // image layers + last_record_lsn, + ) + .await?; + + let keyspace = KeySpace::single(get_key(0)..get_key(10)); + let results = tline + .get_vectored(keyspace, last_record_lsn, IoConcurrency::sequential(), &ctx) + .await + .expect("No vectored errors"); + for (key, res) in results { + let value = res.expect("No key errors"); + let expected_value = expected_key_values.remove(&key).expect("No unknown keys"); + assert_eq!(value, Bytes::from(expected_value.clone())); + + tracing::info!("key={key} value={expected_value}"); + } + + Ok(()) + } + fn sort_layer_key(k1: &PersistentLayerKey, k2: &PersistentLayerKey) -> std::cmp::Ordering { ( k1.is_delta, @@ -10420,6 +10607,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + vec![], // in-memory layers vec![ DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1), DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2), @@ -10804,6 +10992,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + vec![], // in-memory layers vec![ // delta1/2/4 only contain a single key but multiple updates 
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x28), delta1), @@ -11055,6 +11244,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + vec![], // in-memory layers vec![ // delta1/2/4 only contain a single key but multiple updates DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x28), delta1), diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 59f5a6bd90..2b04e53f10 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -62,8 +62,7 @@ use utils::lsn::Lsn; use super::storage_layer::{LayerVisibilityHint, PersistentLayerDesc}; use crate::context::RequestContext; -use crate::keyspace::KeyPartitioning; -use crate::tenant::storage_layer::InMemoryLayer; +use crate::tenant::storage_layer::{InMemoryLayer, ReadableLayerWeak}; /// /// LayerMap tracks what layers exist on a timeline. @@ -167,7 +166,7 @@ impl Drop for BatchedUpdates<'_> { /// Return value of LayerMap::search #[derive(Eq, PartialEq, Debug, Hash)] pub struct SearchResult { - pub layer: Arc, + pub layer: ReadableLayerWeak, pub lsn_floor: Lsn, } @@ -175,19 +174,37 @@ pub struct SearchResult { /// /// Contains a mapping from a layer description to a keyspace /// accumulator that contains all the keys which intersect the layer -/// from the original search space. Keys that were not found are accumulated -/// in a separate key space accumulator. +/// from the original search space. 
#[derive(Debug)] pub struct RangeSearchResult { pub found: HashMap, - pub not_found: KeySpaceAccum, } impl RangeSearchResult { fn new() -> Self { Self { found: HashMap::new(), - not_found: KeySpaceAccum::new(), + } + } + + fn map_to_in_memory_layer( + in_memory_layer: Option, + range: Range, + ) -> RangeSearchResult { + match in_memory_layer { + Some(inmem) => { + let search_result = SearchResult { + lsn_floor: inmem.get_lsn_range().start, + layer: ReadableLayerWeak::InMemoryLayer(inmem), + }; + + let mut accum = KeySpaceAccum::new(); + accum.add_range(range); + RangeSearchResult { + found: HashMap::from([(search_result, accum)]), + } + } + None => RangeSearchResult::new(), } } } @@ -199,6 +216,7 @@ struct RangeSearchCollector where Iter: Iterator>)>, { + in_memory_layer: Option, delta_coverage: Peekable, image_coverage: Peekable, key_range: Range, @@ -234,10 +252,12 @@ where fn new( key_range: Range, end_lsn: Lsn, + in_memory_layer: Option, delta_coverage: Iter, image_coverage: Iter, ) -> Self { Self { + in_memory_layer, delta_coverage: delta_coverage.peekable(), image_coverage: image_coverage.peekable(), key_range, @@ -266,8 +286,7 @@ where return self.result; } Some(layer_type) => { - // Changes for the range exist. Record anything before the first - // coverage change as not found. + // Changes for the range exist. let coverage_start = layer_type.next_change_at_key(); let range_before = self.key_range.start..coverage_start; self.pad_range(range_before); @@ -297,10 +316,22 @@ where self.result } - /// Mark a range as not found (i.e. no layers intersect it) + /// Map a range which does not intersect any persistent layers to + /// the in-memory layer candidate. 
fn pad_range(&mut self, key_range: Range) { if !key_range.is_empty() { - self.result.not_found.add_range(key_range); + if let Some(ref inmem) = self.in_memory_layer { + let search_result = SearchResult { + layer: ReadableLayerWeak::InMemoryLayer(inmem.clone()), + lsn_floor: inmem.get_lsn_range().start, + }; + + self.result + .found + .entry(search_result) + .or_default() + .add_range(key_range); + } } } @@ -310,6 +341,7 @@ where let selected = LayerMap::select_layer( self.current_delta.clone(), self.current_image.clone(), + self.in_memory_layer.clone(), self.end_lsn, ); @@ -365,6 +397,24 @@ where } } +#[derive(Debug, PartialEq, Eq, Clone, Hash)] +pub struct InMemoryLayerDesc { + handle: InMemoryLayerHandle, + lsn_range: Range, +} + +impl InMemoryLayerDesc { + pub(crate) fn get_lsn_range(&self) -> Range { + self.lsn_range.clone() + } +} + +#[derive(Debug, PartialEq, Eq, Clone, Hash)] +enum InMemoryLayerHandle { + Open, + Frozen(usize), +} + impl LayerMap { /// /// Find the latest layer (by lsn.end) that covers the given @@ -394,69 +444,161 @@ impl LayerMap { /// layer result, or simplify the api to `get_latest_image` and /// `get_latest_delta`, and only call `get_latest_image` once. /// - /// NOTE: This only searches the 'historic' layers, *not* the - /// 'open' and 'frozen' layers! 
 - /// pub fn search(&self, key: Key, end_lsn: Lsn) -> Option { - let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?; + let in_memory_layer = self.search_in_memory_layer(end_lsn); + + let version = match self.historic.get().unwrap().get_version(end_lsn.0 - 1) { + Some(version) => version, + None => { + return in_memory_layer.map(|desc| SearchResult { + lsn_floor: desc.get_lsn_range().start, + layer: ReadableLayerWeak::InMemoryLayer(desc), + }); + } + }; + let latest_delta = version.delta_coverage.query(key.to_i128()); let latest_image = version.image_coverage.query(key.to_i128()); - Self::select_layer(latest_delta, latest_image, end_lsn) + Self::select_layer(latest_delta, latest_image, in_memory_layer, end_lsn) } + /// Select a layer from three potential candidates (in-memory, delta and image layer). + /// The candidates represent the first layer of each type which intersects a key range. + /// + /// Layer types have an implicit priority (image > delta > in-memory). For instance, + /// if we have the option of reading an LSN range from both an image and a delta, we + /// should read from the image.
fn select_layer( delta_layer: Option>, image_layer: Option>, + in_memory_layer: Option, end_lsn: Lsn, ) -> Option { assert!(delta_layer.as_ref().is_none_or(|l| l.is_delta())); assert!(image_layer.as_ref().is_none_or(|l| !l.is_delta())); - match (delta_layer, image_layer) { - (None, None) => None, - (None, Some(image)) => { + match (delta_layer, image_layer, in_memory_layer) { + (None, None, None) => None, + (None, Some(image), None) => { let lsn_floor = image.get_lsn_range().start; Some(SearchResult { - layer: image, + layer: ReadableLayerWeak::PersistentLayer(image), lsn_floor, }) } - (Some(delta), None) => { + (Some(delta), None, None) => { let lsn_floor = delta.get_lsn_range().start; Some(SearchResult { - layer: delta, + layer: ReadableLayerWeak::PersistentLayer(delta), lsn_floor, }) } - (Some(delta), Some(image)) => { + (Some(delta), Some(image), None) => { let img_lsn = image.get_lsn_range().start; let image_is_newer = image.get_lsn_range().end >= delta.get_lsn_range().end; let image_exact_match = img_lsn + 1 == end_lsn; if image_is_newer || image_exact_match { Some(SearchResult { - layer: image, + layer: ReadableLayerWeak::PersistentLayer(image), + lsn_floor: img_lsn, + }) + } else { + // If the delta overlaps with the image in the LSN dimension, do a partial + // up to the image layer. + let lsn_floor = + std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1); + Some(SearchResult { + layer: ReadableLayerWeak::PersistentLayer(delta), + lsn_floor, + }) + } + } + (None, None, Some(inmem)) => { + let lsn_floor = inmem.get_lsn_range().start; + Some(SearchResult { + layer: ReadableLayerWeak::InMemoryLayer(inmem), + lsn_floor, + }) + } + (None, Some(image), Some(inmem)) => { + // If the in-memory layer overlaps with the image in the LSN dimension, do a partial + // up to the image layer. 
+ let img_lsn = image.get_lsn_range().start; + let image_is_newer = image.get_lsn_range().end >= inmem.get_lsn_range().end; + let image_exact_match = img_lsn + 1 == end_lsn; + if image_is_newer || image_exact_match { + Some(SearchResult { + layer: ReadableLayerWeak::PersistentLayer(image), lsn_floor: img_lsn, }) } else { let lsn_floor = - std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1); + std::cmp::max(inmem.get_lsn_range().start, image.get_lsn_range().start + 1); Some(SearchResult { - layer: delta, + layer: ReadableLayerWeak::InMemoryLayer(inmem), lsn_floor, }) } } + (Some(delta), None, Some(inmem)) => { + // Overlaps between delta and in-memory layers are not a valid + // state, but we handle them here for completeness. + let delta_end = delta.get_lsn_range().end; + let delta_is_newer = delta_end >= inmem.get_lsn_range().end; + let delta_exact_match = delta_end == end_lsn; + if delta_is_newer || delta_exact_match { + Some(SearchResult { + lsn_floor: delta.get_lsn_range().start, + layer: ReadableLayerWeak::PersistentLayer(delta), + }) + } else { + // If the in-memory layer overlaps with the delta in the LSN dimension, do a partial + // up to the delta layer. + let lsn_floor = + std::cmp::max(inmem.get_lsn_range().start, delta.get_lsn_range().end); + Some(SearchResult { + layer: ReadableLayerWeak::InMemoryLayer(inmem), + lsn_floor, + }) + } + } + (Some(delta), Some(image), Some(inmem)) => { + // Determine the preferred persistent layer without taking the in-memory layer + // into consideration. + let persistent_res = + Self::select_layer(Some(delta.clone()), Some(image.clone()), None, end_lsn) + .unwrap(); + let persistent_l = match persistent_res.layer { + ReadableLayerWeak::PersistentLayer(l) => l, + ReadableLayerWeak::InMemoryLayer(_) => unreachable!(), + }; + + // Now handle the in-memory layer overlaps. 
+ let inmem_res = if persistent_l.is_delta() { + Self::select_layer(Some(persistent_l), None, Some(inmem.clone()), end_lsn) + .unwrap() + } else { + Self::select_layer(None, Some(persistent_l), Some(inmem.clone()), end_lsn) + .unwrap() + }; + + Some(SearchResult { + layer: inmem_res.layer, + // Use the more restrictive LSN floor + lsn_floor: std::cmp::max(persistent_res.lsn_floor, inmem_res.lsn_floor), + }) + } } } pub fn range_search(&self, key_range: Range, end_lsn: Lsn) -> RangeSearchResult { + let in_memory_layer = self.search_in_memory_layer(end_lsn); + let version = match self.historic.get().unwrap().get_version(end_lsn.0 - 1) { Some(version) => version, None => { - let mut result = RangeSearchResult::new(); - result.not_found.add_range(key_range); - return result; + return RangeSearchResult::map_to_in_memory_layer(in_memory_layer, key_range); } }; @@ -464,7 +606,13 @@ impl LayerMap { let delta_changes = version.delta_coverage.range_overlaps(&raw_range); let image_changes = version.image_coverage.range_overlaps(&raw_range); - let collector = RangeSearchCollector::new(key_range, end_lsn, delta_changes, image_changes); + let collector = RangeSearchCollector::new( + key_range, + end_lsn, + in_memory_layer, + delta_changes, + image_changes, + ); collector.collect() } @@ -571,17 +719,36 @@ impl LayerMap { } /// Get a ref counted pointer for the first in memory layer that matches the provided predicate. 
- pub fn find_in_memory_layer(&self, mut pred: Pred) -> Option> - where - Pred: FnMut(&Arc) -> bool, - { + pub(crate) fn search_in_memory_layer(&self, below: Lsn) -> Option { + let is_below = |l: &Arc| { + let start_lsn = l.get_lsn_range().start; + below > start_lsn + }; + if let Some(open) = &self.open_layer { - if pred(open) { - return Some(open.clone()); + if is_below(open) { + return Some(InMemoryLayerDesc { + handle: InMemoryLayerHandle::Open, + lsn_range: open.get_lsn_range(), + }); } } - self.frozen_layers.iter().rfind(|l| pred(l)).cloned() + self.frozen_layers + .iter() + .enumerate() + .rfind(|(_idx, l)| is_below(l)) + .map(|(idx, l)| InMemoryLayerDesc { + handle: InMemoryLayerHandle::Frozen(idx), + lsn_range: l.get_lsn_range(), + }) + } + + pub(crate) fn in_memory_layer(&self, desc: &InMemoryLayerDesc) -> Arc { + match desc.handle { + InMemoryLayerHandle::Open => self.open_layer.as_ref().unwrap().clone(), + InMemoryLayerHandle::Frozen(idx) => self.frozen_layers[idx].clone(), + } } /// @@ -737,136 +904,6 @@ impl LayerMap { max_stacked_deltas } - /// Count how many reimage-worthy layers we need to visit for given key-lsn pair. - /// - /// The `partition_range` argument is used as context for the reimage-worthiness decision. - /// - /// Used as a helper for correctness checks only. Performance not critical. - pub fn get_difficulty(&self, lsn: Lsn, key: Key, partition_range: &Range) -> usize { - match self.search(key, lsn) { - Some(search_result) => { - if search_result.layer.is_incremental() { - (Self::is_reimage_worthy(&search_result.layer, partition_range) as usize) - + self.get_difficulty(search_result.lsn_floor, key, partition_range) - } else { - 0 - } - } - None => 0, - } - } - - /// Used for correctness checking. Results are expected to be identical to - /// self.get_difficulty_map. Assumes self.search is correct. 
- pub fn get_difficulty_map_bruteforce( - &self, - lsn: Lsn, - partitioning: &KeyPartitioning, - ) -> Vec { - // Looking at the difficulty as a function of key, it could only increase - // when a delta layer starts or an image layer ends. Therefore it's sufficient - // to check the difficulties at: - // - the key.start for each non-empty part range - // - the key.start for each delta - // - the key.end for each image - let keys_iter: Box> = { - let mut keys: Vec = self - .iter_historic_layers() - .map(|layer| { - if layer.is_incremental() { - layer.get_key_range().start - } else { - layer.get_key_range().end - } - }) - .collect(); - keys.sort(); - Box::new(keys.into_iter()) - }; - let mut keys_iter = keys_iter.peekable(); - - // Iter the partition and keys together and query all the necessary - // keys, computing the max difficulty for each part. - partitioning - .parts - .iter() - .map(|part| { - let mut difficulty = 0; - // Partition ranges are assumed to be sorted and disjoint - // TODO assert it - for range in &part.ranges { - if !range.is_empty() { - difficulty = - std::cmp::max(difficulty, self.get_difficulty(lsn, range.start, range)); - } - while let Some(key) = keys_iter.peek() { - if key >= &range.end { - break; - } - let key = keys_iter.next().unwrap(); - if key < range.start { - continue; - } - difficulty = - std::cmp::max(difficulty, self.get_difficulty(lsn, key, range)); - } - } - difficulty - }) - .collect() - } - - /// For each part of a keyspace partitioning, return the maximum number of layers - /// that would be needed for page reconstruction in that part at the given LSN. - /// - /// If `limit` is provided we don't try to count above that number. - /// - /// This method is used to decide where to create new image layers. Computing the - /// result for the entire partitioning at once allows this function to be more - /// efficient, and further optimization is possible by using iterators instead, - /// to allow early return. 
- /// - /// TODO actually use this method instead of count_deltas. Currently we only use - /// it for benchmarks. - pub fn get_difficulty_map( - &self, - lsn: Lsn, - partitioning: &KeyPartitioning, - limit: Option, - ) -> Vec { - // TODO This is a naive implementation. Perf improvements to do: - // 1. Instead of calling self.image_coverage and self.count_deltas, - // iterate the image and delta coverage only once. - partitioning - .parts - .iter() - .map(|part| { - let mut difficulty = 0; - for range in &part.ranges { - if limit == Some(difficulty) { - break; - } - for (img_range, last_img) in self.image_coverage(range, lsn) { - if limit == Some(difficulty) { - break; - } - let img_lsn = if let Some(last_img) = last_img { - last_img.get_lsn_range().end - } else { - Lsn(0) - }; - - if img_lsn < lsn { - let num_deltas = self.count_deltas(&img_range, &(img_lsn..lsn), limit); - difficulty = std::cmp::max(difficulty, num_deltas); - } - } - } - difficulty - }) - .collect() - } - /// Return all L0 delta layers pub fn level0_deltas(&self) -> &Vec> { &self.l0_delta_layers @@ -1069,6 +1106,10 @@ mod tests { use std::collections::HashMap; use std::path::PathBuf; + use crate::{ + DEFAULT_PG_VERSION, + tenant::{harness::TenantHarness, storage_layer::LayerName}, + }; use pageserver_api::key::DBDIR_KEY; use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; use utils::id::{TenantId, TimelineId}; @@ -1076,7 +1117,6 @@ mod tests { use super::*; use crate::tenant::IndexPart; - use crate::tenant::storage_layer::LayerName; #[derive(Clone)] struct LayerDesc { @@ -1101,7 +1141,6 @@ mod tests { } fn assert_range_search_result_eq(lhs: RangeSearchResult, rhs: RangeSearchResult) { - assert_eq!(lhs.not_found.to_keyspace(), rhs.not_found.to_keyspace()); let lhs: HashMap = lhs .found .into_iter() @@ -1127,17 +1166,12 @@ mod tests { let mut key = key_range.start; while key != key_range.end { let res = layer_map.search(key, end_lsn); - match res { - Some(res) => { - range_search_result - 
.found - .entry(res) - .or_default() - .add_key(key); - } - None => { - range_search_result.not_found.add_key(key); - } + if let Some(res) = res { + range_search_result + .found + .entry(res) + .or_default() + .add_key(key); } key = key.next(); @@ -1152,20 +1186,49 @@ mod tests { let range = Key::from_i128(100)..Key::from_i128(200); let res = layer_map.range_search(range.clone(), Lsn(100)); - assert_eq!( - res.not_found.to_keyspace(), - KeySpace { - ranges: vec![range] - } - ); + assert_range_search_result_eq(res, RangeSearchResult::new()); } - #[test] - fn ranged_search() { + #[tokio::test] + async fn ranged_search() { + let harness = TenantHarness::create("ranged_search").await.unwrap(); + let (tenant, ctx) = harness.load().await; + let timeline_id = TimelineId::generate(); + // Create the timeline such that the in-memory layers can be written + // to the timeline directory. + tenant + .create_test_timeline(timeline_id, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + let gate = utils::sync::gate::Gate::default(); + let add_in_memory_layer = async |layer_map: &mut LayerMap, lsn_range: Range| { + let layer = InMemoryLayer::create( + harness.conf, + timeline_id, + harness.tenant_shard_id, + lsn_range.start, + &gate, + &ctx, + ) + .await + .unwrap(); + + layer.freeze(lsn_range.end).await; + + layer_map.frozen_layers.push_back(Arc::new(layer)); + }; + + let in_memory_layer_configurations = [ + vec![], + // Overlaps with the top-most image + vec![Lsn(35)..Lsn(50)], + ]; + let layers = vec![ LayerDesc { key_range: Key::from_i128(15)..Key::from_i128(50), - lsn_range: Lsn(0)..Lsn(5), + lsn_range: Lsn(5)..Lsn(6), is_delta: false, }, LayerDesc { @@ -1185,19 +1248,27 @@ mod tests { }, LayerDesc { key_range: Key::from_i128(35)..Key::from_i128(40), - lsn_range: Lsn(35)..Lsn(40), + lsn_range: Lsn(40)..Lsn(41), is_delta: false, }, ]; - let layer_map = create_layer_map(layers.clone()); - for start in 0..60 { - for end in (start + 1)..60 { - let range = 
Key::from_i128(start)..Key::from_i128(end); - let result = layer_map.range_search(range.clone(), Lsn(100)); - let expected = brute_force_range_search(&layer_map, range, Lsn(100)); + let mut layer_map = create_layer_map(layers.clone()); + for in_memory_layers in in_memory_layer_configurations { + for in_mem_layer_range in in_memory_layers { + add_in_memory_layer(&mut layer_map, in_mem_layer_range).await; + } - assert_range_search_result_eq(result, expected); + for start in 0..60 { + for end in (start + 1)..60 { + let range = Key::from_i128(start)..Key::from_i128(end); + let result = layer_map.range_search(range.clone(), Lsn(100)); + let expected = brute_force_range_search(&layer_map, range, Lsn(100)); + + eprintln!("{start}..{end}: {result:?}"); + + assert_range_search_result_eq(result, expected); + } } } } @@ -1490,12 +1561,348 @@ mod tests { // Sanity: the layer that holds latest data for the DBDIR key should always be visible // (just using this key as a key that will always exist for any layermap fixture) - let dbdir_layer = layer_map - .search(DBDIR_KEY, index.metadata.disk_consistent_lsn()) - .unwrap(); + let dbdir_layer = { + let readable_layer = layer_map + .search(DBDIR_KEY, index.metadata.disk_consistent_lsn()) + .unwrap(); + + match readable_layer.layer { + ReadableLayerWeak::PersistentLayer(desc) => desc, + ReadableLayerWeak::InMemoryLayer(_) => unreachable!(""), + } + }; assert!(matches!( - layer_visibilities.get(&dbdir_layer.layer).unwrap(), + layer_visibilities.get(&dbdir_layer).unwrap(), LayerVisibilityHint::Visible )); } } + +#[cfg(test)] +mod select_layer_tests { + use super::*; + + fn create_persistent_layer( + start_lsn: u64, + end_lsn: u64, + is_delta: bool, + ) -> Arc { + if !is_delta { + assert_eq!(end_lsn, start_lsn + 1); + } + + Arc::new(PersistentLayerDesc::new_test( + Key::MIN..Key::MAX, + Lsn(start_lsn)..Lsn(end_lsn), + is_delta, + )) + } + + fn create_inmem_layer(start_lsn: u64, end_lsn: u64) -> InMemoryLayerDesc { + InMemoryLayerDesc { 
+ handle: InMemoryLayerHandle::Open, + lsn_range: Lsn(start_lsn)..Lsn(end_lsn), + } + } + + #[test] + fn test_select_layer_empty() { + assert!(LayerMap::select_layer(None, None, None, Lsn(100)).is_none()); + } + + #[test] + fn test_select_layer_only_delta() { + let delta = create_persistent_layer(10, 20, true); + let result = LayerMap::select_layer(Some(delta.clone()), None, None, Lsn(100)).unwrap(); + + assert_eq!(result.lsn_floor, Lsn(10)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta)) + ); + } + + #[test] + fn test_select_layer_only_image() { + let image = create_persistent_layer(10, 11, false); + let result = LayerMap::select_layer(None, Some(image.clone()), None, Lsn(100)).unwrap(); + + assert_eq!(result.lsn_floor, Lsn(10)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) + ); + } + + #[test] + fn test_select_layer_only_inmem() { + let inmem = create_inmem_layer(10, 20); + let result = LayerMap::select_layer(None, None, Some(inmem.clone()), Lsn(100)).unwrap(); + + assert_eq!(result.lsn_floor, Lsn(10)); + assert!(matches!(result.layer, ReadableLayerWeak::InMemoryLayer(l) if l == inmem)); + } + + #[test] + fn test_select_layer_image_inside_delta() { + let delta = create_persistent_layer(10, 20, true); + let image = create_persistent_layer(15, 16, false); + + let result = + LayerMap::select_layer(Some(delta.clone()), Some(image.clone()), None, Lsn(100)) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(16)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta)) + ); + + let result = LayerMap::select_layer( + Some(delta.clone()), + Some(image.clone()), + None, + result.lsn_floor, + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(15)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) + ); + } + + #[test] + fn test_select_layer_newer_image() { + let delta 
= create_persistent_layer(10, 20, true); + let image = create_persistent_layer(25, 26, false); + + let result = + LayerMap::select_layer(Some(delta.clone()), Some(image.clone()), None, Lsn(30)) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(25)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) + ); + + let result = + LayerMap::select_layer(Some(delta.clone()), None, None, result.lsn_floor).unwrap(); + + assert_eq!(result.lsn_floor, Lsn(10)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta)) + ); + } + + #[test] + fn test_select_layer_delta_with_older_image() { + let delta = create_persistent_layer(15, 25, true); + let image = create_persistent_layer(10, 11, false); + + let result = + LayerMap::select_layer(Some(delta.clone()), Some(image.clone()), None, Lsn(30)) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(15)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta)) + ); + + let result = + LayerMap::select_layer(None, Some(image.clone()), None, result.lsn_floor).unwrap(); + + assert_eq!(result.lsn_floor, Lsn(10)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) + ); + } + + #[test] + fn test_select_layer_image_inside_inmem() { + let image = create_persistent_layer(15, 16, false); + let inmem = create_inmem_layer(10, 25); + + let result = + LayerMap::select_layer(None, Some(image.clone()), Some(inmem.clone()), Lsn(30)) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(16)); + assert!(matches!(result.layer, ReadableLayerWeak::InMemoryLayer(l) if l == inmem)); + + let result = LayerMap::select_layer( + None, + Some(image.clone()), + Some(inmem.clone()), + result.lsn_floor, + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(15)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) + ); + + let result = + 
LayerMap::select_layer(None, None, Some(inmem.clone()), result.lsn_floor).unwrap(); + assert_eq!(result.lsn_floor, Lsn(10)); + assert!(matches!(result.layer, ReadableLayerWeak::InMemoryLayer(l) if l == inmem)); + } + + #[test] + fn test_select_layer_delta_inside_inmem() { + let delta_top = create_persistent_layer(15, 20, true); + let delta_bottom = create_persistent_layer(10, 15, true); + let inmem = create_inmem_layer(15, 25); + + let result = + LayerMap::select_layer(Some(delta_top.clone()), None, Some(inmem.clone()), Lsn(30)) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(20)); + assert!(matches!(result.layer, ReadableLayerWeak::InMemoryLayer(l) if l == inmem)); + + let result = LayerMap::select_layer( + Some(delta_top.clone()), + None, + Some(inmem.clone()), + result.lsn_floor, + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(15)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta_top)) + ); + + let result = LayerMap::select_layer( + Some(delta_bottom.clone()), + None, + Some(inmem.clone()), + result.lsn_floor, + ) + .unwrap(); + assert_eq!(result.lsn_floor, Lsn(10)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta_bottom)) + ); + } + + #[test] + fn test_select_layer_all_overlap_1() { + let inmem = create_inmem_layer(10, 30); + let delta = create_persistent_layer(15, 25, true); + let image = create_persistent_layer(20, 21, false); + + let result = LayerMap::select_layer( + Some(delta.clone()), + Some(image.clone()), + Some(inmem.clone()), + Lsn(50), + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(25)); + assert!(matches!(result.layer, ReadableLayerWeak::InMemoryLayer(l) if l == inmem)); + + let result = LayerMap::select_layer( + Some(delta.clone()), + Some(image.clone()), + Some(inmem.clone()), + result.lsn_floor, + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(21)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) 
if Arc::ptr_eq(&l, &delta)) + ); + + let result = LayerMap::select_layer( + Some(delta.clone()), + Some(image.clone()), + Some(inmem.clone()), + result.lsn_floor, + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(20)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) + ); + } + + #[test] + fn test_select_layer_all_overlap_2() { + let inmem = create_inmem_layer(20, 30); + let delta = create_persistent_layer(10, 40, true); + let image = create_persistent_layer(25, 26, false); + + let result = LayerMap::select_layer( + Some(delta.clone()), + Some(image.clone()), + Some(inmem.clone()), + Lsn(50), + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(26)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta)) + ); + + let result = LayerMap::select_layer( + Some(delta.clone()), + Some(image.clone()), + Some(inmem.clone()), + result.lsn_floor, + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(25)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) + ); + } + + #[test] + fn test_select_layer_all_overlap_3() { + let inmem = create_inmem_layer(30, 40); + let delta = create_persistent_layer(10, 30, true); + let image = create_persistent_layer(20, 21, false); + + let result = LayerMap::select_layer( + Some(delta.clone()), + Some(image.clone()), + Some(inmem.clone()), + Lsn(50), + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(30)); + assert!(matches!(result.layer, ReadableLayerWeak::InMemoryLayer(l) if l == inmem)); + + let result = LayerMap::select_layer( + Some(delta.clone()), + Some(image.clone()), + None, + result.lsn_floor, + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(21)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta)) + ); + + let result = LayerMap::select_layer( + Some(delta.clone()), + Some(image.clone()), + None, + result.lsn_floor, + ) + 
 .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(20)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) + ); + } +} diff --git a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs index f8bec48886..b3dc8e56a3 100644 --- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs +++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs @@ -63,6 +63,8 @@ pub struct HistoricLayerCoverage { /// The latest state head: LayerCoverageTuple, + /// TODO: this could be an ordered vec using binary search. + /// We push into this map every time we add a layer, so might see some benefit /// All previous states historic: BTreeMap>, } @@ -419,6 +421,10 @@ pub struct BufferedHistoricLayerCoverage { buffer: BTreeMap>, /// All current layers. This is not used for search. Only to make rebuilds easier. + // TODO: This map is never cleared. Rebuilds could use the post-trim last entry of + // [`Self::historic_coverage`] instead of doubling memory usage.
+ // [`Self::len`]: can require rebuild and serve from latest historic + // [`Self::iter`]: already requires rebuild => can serve from latest historic layers: BTreeMap, } diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 4ba5844fea..891760b499 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -194,7 +194,7 @@ pub(crate) use download::{ }; use index::GcCompactionState; pub(crate) use index::LayerFileMetadata; -use pageserver_api::models::TimelineArchivalState; +use pageserver_api::models::{RelSizeMigration, TimelineArchivalState}; use pageserver_api::shard::{ShardIndex, TenantShardId}; use regex::Regex; use remote_storage::{ @@ -437,9 +437,13 @@ impl RemoteTimelineClient { /// Initialize the upload queue for the case where the remote storage is empty, /// i.e., it doesn't have an `IndexPart`. + /// + /// `rel_size_v2_status` needs to be carried over during branching, and that's why + /// it's passed in here. pub fn init_upload_queue_for_empty_remote( &self, local_metadata: &TimelineMetadata, + rel_size_v2_status: Option, ) -> anyhow::Result<()> { // Set the maximum number of inprogress tasks to the remote storage concurrency. There's // certainly no point in starting more upload tasks than this. @@ -449,7 +453,9 @@ impl RemoteTimelineClient { .as_ref() .map_or(0, |r| r.concurrency_limit()); let mut upload_queue = self.upload_queue.lock().unwrap(); - upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?; + let initialized_queue = + upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?; + initialized_queue.dirty.rel_size_migration = rel_size_v2_status; self.update_remote_physical_size_gauge(None); info!("initialized upload queue as empty"); Ok(()) @@ -900,7 +906,7 @@ impl RemoteTimelineClient { Ok(()) } - /// Launch an index-file upload operation in the background, setting `import_pgdata` field. 
+ /// Launch an index-file upload operation in the background, setting `gc_compaction_state` field. pub(crate) fn schedule_index_upload_for_gc_compaction_state_update( self: &Arc, gc_compaction_state: GcCompactionState, @@ -912,6 +918,21 @@ impl RemoteTimelineClient { Ok(()) } + /// Launch an index-file upload operation in the background, setting `rel_size_v2_status` field. + pub(crate) fn schedule_index_upload_for_rel_size_v2_status_update( + self: &Arc, + rel_size_v2_status: RelSizeMigration, + ) -> anyhow::Result<()> { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + upload_queue.dirty.rel_size_migration = Some(rel_size_v2_status); + // TODO: allow this operation to bypass the validation check because we might upload the index part + // with no layers but the flag updated. For now, we just modify the index part in memory and the next + // upload will include the flag. + // self.schedule_index_upload(upload_queue); + Ok(()) + } + /// /// Launch an index-file upload operation in the background, if necessary. /// @@ -933,6 +954,14 @@ impl RemoteTimelineClient { Ok(()) } + /// Only used in the `patch_index_part` HTTP API to force trigger an index upload. 
+ pub fn force_schedule_index_upload(self: &Arc) -> Result<(), NotInitialized> { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + self.schedule_index_upload(upload_queue); + Ok(()) + } + /// Launch an index-file upload operation in the background (internal function) fn schedule_index_upload(self: &Arc, upload_queue: &mut UploadQueueInitialized) { let disk_consistent_lsn = upload_queue.dirty.metadata.disk_consistent_lsn(); diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index ceaed58bbd..16c38be907 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -7,6 +7,7 @@ use std::collections::HashMap; use chrono::NaiveDateTime; use pageserver_api::models::AuxFilePolicy; +use pageserver_api::models::RelSizeMigration; use pageserver_api::shard::ShardIndex; use serde::{Deserialize, Serialize}; use utils::id::TimelineId; @@ -117,21 +118,6 @@ pub struct GcCompactionState { pub(crate) last_completed_lsn: Lsn, } -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -#[serde(rename_all = "camelCase")] -pub enum RelSizeMigration { - /// The tenant is using the old rel_size format. - /// Note that this enum is persisted as `Option` in the index part, so - /// `None` is the same as `Some(RelSizeMigration::Legacy)`. - Legacy, - /// The tenant is migrating to the new rel_size format. Both old and new rel_size format are - /// persisted in the index part. The read path will read both formats and merge them. - Migrating, - /// The tenant has migrated to the new rel_size format. Only the new rel_size format is persisted - /// in the index part, and the read path will not read the old format. - Migrated, -} - impl IndexPart { /// When adding or modifying any parts of `IndexPart`, increment the version so that it can be /// used to understand later versions. 
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index a13b9323ac..5f3a0932c4 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -869,8 +869,7 @@ impl<'a> TenantDownloader<'a> { let heatmap_timeline = heatmap.timelines.get(heatmap_timeline_index).unwrap(); let layers_in_heatmap = heatmap_timeline - .layers - .iter() + .hot_layers() .map(|l| (&l.name, l.metadata.generation)) .collect::>(); let layers_on_disk = timeline_state @@ -1015,7 +1014,8 @@ impl<'a> TenantDownloader<'a> { // Accumulate updates to the state let mut touched = Vec::new(); - for layer in timeline.layers { + let timeline_id = timeline.timeline_id; + for layer in timeline.into_hot_layers() { if self.secondary_state.cancel.is_cancelled() { tracing::debug!("Cancelled -- dropping out of layer loop"); return (Err(UpdateError::Cancelled), touched); @@ -1040,7 +1040,7 @@ impl<'a> TenantDownloader<'a> { } match self - .download_layer(tenant_shard_id, &timeline.timeline_id, layer, ctx) + .download_layer(tenant_shard_id, &timeline_id, layer, ctx) .await { Ok(Some(layer)) => touched.push(layer), @@ -1148,7 +1148,7 @@ impl<'a> TenantDownloader<'a> { let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); let timeline_id = timeline.timeline_id; - tracing::debug!(timeline_id=%timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len()); + tracing::debug!(timeline_id=%timeline_id, "Downloading layers, {} in heatmap", timeline.hot_layers().count()); let (result, touched) = self .download_timeline_layers(tenant_shard_id, timeline, timeline_state, deadline, ctx) @@ -1316,11 +1316,11 @@ async fn init_timeline_state( // As we iterate through layers found on disk, we will look up their metadata from this map. // Layers not present in metadata will be discarded. 
let heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> = - heatmap.layers.iter().map(|l| (&l.name, l)).collect(); + heatmap.hot_layers().map(|l| (&l.name, l)).collect(); let last_heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> = if let Some(last_heatmap) = last_heatmap { - last_heatmap.layers.iter().map(|l| (&l.name, l)).collect() + last_heatmap.hot_layers().map(|l| (&l.name, l)).collect() } else { HashMap::new() }; diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs index 4a938e9095..6dbb3f091f 100644 --- a/pageserver/src/tenant/secondary/heatmap.rs +++ b/pageserver/src/tenant/secondary/heatmap.rs @@ -42,7 +42,7 @@ pub(crate) struct HeatMapTimeline { #[serde_as(as = "DisplayFromStr")] pub(crate) timeline_id: TimelineId, - pub(crate) layers: Vec, + layers: Vec, } #[serde_as] @@ -53,8 +53,10 @@ pub(crate) struct HeatMapLayer { #[serde_as(as = "TimestampSeconds")] pub(crate) access_time: SystemTime, - // TODO: an actual 'heat' score that would let secondary locations prioritize downloading - // the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary. + + #[serde(default)] + pub(crate) cold: bool, // TODO: an actual 'heat' score that would let secondary locations prioritize downloading + // the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary. 
} impl HeatMapLayer { @@ -62,11 +64,13 @@ impl HeatMapLayer { name: LayerName, metadata: LayerFileMetadata, access_time: SystemTime, + cold: bool, ) -> Self { Self { name, metadata, access_time, + cold, } } } @@ -78,6 +82,18 @@ impl HeatMapTimeline { layers, } } + + pub(crate) fn into_hot_layers(self) -> impl Iterator { + self.layers.into_iter().filter(|l| !l.cold) + } + + pub(crate) fn hot_layers(&self) -> impl Iterator { + self.layers.iter().filter(|l| !l.cold) + } + + pub(crate) fn all_layers(&self) -> impl Iterator { + self.layers.iter() + } } pub(crate) struct HeatMapStats { @@ -92,7 +108,7 @@ impl HeatMapTenant { layers: 0, }; for timeline in &self.timelines { - for layer in &timeline.layers { + for layer in timeline.hot_layers() { stats.layers += 1; stats.bytes += layer.metadata.file_size; } diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 7f313f46a2..ece163b24a 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -40,6 +40,7 @@ use utils::sync::gate::GateGuard; use self::inmemory_layer::InMemoryLayerFileId; use super::PageReconstructError; +use super::layer_map::InMemoryLayerDesc; use super::timeline::{GetVectoredError, ReadPath}; use crate::config::PageServerConf; use crate::context::{AccessStatsBehavior, RequestContext}; @@ -721,6 +722,12 @@ struct LayerToVisitId { lsn_floor: Lsn, } +#[derive(Debug, PartialEq, Eq, Hash)] +pub enum ReadableLayerWeak { + PersistentLayer(Arc), + InMemoryLayer(InMemoryLayerDesc), +} + /// Layer wrapper for the read path. Note that it is valid /// to use these layers even after external operations have /// been performed on them (compaction, freeze, etc.). 
@@ -873,7 +880,7 @@ impl ReadableLayer { } ReadableLayer::InMemoryLayer(layer) => { layer - .get_values_reconstruct_data(keyspace, lsn_range.end, reconstruct_state, ctx) + .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx) .await } } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index ffdfe1dc27..46135b5330 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -416,7 +416,7 @@ impl InMemoryLayer { pub(crate) async fn get_values_reconstruct_data( self: &Arc, keyspace: KeySpace, - end_lsn: Lsn, + lsn_range: Range, reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result<(), GetVectoredError> { @@ -433,8 +433,6 @@ impl InMemoryLayer { let mut reads: HashMap> = HashMap::new(); let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default(); - let lsn_range = self.start_lsn..end_lsn; - for range in keyspace.ranges.iter() { for (key, vec_map) in inner .index diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index bde7fbc1f9..247092bf45 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1563,10 +1563,10 @@ impl LayerInner { self.access_stats.record_residence_event(); - self.status.as_ref().unwrap().send_replace(Status::Evicted); - *self.last_evicted_at.lock().unwrap() = Some(std::time::Instant::now()); + self.status.as_ref().unwrap().send_replace(Status::Evicted); + Ok(()) } diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index d43dfefdbc..a7f3c6b8c5 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -49,6 +49,7 @@ async fn smoke_test() { Lsn(0x10), 14, &ctx, + Default::default(), // in-memory layers 
Default::default(), image_layers, Lsn(0x100), diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 662088fbde..4483ecfe94 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -46,7 +46,7 @@ use pageserver_api::keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPart use pageserver_api::models::{ CompactKeyRange, CompactLsnRange, CompactionAlgorithm, CompactionAlgorithmSettings, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, - InMemoryLayerInfo, LayerMapInfo, LsnLease, PageTraceEvent, TimelineState, + InMemoryLayerInfo, LayerMapInfo, LsnLease, PageTraceEvent, RelSizeMigration, TimelineState, }; use pageserver_api::reltag::{BlockNumber, RelTag}; use pageserver_api::shard::{ShardIdentity, ShardIndex, ShardNumber, TenantShardId}; @@ -99,7 +99,8 @@ use crate::disk_usage_eviction_task::{DiskUsageEvictionInfo, EvictionCandidate, use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::l0_flush::{self, L0FlushGlobalState}; use crate::metrics::{ - DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_GLOBAL, ScanLatencyOngoingRecording, TimelineMetrics, + DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_AMORTIZED_GLOBAL, LAYERS_PER_READ_BATCH_GLOBAL, + LAYERS_PER_READ_GLOBAL, ScanLatencyOngoingRecording, TimelineMetrics, }; use crate::page_service::TenantManagerTypes; use crate::pgdatadir_mapping::{ @@ -436,12 +437,16 @@ pub struct Timeline { /// May host a background Tokio task which downloads all the layers from the current /// heatmap on demand. heatmap_layers_downloader: Mutex>, + + pub(crate) rel_size_v2_status: ArcSwapOption, } pub(crate) enum PreviousHeatmap { Active { heatmap: HeatMapTimeline, read_at: std::time::Instant, + // End LSN covered by the heatmap if known + end_lsn: Option, }, Obsolete, } @@ -1326,10 +1331,6 @@ impl Timeline { // (this is a requirement, not a bug). Skip updating the metric in these cases // to avoid infinite results. 
if !results.is_empty() { - // Record the total number of layers visited towards each key in the batch. While some - // layers may not intersect with a given read, and the cost of layer visits are - // amortized across the batch, each visited layer contributes directly to the observed - // latency for every read in the batch, which is what we care about. if layers_visited >= Self::LAYERS_VISITED_WARN_THRESHOLD { static LOG_PACER: Lazy> = Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60)))); @@ -1344,9 +1345,23 @@ impl Timeline { }); } + // Records the number of layers visited in a few different ways: + // + // * LAYERS_PER_READ: all layers count towards every read in the batch, because each + // layer directly affects its observed latency. + // + // * LAYERS_PER_READ_BATCH: all layers count towards each batch, to get the per-batch + // layer visits and access cost. + // + // * LAYERS_PER_READ_AMORTIZED: the average layer count per read, to get the amortized + // read amplification after batching. + let layers_visited = layers_visited as f64; + let avg_layers_visited = layers_visited / results.len() as f64; + LAYERS_PER_READ_BATCH_GLOBAL.observe(layers_visited); for _ in &results { - self.metrics.layers_per_read.observe(layers_visited as f64); - LAYERS_PER_READ_GLOBAL.observe(layers_visited as f64); + self.metrics.layers_per_read.observe(layers_visited); + LAYERS_PER_READ_GLOBAL.observe(layers_visited); + LAYERS_PER_READ_AMORTIZED_GLOBAL.observe(avg_layers_visited); } } @@ -2366,6 +2381,9 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } + /// Returns `true` if the rel_size_v2 config is enabled. NOTE: the write path and read path + /// should look at `get_rel_size_v2_status()` to get the actual status of the timeline. It is + /// possible that the index part persists the state while the config doesn't get persisted. 
pub(crate) fn get_rel_size_v2_enabled(&self) -> bool { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2374,6 +2392,14 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled) } + pub(crate) fn get_rel_size_v2_status(&self) -> RelSizeMigration { + self.rel_size_v2_status + .load() + .as_ref() + .map(|s| s.as_ref().clone()) + .unwrap_or(RelSizeMigration::Legacy) + } + fn get_compaction_upper_limit(&self) -> usize { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2634,6 +2660,7 @@ impl Timeline { attach_wal_lag_cooldown: Arc>, create_idempotency: crate::tenant::CreateTimelineIdempotency, gc_compaction_state: Option, + rel_size_v2_status: Option, cancel: CancellationToken, ) -> Arc { let disk_consistent_lsn = metadata.disk_consistent_lsn(); @@ -2792,6 +2819,8 @@ impl Timeline { previous_heatmap: ArcSwapOption::from_pointee(previous_heatmap), heatmap_layers_downloader: Mutex::new(None), + + rel_size_v2_status: ArcSwapOption::from_pointee(rel_size_v2_status), }; result.repartition_threshold = @@ -2868,6 +2897,16 @@ impl Timeline { .schedule_index_upload_for_gc_compaction_state_update(gc_compaction_state) } + pub(crate) fn update_rel_size_v2_status( + &self, + rel_size_v2_status: RelSizeMigration, + ) -> anyhow::Result<()> { + self.rel_size_v2_status + .store(Some(Arc::new(rel_size_v2_status.clone()))); + self.remote_client + .schedule_index_upload_for_rel_size_v2_status_update(rel_size_v2_status) + } + pub(crate) fn get_gc_compaction_state(&self) -> Option { self.gc_compaction_state.load_full().as_ref().clone() } @@ -3570,12 +3609,16 @@ impl Timeline { Ok(layer) } - pub(super) fn is_previous_heatmap_active(&self) -> bool { - self.previous_heatmap - .load() - .as_ref() - .map(|prev| matches!(**prev, PreviousHeatmap::Active { .. 
})) - .unwrap_or(false) + pub(super) fn should_keep_previous_heatmap(&self, new_heatmap_end_lsn: Lsn) -> bool { + let crnt = self.previous_heatmap.load(); + match crnt.as_deref() { + Some(PreviousHeatmap::Active { end_lsn, .. }) => match end_lsn { + Some(crnt_end_lsn) => *crnt_end_lsn > new_heatmap_end_lsn, + None => true, + }, + Some(PreviousHeatmap::Obsolete) => false, + None => false, + } } /// The timeline heatmap is a hint to secondary locations from the primary location, @@ -3603,26 +3646,26 @@ impl Timeline { // heatamp. let previous_heatmap = self.previous_heatmap.load(); let visible_non_resident = match previous_heatmap.as_deref() { - Some(PreviousHeatmap::Active { heatmap, read_at }) => { - Some(heatmap.layers.iter().filter_map(|hl| { - let desc: PersistentLayerDesc = hl.name.clone().into(); - let layer = guard.try_get_from_key(&desc.key())?; + Some(PreviousHeatmap::Active { + heatmap, read_at, .. + }) => Some(heatmap.all_layers().filter_map(|hl| { + let desc: PersistentLayerDesc = hl.name.clone().into(); + let layer = guard.try_get_from_key(&desc.key())?; - if layer.visibility() == LayerVisibilityHint::Covered { - return None; - } + if layer.visibility() == LayerVisibilityHint::Covered { + return None; + } - if layer.is_likely_resident() { - return None; - } + if layer.is_likely_resident() { + return None; + } - if layer.last_evicted_at().happened_after(*read_at) { - return None; - } + if layer.last_evicted_at().happened_after(*read_at) { + return None; + } - Some((desc, hl.metadata.clone(), hl.access_time)) - })) - } + Some((desc, hl.metadata.clone(), hl.access_time, hl.cold)) + })), Some(PreviousHeatmap::Obsolete) => None, None => None, }; @@ -3637,6 +3680,7 @@ impl Timeline { layer.layer_desc().clone(), layer.metadata(), last_activity_ts, + false, // these layers are not cold )) } LayerVisibilityHint::Covered => { @@ -3663,12 +3707,14 @@ impl Timeline { // Sort layers in order of which to download first. 
For a large set of layers to download, we // want to prioritize those layers which are most likely to still be in the resident many minutes // or hours later: + // - Cold layers go last for convenience when a human inspects the heatmap. // - Download L0s last, because they churn the fastest: L0s on a fast-writing tenant might // only exist for a few minutes before being compacted into L1s. // - For L1 & image layers, download most recent LSNs first: the older the LSN, the sooner // the layer is likely to be covered by an image layer during compaction. - layers.sort_by_key(|(desc, _meta, _atime)| { + layers.sort_by_key(|(desc, _meta, _atime, cold)| { std::cmp::Reverse(( + *cold, !LayerMap::is_l0(&desc.key_range, desc.is_delta), desc.lsn_range.end, )) @@ -3676,7 +3722,9 @@ impl Timeline { let layers = layers .into_iter() - .map(|(desc, meta, atime)| HeatMapLayer::new(desc.layer_name(), meta, atime)) + .map(|(desc, meta, atime, cold)| { + HeatMapLayer::new(desc.layer_name(), meta, atime, cold) + }) .collect(); Some(HeatMapTimeline::new(self.timeline_id, layers)) @@ -3696,6 +3744,7 @@ impl Timeline { name: vl.layer_desc().layer_name(), metadata: vl.metadata(), access_time: now, + cold: true, }; heatmap_layers.push(hl); } @@ -3709,6 +3758,7 @@ impl Timeline { PreviousHeatmap::Active { heatmap, read_at: Instant::now(), + end_lsn: Some(end_lsn), } } @@ -3907,39 +3957,22 @@ impl Timeline { let guard = timeline.layers.read().await; let layers = guard.layer_map()?; - let in_memory_layer = layers.find_in_memory_layer(|l| { - let start_lsn = l.get_lsn_range().start; - cont_lsn > start_lsn - }); + for range in unmapped_keyspace.ranges.iter() { + let results = layers.range_search(range.clone(), cont_lsn); - match in_memory_layer { - Some(l) => { - let lsn_range = l.get_lsn_range().start..cont_lsn; - fringe.update( - ReadableLayer::InMemoryLayer(l), - unmapped_keyspace.clone(), - lsn_range, - ); - } - None => { - for range in unmapped_keyspace.ranges.iter() { - let results = 
layers.range_search(range.clone(), cont_lsn); - - results - .found - .into_iter() - .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| { - ( - ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)), - keyspace_accum.to_keyspace(), - lsn_floor..cont_lsn, - ) - }) - .for_each(|(layer, keyspace, lsn_range)| { - fringe.update(layer, keyspace, lsn_range) - }); - } - } + results + .found + .into_iter() + .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| { + ( + guard.upgrade(layer), + keyspace_accum.to_keyspace(), + lsn_floor..cont_lsn, + ) + }) + .for_each(|(layer, keyspace, lsn_range)| { + fringe.update(layer, keyspace, lsn_range) + }); } // It's safe to drop the layer map lock after planning the next round of reads. @@ -5548,6 +5581,14 @@ pub struct DeltaLayerTestDesc { pub data: Vec<(Key, Lsn, Value)>, } +#[cfg(test)] +#[derive(Clone)] +pub struct InMemoryLayerTestDesc { + pub lsn_range: Range, + pub data: Vec<(Key, Lsn, Value)>, + pub is_open: bool, +} + #[cfg(test)] impl DeltaLayerTestDesc { pub fn new(lsn_range: Range, key_range: Range, data: Vec<(Key, Lsn, Value)>) -> Self { @@ -6560,6 +6601,92 @@ impl Timeline { Ok(()) } + /// Force create an in-memory layer and place them into the layer map. 
+ #[cfg(test)] + pub(super) async fn force_create_in_memory_layer( + self: &Arc, + mut in_memory: InMemoryLayerTestDesc, + check_start_lsn: Option, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + use utils::bin_ser::BeSer; + + // Validate LSNs + if let Some(check_start_lsn) = check_start_lsn { + assert!(in_memory.lsn_range.start >= check_start_lsn); + } + + let last_record_lsn = self.get_last_record_lsn(); + let layer_end_lsn = if in_memory.is_open { + in_memory + .data + .iter() + .map(|(_key, lsn, _value)| lsn) + .max() + .cloned() + } else { + Some(in_memory.lsn_range.end) + }; + + if let Some(end) = layer_end_lsn { + assert!( + end <= last_record_lsn, + "advance last record lsn before inserting a layer, end_lsn={}, last_record_lsn={}", + end, + last_record_lsn, + ); + } + + in_memory.data.iter().for_each(|(_key, lsn, _value)| { + assert!(*lsn >= in_memory.lsn_range.start); + assert!(*lsn < in_memory.lsn_range.end); + }); + + // Build the batch + in_memory + .data + .sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb))); + + let data = in_memory + .data + .into_iter() + .map(|(key, lsn, value)| { + let value_size = value.serialized_size().unwrap() as usize; + (key.to_compact(), lsn, value_size, value) + }) + .collect::>(); + + let batch = SerializedValueBatch::from_values(data); + + // Create the in-memory layer and write the batch into it + let layer = InMemoryLayer::create( + self.conf, + self.timeline_id, + self.tenant_shard_id, + in_memory.lsn_range.start, + &self.gate, + ctx, + ) + .await + .unwrap(); + + layer.put_batch(batch, ctx).await.unwrap(); + if !in_memory.is_open { + layer.freeze(in_memory.lsn_range.end).await; + } + + info!("force created in-memory layer {:?}", in_memory.lsn_range); + + // Link the layer to the layer map + { + let mut guard = self.layers.write().await; + let layer_map = guard.open_mut().unwrap(); + layer_map.force_insert_in_memory_layer(Arc::new(layer)); + } + + Ok(()) + } + /// Return all keys at the LSN in 
the image layers #[cfg(test)] pub(crate) async fn inspect_image_layers( @@ -6919,6 +7046,7 @@ mod tests { use pageserver_api::key::Key; use pageserver_api::value::Value; + use std::iter::Iterator; use tracing::Instrument; use utils::id::TimelineId; use utils::lsn::Lsn; @@ -6932,8 +7060,8 @@ mod tests { use crate::tenant::{PreviousHeatmap, Timeline}; fn assert_heatmaps_have_same_layers(lhs: &HeatMapTimeline, rhs: &HeatMapTimeline) { - assert_eq!(lhs.layers.len(), rhs.layers.len()); - let lhs_rhs = lhs.layers.iter().zip(rhs.layers.iter()); + assert_eq!(lhs.all_layers().count(), rhs.all_layers().count()); + let lhs_rhs = lhs.all_layers().zip(rhs.all_layers()); for (l, r) in lhs_rhs { assert_eq!(l.name, r.name); assert_eq!(l.metadata, r.metadata); @@ -6992,6 +7120,7 @@ mod tests { Lsn(0x10), 14, &ctx, + Vec::new(), // in-memory layers delta_layers, image_layers, Lsn(0x100), @@ -7010,10 +7139,11 @@ mod tests { assert_eq!(heatmap.timeline_id, timeline.timeline_id); // L0 should come last - assert_eq!(heatmap.layers.last().unwrap().name, l0_delta.layer_name()); + let heatmap_layers = heatmap.all_layers().collect::>(); + assert_eq!(heatmap_layers.last().unwrap().name, l0_delta.layer_name()); let mut last_lsn = Lsn::MAX; - for layer in &heatmap.layers { + for layer in heatmap_layers { // Covered layer should be omitted assert!(layer.name != covered_delta.layer_name()); @@ -7046,6 +7176,7 @@ mod tests { .store(Some(Arc::new(PreviousHeatmap::Active { heatmap: heatmap.clone(), read_at: std::time::Instant::now(), + end_lsn: None, }))); // Generate a new heatmap and assert that it contains the same layers as the old one. 
@@ -7124,6 +7255,7 @@ mod tests { Lsn(0x10), 14, &ctx, + Vec::new(), // in-memory layers delta_layers, image_layers, Lsn(0x100), @@ -7140,7 +7272,7 @@ mod tests { .expect("Infallible while timeline is not shut down"); // Both layers should be in the heatmap - assert!(!heatmap.layers.is_empty()); + assert!(heatmap.all_layers().count() > 0); // Now simulate a migration. timeline @@ -7148,6 +7280,7 @@ mod tests { .store(Some(Arc::new(PreviousHeatmap::Active { heatmap: heatmap.clone(), read_at: std::time::Instant::now(), + end_lsn: None, }))); // Evict all the layers in the previous heatmap @@ -7165,7 +7298,7 @@ mod tests { .await .expect("Infallible while timeline is not shut down"); - assert!(post_eviction_heatmap.layers.is_empty()); + assert_eq!(post_eviction_heatmap.all_layers().count(), 0); assert!(matches!( timeline.previous_heatmap.load().as_deref(), Some(PreviousHeatmap::Obsolete) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index c835980a7d..42b36f7252 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -7,6 +7,7 @@ use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque}; use std::ops::{Deref, Range}; use std::sync::Arc; +use std::time::Instant; use super::layer_manager::LayerManager; use super::{ @@ -15,10 +16,11 @@ use super::{ Timeline, }; -use anyhow::{Context, anyhow, bail}; +use anyhow::{Context, anyhow}; use bytes::Bytes; use enumset::EnumSet; use fail::fail_point; +use futures::FutureExt; use itertools::Itertools; use once_cell::sync::Lazy; use pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE; @@ -234,6 +236,12 @@ impl GcCompactionQueue { // it enough in staging yet. return Ok(()); } + if timeline.get_gc_compaction_watermark() == Lsn::INVALID { + // If the gc watermark is not set, we don't need to trigger auto compaction. 
+ // This check is the same as in `gc_compaction_split_jobs` but we don't log + // here and we can also skip the computation of the trigger condition earlier. + return Ok(()); + } let Ok(permit) = CONCURRENT_GC_COMPACTION_TASKS.clone().try_acquire_owned() else { // Only allow one compaction run at a time. TODO: As we do `try_acquire_owned`, we cannot ensure @@ -321,7 +329,7 @@ impl GcCompactionQueue { l1_size, l2_size, l2_lsn, gc_cutoff ); } else { - info!( + debug!( "did not trigger auto gc-compaction: l1_size={}, l2_size={}, l2_lsn={}, gc_cutoff={}", l1_size, l2_size, l2_lsn, gc_cutoff ); @@ -357,8 +365,7 @@ impl GcCompactionQueue { GcCompactJob::from_compact_options(options.clone()), options.sub_compaction_max_job_size_mb, ) - .await - .map_err(CompactionError::Other)?; + .await?; if jobs.is_empty() { info!("no jobs to run, skipping scheduled compaction task"); self.notify_and_unblock(id); @@ -437,6 +444,7 @@ impl GcCompactionQueue { )); }; let has_pending_tasks; + let mut yield_for_l0 = false; let Some((id, item)) = ({ let mut guard = self.inner.lock().unwrap(); if let Some((id, item)) = guard.queued.pop_front() { @@ -486,13 +494,23 @@ impl GcCompactionQueue { let mut guard = self.inner.lock().unwrap(); guard.guards.entry(id).or_default().gc_guard = Some(gc_guard); } - let _ = timeline.compact_with_options(cancel, options, ctx).await?; + let compaction_result = + timeline.compact_with_options(cancel, options, ctx).await?; self.notify_and_unblock(id); + if compaction_result == CompactionOutcome::YieldForL0 { + yield_for_l0 = true; + } } } GcCompactionQueueItem::SubCompactionJob(options) => { // TODO: error handling, clear the queue if any task fails? 
- let _ = timeline.compact_with_options(cancel, options, ctx).await?; + let compaction_result = timeline.compact_with_options(cancel, options, ctx).await?; + if compaction_result == CompactionOutcome::YieldForL0 { + // We will permenantly give up a task if we yield for L0 compaction: the preempted subcompaction job won't be running + // again. This ensures that we don't keep doing duplicated work within gc-compaction. Not directly returning here because + // we need to clean things up before returning from the function. + yield_for_l0 = true; + } } GcCompactionQueueItem::Notify(id, l2_lsn) => { self.notify_and_unblock(id); @@ -521,7 +539,10 @@ impl GcCompactionQueue { let mut guard = self.inner.lock().unwrap(); guard.running = None; } - Ok(if has_pending_tasks { + Ok(if yield_for_l0 { + tracing::info!("give up gc-compaction: yield for L0 compaction"); + CompactionOutcome::YieldForL0 + } else if has_pending_tasks { CompactionOutcome::Pending } else { CompactionOutcome::Done @@ -719,17 +740,41 @@ struct CompactionStatisticsNumSize { #[derive(Debug, Serialize, Default)] pub struct CompactionStatistics { + /// Delta layer visited (maybe compressed, physical size) delta_layer_visited: CompactionStatisticsNumSize, + /// Image layer visited (maybe compressed, physical size) image_layer_visited: CompactionStatisticsNumSize, + /// Delta layer produced (maybe compressed, physical size) delta_layer_produced: CompactionStatisticsNumSize, + /// Image layer produced (maybe compressed, physical size) image_layer_produced: CompactionStatisticsNumSize, - num_delta_layer_discarded: usize, - num_image_layer_discarded: usize, + /// Delta layer discarded (maybe compressed, physical size of the layer being discarded instead of the original layer) + delta_layer_discarded: CompactionStatisticsNumSize, + /// Image layer discarded (maybe compressed, physical size of the layer being discarded instead of the original layer) + image_layer_discarded: CompactionStatisticsNumSize, 
num_unique_keys_visited: usize, + /// Delta visited (uncompressed, original size) wal_keys_visited: CompactionStatisticsNumSize, + /// Image visited (uncompressed, original size) image_keys_visited: CompactionStatisticsNumSize, + /// Delta produced (uncompressed, original size) wal_produced: CompactionStatisticsNumSize, + /// Image produced (uncompressed, original size) image_produced: CompactionStatisticsNumSize, + + // Time spent in each phase + time_acquire_lock_secs: f64, + time_analyze_secs: f64, + time_download_layer_secs: f64, + time_main_loop_secs: f64, + time_final_phase_secs: f64, + time_total_secs: f64, + + // Summary + /// Ratio of the key-value size before/after gc-compaction. + uncompressed_size_ratio: f64, + /// Ratio of the physical size before/after gc-compaction. + physical_size_ratio: f64, } impl CompactionStatistics { @@ -779,11 +824,13 @@ impl CompactionStatistics { self.image_produced.num += 1; self.image_produced.size += val.len() as u64 + Self::estimated_size_of_key() as u64; } - fn discard_delta_layer(&mut self) { - self.num_delta_layer_discarded += 1; + fn discard_delta_layer(&mut self, original_size: u64) { + self.delta_layer_discarded.num += 1; + self.delta_layer_discarded.size += original_size; } - fn discard_image_layer(&mut self) { - self.num_image_layer_discarded += 1; + fn discard_image_layer(&mut self, original_size: u64) { + self.image_layer_discarded.num += 1; + self.image_layer_discarded.size += original_size; } fn produce_delta_layer(&mut self, size: u64) { self.delta_layer_produced.num += 1; @@ -793,6 +840,19 @@ impl CompactionStatistics { self.image_layer_produced.num += 1; self.image_layer_produced.size += size; } + fn finalize(&mut self) { + let original_key_value_size = self.image_keys_visited.size + self.wal_keys_visited.size; + let produced_key_value_size = self.image_produced.size + self.wal_produced.size; + self.uncompressed_size_ratio = + original_key_value_size as f64 / (produced_key_value_size as f64 + 1.0); // 
avoid div by 0 + let original_physical_size = self.image_layer_visited.size + self.delta_layer_visited.size; + let produced_physical_size = self.image_layer_produced.size + + self.delta_layer_produced.size + + self.image_layer_discarded.size + + self.delta_layer_discarded.size; // Also include the discarded layers to make the ratio accurate + self.physical_size_ratio = + original_physical_size as f64 / (produced_physical_size as f64 + 1.0); // avoid div by 0 + } } #[derive(Default, Debug, Clone, Copy, PartialEq, Eq)] @@ -825,9 +885,7 @@ impl Timeline { .flags .contains(CompactFlags::EnhancedGcBottomMostCompaction) { - self.compact_with_gc(cancel, options, ctx) - .await - .map_err(CompactionError::Other)?; + self.compact_with_gc(cancel, options, ctx).await?; return Ok(CompactionOutcome::Done); } @@ -2345,12 +2403,19 @@ impl Timeline { async fn check_compaction_space( self: &Arc, layer_selection: &[Layer], - ) -> anyhow::Result<()> { - let available_space = self.check_available_space().await?; + ) -> Result<(), CompactionError> { + let available_space = self + .check_available_space() + .await + .map_err(CompactionError::Other)?; let mut remote_layer_size = 0; let mut all_layer_size = 0; for layer in layer_selection { - let needs_download = layer.needs_download().await?; + let needs_download = layer + .needs_download() + .await + .context("failed to check if layer needs download") + .map_err(CompactionError::Other)?; if needs_download.is_some() { remote_layer_size += layer.layer_desc().file_size; } @@ -2359,14 +2424,14 @@ impl Timeline { let allocated_space = (available_space as f64 * 0.8) as u64; /* reserve 20% space for other tasks */ if all_layer_size /* space needed for newly-generated file */ + remote_layer_size /* space for downloading layers */ > allocated_space { - return Err(anyhow!( + return Err(CompactionError::Other(anyhow!( "not enough space for compaction: available_space={}, allocated_space={}, all_layer_size={}, remote_layer_size={}, 
required_space={}", available_space, allocated_space, all_layer_size, remote_layer_size, all_layer_size + remote_layer_size - )); + ))); } Ok(()) } @@ -2397,7 +2462,7 @@ impl Timeline { self: &Arc, job: GcCompactJob, sub_compaction_max_job_size_mb: Option, - ) -> anyhow::Result> { + ) -> Result, CompactionError> { let compact_below_lsn = if job.compact_lsn_range.end != Lsn::MAX { job.compact_lsn_range.end } else { @@ -2548,7 +2613,7 @@ impl Timeline { cancel: &CancellationToken, options: CompactOptions, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result { let sub_compaction = options.sub_compaction; let job = GcCompactJob::from_compact_options(options.clone()); if sub_compaction { @@ -2570,7 +2635,7 @@ impl Timeline { if jobs_len == 0 { info!("no jobs to run, skipping gc bottom-most compaction"); } - return Ok(()); + return Ok(CompactionOutcome::Done); } self.compact_with_gc_inner(cancel, job, ctx).await } @@ -2580,19 +2645,24 @@ impl Timeline { cancel: &CancellationToken, job: GcCompactJob, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result { // Block other compaction/GC tasks from running for now. GC-compaction could run along // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc. // Note that we already acquired the compaction lock when the outer `compact` function gets called. + let timer = Instant::now(); + let begin_timer = timer; + let gc_lock = async { tokio::select! 
{ guard = self.gc_lock.lock() => Ok(guard), - // TODO: refactor to CompactionError to correctly pass cancelled error - _ = cancel.cancelled() => Err(anyhow!("cancelled")), + _ = cancel.cancelled() => Err(CompactionError::ShuttingDown), } }; + let time_acquire_lock = timer.elapsed(); + let timer = Instant::now(); + let gc_lock = crate::timed( gc_lock, "acquires gc lock", @@ -2644,7 +2714,7 @@ impl Timeline { tracing::warn!( "no layers to compact with gc: gc_cutoff not generated yet, skipping gc bottom-most compaction" ); - return Ok(()); + return Ok(CompactionOutcome::Skipped); } real_gc_cutoff } else { @@ -2682,7 +2752,7 @@ impl Timeline { "no layers to compact with gc: no historic layers below gc_cutoff, gc_cutoff={}", gc_cutoff ); - return Ok(()); + return Ok(CompactionOutcome::Done); }; // Next, if the user specifies compact_lsn_range.start, we need to filter some layers out. All the layers (strictly) below // the min_layer_lsn computed as below will be filtered out and the data will be accessed using the normal read path, as if @@ -2703,7 +2773,7 @@ impl Timeline { "no layers to compact with gc: no historic layers above compact_above_lsn, compact_above_lsn={}", compact_lsn_range.end ); - return Ok(()); + return Ok(CompactionOutcome::Done); }; // Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key // layers to compact. 
@@ -2729,7 +2799,7 @@ impl Timeline { "no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}", gc_cutoff, compact_key_range.start, compact_key_range.end ); - return Ok(()); + return Ok(CompactionOutcome::Done); } retain_lsns_below_horizon.sort(); GcCompactionJobDescription { @@ -2782,6 +2852,9 @@ impl Timeline { has_data_below, ); + let time_analyze = timer.elapsed(); + let timer = Instant::now(); + for layer in &job_desc.selected_layers { debug!("read layer: {}", layer.layer_desc().key()); } @@ -2810,10 +2883,10 @@ impl Timeline { .map(|layer| layer.layer_desc().layer_name()) .collect_vec(); if let Some(err) = check_valid_layermap(&layer_names) { - bail!( + return Err(CompactionError::Other(anyhow!( "gc-compaction layer map check failed because {}, cannot proceed with compaction due to potential data loss", err - ); + ))); } // The maximum LSN we are processing in this compaction loop let end_lsn = job_desc @@ -2828,11 +2901,33 @@ impl Timeline { let mut total_downloaded_size = 0; let mut total_layer_size = 0; for layer in &job_desc.selected_layers { - if layer.needs_download().await?.is_some() { + if layer + .needs_download() + .await + .context("failed to check if layer needs download") + .map_err(CompactionError::Other)? 
+ .is_some() + { total_downloaded_size += layer.layer_desc().file_size; } total_layer_size += layer.layer_desc().file_size; - let resident_layer = layer.download_and_keep_resident(ctx).await?; + if cancel.is_cancelled() { + return Err(CompactionError::ShuttingDown); + } + let should_yield = self + .l0_compaction_trigger + .notified() + .now_or_never() + .is_some(); + if should_yield { + tracing::info!("preempt gc-compaction when downloading layers: too many L0 layers"); + return Ok(CompactionOutcome::YieldForL0); + } + let resident_layer = layer + .download_and_keep_resident(ctx) + .await + .context("failed to download and keep resident layer") + .map_err(CompactionError::Other)?; downloaded_layers.push(resident_layer); } info!( @@ -2843,19 +2938,36 @@ impl Timeline { ); for resident_layer in &downloaded_layers { if resident_layer.layer_desc().is_delta() { - let layer = resident_layer.get_as_delta(ctx).await?; + let layer = resident_layer + .get_as_delta(ctx) + .await + .context("failed to get delta layer") + .map_err(CompactionError::Other)?; delta_layers.push(layer); } else { - let layer = resident_layer.get_as_image(ctx).await?; + let layer = resident_layer + .get_as_image(ctx) + .await + .context("failed to get image layer") + .map_err(CompactionError::Other)?; image_layers.push(layer); } } - let (dense_ks, sparse_ks) = self.collect_gc_compaction_keyspace().await?; + let (dense_ks, sparse_ks) = self + .collect_gc_compaction_keyspace() + .await + .context("failed to collect gc compaction keyspace") + .map_err(CompactionError::Other)?; let mut merge_iter = FilterIterator::create( MergeIterator::create(&delta_layers, &image_layers, ctx), dense_ks, sparse_ks, - )?; + ) + .context("failed to create filter iterator") + .map_err(CompactionError::Other)?; + + let time_download_layer = timer.elapsed(); + let timer = Instant::now(); // Step 2: Produce images+deltas. 
let mut accumulated_values = Vec::new(); @@ -2874,7 +2986,9 @@ impl Timeline { self.get_compaction_target_size(), ctx, ) - .await?, + .await + .context("failed to create image layer writer") + .map_err(CompactionError::Other)?, ) } else { None @@ -2887,7 +3001,9 @@ impl Timeline { lowest_retain_lsn..end_lsn, self.get_compaction_target_size(), ) - .await?; + .await + .context("failed to create delta layer writer") + .map_err(CompactionError::Other)?; #[derive(Default)] struct RewritingLayers { @@ -2927,9 +3043,28 @@ impl Timeline { // the key and LSN range are determined. However, to keep things simple here, we still // create this writer, and discard the writer in the end. - while let Some(((key, lsn, val), desc)) = merge_iter.next_with_trace().await? { + let mut keys_processed = 0; + + while let Some(((key, lsn, val), desc)) = merge_iter + .next_with_trace() + .await + .context("failed to get next key-value pair") + .map_err(CompactionError::Other)? + { if cancel.is_cancelled() { - return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error + return Err(CompactionError::ShuttingDown); + } + keys_processed += 1; + if keys_processed % 1000 == 0 { + let should_yield = self + .l0_compaction_trigger + .notified() + .now_or_never() + .is_some(); + if should_yield { + tracing::info!("preempt gc-compaction in the main loop: too many L0 layers"); + return Ok(CompactionOutcome::YieldForL0); + } } if self.shard_identity.is_key_disposable(&key) { // If this shard does not need to store this key, simply skip it. 
@@ -2960,7 +3095,9 @@ impl Timeline { desc.lsn_range.clone(), ctx, ) - .await?, + .await + .context("failed to create delta layer writer") + .map_err(CompactionError::Other)?, ); } rewriter.before.as_mut().unwrap() @@ -2975,14 +3112,20 @@ impl Timeline { desc.lsn_range.clone(), ctx, ) - .await?, + .await + .context("failed to create delta layer writer") + .map_err(CompactionError::Other)?, ); } rewriter.after.as_mut().unwrap() } else { unreachable!() }; - rewriter.put_value(key, lsn, val, ctx).await?; + rewriter + .put_value(key, lsn, val, ctx) + .await + .context("failed to put value") + .map_err(CompactionError::Other)?; continue; } match val { @@ -3005,9 +3148,13 @@ impl Timeline { &job_desc.retain_lsns_below_horizon, COMPACTION_DELTA_THRESHOLD, get_ancestor_image(self, *last_key, ctx, has_data_below, lowest_retain_lsn) - .await?, + .await + .context("failed to get ancestor image") + .map_err(CompactionError::Other)?, ) - .await?; + .await + .context("failed to generate key retention") + .map_err(CompactionError::Other)?; retention .pipe_to( *last_key, @@ -3016,7 +3163,9 @@ impl Timeline { &mut stat, ctx, ) - .await?; + .await + .context("failed to pipe to delta layer writer") + .map_err(CompactionError::Other)?; accumulated_values.clear(); *last_key = key; accumulated_values.push((key, lsn, val)); @@ -3034,9 +3183,14 @@ impl Timeline { job_desc.gc_cutoff, &job_desc.retain_lsns_below_horizon, COMPACTION_DELTA_THRESHOLD, - get_ancestor_image(self, last_key, ctx, has_data_below, lowest_retain_lsn).await?, + get_ancestor_image(self, last_key, ctx, has_data_below, lowest_retain_lsn) + .await + .context("failed to get ancestor image") + .map_err(CompactionError::Other)?, ) - .await?; + .await + .context("failed to generate key retention") + .map_err(CompactionError::Other)?; retention .pipe_to( last_key, @@ -3045,21 +3199,36 @@ impl Timeline { &mut stat, ctx, ) - .await?; + .await + .context("failed to pipe to delta layer writer") + .map_err(CompactionError::Other)?; 
// end: move the above part to the loop body + let time_main_loop = timer.elapsed(); + let timer = Instant::now(); + let mut rewrote_delta_layers = Vec::new(); for (key, writers) in delta_layer_rewriters { if let Some(delta_writer_before) = writers.before { let (desc, path) = delta_writer_before .finish(job_desc.compaction_key_range.start, ctx) - .await?; - let layer = Layer::finish_creating(self.conf, self, desc, &path)?; + .await + .context("failed to finish delta layer writer") + .map_err(CompactionError::Other)?; + let layer = Layer::finish_creating(self.conf, self, desc, &path) + .context("failed to finish creating delta layer") + .map_err(CompactionError::Other)?; rewrote_delta_layers.push(layer); } if let Some(delta_writer_after) = writers.after { - let (desc, path) = delta_writer_after.finish(key.key_range.end, ctx).await?; - let layer = Layer::finish_creating(self.conf, self, desc, &path)?; + let (desc, path) = delta_writer_after + .finish(key.key_range.end, ctx) + .await + .context("failed to finish delta layer writer") + .map_err(CompactionError::Other)?; + let layer = Layer::finish_creating(self.conf, self, desc, &path) + .context("failed to finish creating delta layer") + .map_err(CompactionError::Other)?; rewrote_delta_layers.push(layer); } } @@ -3074,7 +3243,9 @@ impl Timeline { let end_key = job_desc.compaction_key_range.end; writer .finish_with_discard_fn(self, ctx, end_key, discard) - .await? + .await + .context("failed to finish image layer writer") + .map_err(CompactionError::Other)? } else { drop(writer); Vec::new() @@ -3086,7 +3257,9 @@ impl Timeline { let produced_delta_layers = if !dry_run { delta_layer_writer .finish_with_discard_fn(self, ctx, discard) - .await? + .await + .context("failed to finish delta layer writer") + .map_err(CompactionError::Other)? 
} else { drop(delta_layer_writer); Vec::new() @@ -3098,6 +3271,13 @@ impl Timeline { let mut keep_layers = HashSet::new(); let produced_delta_layers_len = produced_delta_layers.len(); let produced_image_layers_len = produced_image_layers.len(); + + let layer_selection_by_key = job_desc + .selected_layers + .iter() + .map(|l| (l.layer_desc().key(), l.layer_desc().clone())) + .collect::>(); + for action in produced_delta_layers { match action { BatchWriterResult::Produced(layer) => { @@ -3111,8 +3291,16 @@ impl Timeline { if cfg!(debug_assertions) { info!("discarded delta layer: {}", l); } + if let Some(layer_desc) = layer_selection_by_key.get(&l) { + stat.discard_delta_layer(layer_desc.file_size()); + } else { + tracing::warn!( + "discarded delta layer not in layer_selection: {}, produced a layer outside of the compaction key range?", + l + ); + stat.discard_delta_layer(0); + } keep_layers.insert(l); - stat.discard_delta_layer(); } } } @@ -3121,6 +3309,9 @@ impl Timeline { "produced rewritten delta layer: {}", layer.layer_desc().key() ); + // For now, we include rewritten delta layer size in the "produce_delta_layer". We could + // make it a separate statistics in the future. 
+ stat.produce_delta_layer(layer.layer_desc().file_size()); } compact_to.extend(rewrote_delta_layers); for action in produced_image_layers { @@ -3132,8 +3323,16 @@ impl Timeline { } BatchWriterResult::Discarded(l) => { debug!("discarded image layer: {}", l); + if let Some(layer_desc) = layer_selection_by_key.get(&l) { + stat.discard_image_layer(layer_desc.file_size()); + } else { + tracing::warn!( + "discarded image layer not in layer_selection: {}, produced a layer outside of the compaction key range?", + l + ); + stat.discard_image_layer(0); + } keep_layers.insert(l); - stat.discard_image_layer(); } } } @@ -3166,7 +3365,9 @@ impl Timeline { &layer.layer_desc().key_range, &job_desc.compaction_key_range, ) { - bail!("violated constraint: image layer outside of compaction key range"); + return Err(CompactionError::Other(anyhow!( + "violated constraint: image layer outside of compaction key range" + ))); } if !fully_contains( &job_desc.compaction_key_range, @@ -3179,13 +3380,25 @@ impl Timeline { layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key())); + let time_final_phase = timer.elapsed(); + + stat.time_final_phase_secs = time_final_phase.as_secs_f64(); + stat.time_main_loop_secs = time_main_loop.as_secs_f64(); + stat.time_acquire_lock_secs = time_acquire_lock.as_secs_f64(); + stat.time_download_layer_secs = time_download_layer.as_secs_f64(); + stat.time_analyze_secs = time_analyze.as_secs_f64(); + stat.time_total_secs = begin_timer.elapsed().as_secs_f64(); + stat.finalize(); + info!( "gc-compaction statistics: {}", - serde_json::to_string(&stat)? + serde_json::to_string(&stat) + .context("failed to serialize gc-compaction statistics") + .map_err(CompactionError::Other)? 
); if dry_run { - return Ok(()); + return Ok(CompactionOutcome::Done); } info!( @@ -3220,10 +3433,10 @@ impl Timeline { // the writer, so potentially, we will need a function like `ImageLayerBatchWriter::get_all_pending_layer_keys` to get all the keys that are // in the writer before finalizing the persistent layers. Now we would leave some dangling layers on the disk if the check fails. if let Some(err) = check_valid_layermap(&final_layers) { - bail!( + return Err(CompactionError::Other(anyhow!( "gc-compaction layer map check failed after compaction because {}, compaction result not applied to the layer map due to potential data loss", err - ); + ))); } // Between the sanity check and this compaction update, there could be new layers being flushed, but it should be fine because we only @@ -3275,7 +3488,9 @@ impl Timeline { // find_gc_cutoffs will try accessing things below the cutoff. TODO: ideally, this should // be batched into `schedule_compaction_update`. let disk_consistent_lsn = self.disk_consistent_lsn.load(); - self.schedule_uploads(disk_consistent_lsn, None)?; + self.schedule_uploads(disk_consistent_lsn, None) + .context("failed to schedule uploads") + .map_err(CompactionError::Other)?; // If a layer gets rewritten throughout gc-compaction, we need to keep that layer only in `compact_to` instead // of `compact_from`. 
let compact_from = { @@ -3302,7 +3517,7 @@ impl Timeline { drop(gc_lock); - Ok(()) + Ok(CompactionOutcome::Done) } } diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 7cdc69e55f..c9666bb4e1 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -306,6 +306,7 @@ impl DeleteTimelineFlow { CreateTimelineCause::Delete, crate::tenant::CreateTimelineIdempotency::FailWithConflict, // doesn't matter what we put here None, // doesn't matter what we put here + None, // doesn't matter what we put here ) .context("create_timeline_struct")?; diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs index 67fb89c433..809b350f38 100644 --- a/pageserver/src/tenant/timeline/handle.rs +++ b/pageserver/src/tenant/timeline/handle.rs @@ -1,5 +1,4 @@ -//! An efficient way to keep the timeline gate open without preventing -//! timeline shutdown for longer than a single call to a timeline method. +//! A cache for [`crate::tenant::mgr`]+`Tenant::get_timeline`+`Timeline::gate.enter()`. //! //! # Motivation //! @@ -19,27 +18,32 @@ //! we hold the Timeline gate open while we're invoking the method on the //! Timeline object. //! -//! However, we want to avoid the overhead of entering the gate for every -//! method invocation. -//! -//! Further, for shard routing, we want to avoid calling the tenant manager to -//! resolve the shard for every request. Instead, we want to cache the -//! routing result so we can bypass the tenant manager for all subsequent requests -//! that get routed to that shard. +//! We want to avoid the overhead of doing, for each incoming request, +//! - tenant manager lookup (global rwlock + btreemap lookup for shard routing) +//! - cloning the `Arc` out of the tenant manager so we can +//! release the mgr rwlock before doing any request processing work +//! - re-entering the Timeline gate for each Timeline method invocation. //! 
//! Regardless of how we accomplish the above, it should not //! prevent the Timeline from shutting down promptly. //! +//! //! # Design //! //! ## Data Structures //! -//! There are three user-facing data structures: +//! There are two concepts expressed as associated types in the `Types` trait: +//! - `TenantManager`: the thing that performs the expensive work. It produces +//! a `Timeline` object, which is the other associated type. +//! - `Timeline`: the item that we cache for fast (TenantTimelineId,ShardSelector) lookup. +//! +//! There are three user-facing data structures exposed by this module: //! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime. //! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime. -//! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`. +//! - `Handle`: a smart pointer that derefs to the Types::Timeline. //! - `WeakHandle`: downgrade of a `Handle` that does not keep the gate open, but allows -//! trying to ugprade back to a `Handle`, guaranteeing it's the same `Timeline` *object*. +//! trying to upgrade back to a `Handle`. If successful, a re-upgraded Handle will always +//! point to the same cached `Types::Timeline`. Upgrades never invoke the `TenantManager`. //! //! Internally, there is 0 or 1 `HandleInner` per `(Cache,Timeline)`. //! Since Cache:Connection is 1:1, there is 0 or 1 `HandleInner` per `(Connection,Timeline)`. @@ -64,11 +68,14 @@ //! //! To dispatch a request, the page service connection calls `Cache::get`. //! -//! A cache miss means we consult the tenant manager for shard routing, -//! resulting in an `Arc`. We enter its gate _once_ and store it in the the -//! `Arc>>`. A weak ref is stored in the `Cache` +//! A cache miss means we call Types::TenantManager::resolve for shard routing, +//! cloning the `Arc` out of it, and entering the gate. The result of +//!
resolve() is the object we want to cache, and return `Handle`s to, for subsequent `Cache::get` calls. +//! +//! We wrap the object returned from resolve() in an `Arc` and store that inside the +//! `Arc>>`. A weak ref to the HandleInner is stored in the `Cache` //! and a strong ref in the `PerTimelineState`. -//! A strong ref is returned wrapped in a `Handle`. +//! Another strong ref is returned wrapped in a `Handle`. //! //! For subsequent requests, `Cache::get` will perform a "fast path" shard routing //! and find the weak ref in the cache. @@ -78,51 +85,51 @@ //! While a request is batching, the `Handle` is downgraded to a `WeakHandle`. //! When the batch is ready to be executed, the `WeakHandle` is upgraded back to a `Handle` //! and the request handler dispatches the request to the right `>::$request_method`. -//! It then drops the `Handle`, which drops the `Arc`. +//! It then drops the `Handle`, and thus the `Arc>` inside it. //! //! # Performance //! //! Remember from the introductory section: //! -//! > However, we want to avoid the overhead of entering the gate for every -//! > method invocation. +//! > We want to avoid the overhead of doing, for each incoming request, +//! > - tenant manager lookup (global rwlock + btreemap lookup for shard routing) +//! > - cloning the `Arc` out of the tenant manager so we can +//! > release the mgr rwlock before doing any request processing work +//! > - re-entering the Timeline gate for each Timeline method invocation. //! -//! Why do we want to avoid that? -//! Because the gate is a shared location in memory and entering it involves -//! bumping refcounts, which leads to cache contention if done frequently -//! from multiple cores in parallel. +//! All of these boil down to some state that is either globally shared among all shards +//! or state shared among all tasks that serve a particular timeline. +//! It is either protected by RwLock or manipulated via atomics. +//!
Even atomics are costly when shared across multiple cores. +//! So, we want to avoid any permanent need for coordination between page_service tasks. //! -//! So, we only acquire the `GateGuard` once on `Cache` miss, and wrap it in an `Arc`. -//! That `Arc` is private to the `HandleInner` and hence to the connection. +//! The solution is to add indirection: we wrap the Types::Timeline object that is +//! returned by Types::TenantManager into an Arc that is private to the `HandleInner` +//! and hence to the single Cache / page_service connection. //! (Review the "Data Structures" section if that is unclear to you.) //! -//! A `WeakHandle` is a weak ref to the `HandleInner`. -//! When upgrading a `WeakHandle`, we upgrade to a strong ref to the `HandleInner` and -//! further acquire an additional strong ref to the `Arc` inside it. -//! Again, this manipulation of ref counts is is cheap because `Arc` is private to the connection. //! -//! When downgrading a `Handle` to a `WeakHandle`, we drop the `Arc`. -//! Again, this is cheap because the `Arc` is private to the connection. +//! When upgrading a `WeakHandle`, we upgrade its weak to a strong ref (of the `Mutex`), +//! lock the mutex, take out a clone of the `Arc`, and drop the Mutex guard. +//! The Mutex is not contended because it is private to the connection. +//! And again, the `Arc` clone is cheap because that wrapper +//! Arc's refcounts are private to the connection. +//! +//! Downgrading drops these two Arcs, which again, manipulates refcounts that are private to the connection. //! -//! In addition to the GateGuard, we need to provide `Deref` impl. -//! For this, both `Handle` need infallible access to an `Arc`. -//! We could clone the `Arc` when upgrading a `WeakHandle`, but that would cause contention -//! on the shared memory location that trakcs the refcount of the `Arc`. -//! Instead, we wrap the `Arc` into another `Arc`. -//! so that we can clone it cheaply when upgrading a `WeakHandle`. //! //! # Shutdown //!
//! The attentive reader may have noticed the following reference cycle around the `Arc`: //! //! ```text -//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Timeline +//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Types::Timeline --strong--> Timeline //! ``` //! //! Further, there is this cycle: //! //! ```text -//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> GateGuard --keepalive--> Timeline +//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Types::Timeline --strong--> GateGuard --keepalive--> Timeline //! ``` //! //! The former cycle is a memory leak if not broken. @@ -135,9 +142,12 @@ //! - Timeline shutdown (=> `PerTimelineState::shutdown`) //! - Connection shutdown (=> dropping the `Cache`). //! -//! Both transition the `HandleInner` from [`HandleInner::KeepingTimelineGateOpen`] to -//! [`HandleInner::ShutDown`], which drops the only long-lived strong ref to the -//! `Arc`. +//! Both transition the `HandleInner` from [`HandleInner::Open`] to +//! [`HandleInner::ShutDown`], which drops the only long-lived +//! `Arc`. Once the last short-lived Arc +//! is dropped, the `Types::Timeline` gets dropped and thereby +//! the `GateGuard` and the `Arc` that it stores, +//! thereby breaking both cycles. //! //! `PerTimelineState::shutdown` drops all the `HandleInners` it contains, //! thereby breaking the cycle. @@ -216,7 +226,7 @@ use crate::tenant::mgr::ShardSelector; pub(crate) trait Types: Sized + std::fmt::Debug { type TenantManagerError: Sized + std::fmt::Debug; type TenantManager: TenantManager + Sized; - type Timeline: ArcTimeline + Sized; + type Timeline: Timeline + Sized; } /// Uniquely identifies a [`Cache`] instance over the lifetime of the process. @@ -261,20 +271,15 @@ pub(crate) struct ShardTimelineId { /// See module-level comment. 
pub(crate) struct Handle { - timeline: Arc, - #[allow(dead_code)] // the field exists to keep the gate open - gate_guard: Arc, inner: Arc>>, + open: Arc, } pub(crate) struct WeakHandle { inner: Weak>>, } + enum HandleInner { - KeepingTimelineGateOpen { - #[allow(dead_code)] - gate_guard: Arc, - timeline: Arc, - }, + Open(Arc), ShutDown, } @@ -307,8 +312,7 @@ pub(crate) trait TenantManager { } /// Abstract view of an [`Arc`], for testability. -pub(crate) trait ArcTimeline: Clone { - fn gate(&self) -> &utils::sync::gate::Gate; +pub(crate) trait Timeline { fn shard_timeline_id(&self) -> ShardTimelineId; fn get_shard_identity(&self) -> &ShardIdentity; fn per_timeline_state(&self) -> &PerTimelineState; @@ -318,7 +322,6 @@ pub(crate) trait ArcTimeline: Clone { #[derive(Debug)] pub(crate) enum GetError { TenantManager(T::TenantManagerError), - TimelineGateClosed, PerTimelineStateShutDown, } @@ -434,21 +437,9 @@ impl Cache { } trace!("creating new HandleInner"); - let handle_inner_arc = Arc::new(Mutex::new(HandleInner::KeepingTimelineGateOpen { - gate_guard: Arc::new( - // this enter() is expensive in production code because - // it hits the global Arc::gate refcounts - match timeline.gate().enter() { - Ok(guard) => guard, - Err(_) => { - return Err(GetError::TimelineGateClosed); - } - }, - ), - // this clone is expensive in production code because - // it hits the global Arc::clone refcounts - timeline: Arc::new(timeline.clone()), - })); + let timeline = Arc::new(timeline); + let handle_inner_arc = + Arc::new(Mutex::new(HandleInner::Open(Arc::clone(&timeline)))); let handle_weak = WeakHandle { inner: Arc::downgrade(&handle_inner_arc), }; @@ -503,18 +494,10 @@ impl WeakHandle { }; let lock_guard = inner.lock().expect("poisoned"); match &*lock_guard { - HandleInner::KeepingTimelineGateOpen { - timeline, - gate_guard, - } => { - let gate_guard = Arc::clone(gate_guard); - let timeline = Arc::clone(timeline); + HandleInner::Open(open) => { + let open = Arc::clone(open); 
drop(lock_guard); - Ok(Handle { - timeline, - gate_guard, - inner, - }) + Ok(Handle { open, inner }) } HandleInner::ShutDown => Err(HandleUpgradeError::ShutDown), } @@ -528,7 +511,7 @@ impl WeakHandle { impl std::ops::Deref for Handle { type Target = T::Timeline; fn deref(&self) -> &Self::Target { - &self.timeline + &self.open } } @@ -545,7 +528,7 @@ impl PerTimelineState { /// to the [`Types::Timeline`] that embeds this per-timeline state. /// Even if [`TenantManager::resolve`] would still resolve to it. /// - /// Already-alive [`Handle`]s for will remain open, usable, and keeping the [`ArcTimeline`] alive. + /// Already-alive [`Handle`]s for will remain open, usable, and keeping the [`Types::Timeline`] alive. /// That's ok because they're short-lived. See module-level comment for details. #[instrument(level = "trace", skip_all)] pub(super) fn shutdown(&self) { @@ -611,7 +594,7 @@ impl Drop for Cache { impl HandleInner { fn shutdown(&mut self) -> Option> { match std::mem::replace(self, HandleInner::ShutDown) { - HandleInner::KeepingTimelineGateOpen { timeline, .. 
} => Some(timeline), + HandleInner::Open(timeline) => Some(timeline), HandleInner::ShutDown => { // Duplicate shutdowns are possible because both Cache::drop and PerTimelineState::shutdown // may do it concurrently, but locking rules disallow holding per-timeline-state lock and @@ -631,6 +614,7 @@ mod tests { use pageserver_api::reltag::RelTag; use pageserver_api::shard::ShardStripeSize; use utils::shard::ShardCount; + use utils::sync::gate::GateGuard; use super::*; @@ -641,7 +625,7 @@ mod tests { impl Types for TestTypes { type TenantManagerError = anyhow::Error; type TenantManager = StubManager; - type Timeline = Arc; + type Timeline = Entered; } struct StubManager { @@ -656,17 +640,19 @@ mod tests { myself: Weak, } + struct Entered { + timeline: Arc, + #[allow(dead_code)] // it's stored here to keep the gate open + gate_guard: Arc, + } + impl StubTimeline { fn getpage(&self) { // do nothing } } - impl ArcTimeline for Arc { - fn gate(&self) -> &utils::sync::gate::Gate { - &self.gate - } - + impl Timeline for Entered { fn shard_timeline_id(&self) -> ShardTimelineId { ShardTimelineId { shard_index: self.shard.shard_index(), @@ -688,20 +674,34 @@ mod tests { &self, timeline_id: TimelineId, shard_selector: ShardSelector, - ) -> anyhow::Result> { + ) -> anyhow::Result { for timeline in &self.shards { if timeline.id == timeline_id { + let enter_gate = || { + let gate_guard = timeline.gate.enter()?; + let gate_guard = Arc::new(gate_guard); + anyhow::Ok(gate_guard) + }; match &shard_selector { ShardSelector::Zero if timeline.shard.is_shard_zero() => { - return Ok(Arc::clone(timeline)); + return Ok(Entered { + timeline: Arc::clone(timeline), + gate_guard: enter_gate()?, + }); } ShardSelector::Zero => continue, ShardSelector::Page(key) if timeline.shard.is_key_local(key) => { - return Ok(Arc::clone(timeline)); + return Ok(Entered { + timeline: Arc::clone(timeline), + gate_guard: enter_gate()?, + }); } ShardSelector::Page(_) => continue, ShardSelector::Known(idx) if idx == 
&timeline.shard.shard_index() => { - return Ok(Arc::clone(timeline)); + return Ok(Entered { + timeline: Arc::clone(timeline), + gate_guard: enter_gate()?, + }); } ShardSelector::Known(_) => continue, } @@ -711,6 +711,13 @@ mod tests { } } + impl std::ops::Deref for Entered { + type Target = StubTimeline; + fn deref(&self) -> &Self::Target { + &self.timeline + } + } + #[tokio::test(start_paused = true)] async fn test_timeline_shutdown() { crate::tenant::harness::setup_logging(); @@ -1038,7 +1045,6 @@ mod tests { let key = DBDIR_KEY; // Simulate 10 connections that's opened, used, and closed - let mut used_handles = vec![]; for _ in 0..10 { let mut cache = Cache::::default(); let handle = { @@ -1050,7 +1056,6 @@ mod tests { handle }; handle.getpage(); - used_handles.push(Arc::downgrade(&handle.timeline)); } // No handles exist, thus gates are closed and don't require shutdown. diff --git a/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs b/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs index 184c830464..11df232a10 100644 --- a/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs +++ b/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs @@ -32,6 +32,7 @@ impl HeatmapLayersDownloader { fn new( timeline: Arc, concurrency: usize, + recurse: bool, ctx: RequestContext, ) -> Result { let tl_guard = timeline.gate.enter().map_err(|_| ApiError::Cancelled)?; @@ -60,11 +61,11 @@ impl HeatmapLayersDownloader { tracing::info!( resident_size=%timeline.resident_physical_size(), - heatmap_layers=%heatmap.layers.len(), + heatmap_layers=%heatmap.all_layers().count(), "Starting heatmap layers download" ); - let stream = futures::stream::iter(heatmap.layers.into_iter().filter_map( + let stream = futures::stream::iter(heatmap.all_layers().cloned().filter_map( |layer| { let ctx = ctx.attached_child(); let tl = timeline.clone(); @@ -98,6 +99,20 @@ impl HeatmapLayersDownloader { }, _ = cancel.cancelled() => { tracing::info!("Heatmap layers download 
cancelled"); + return; + } + } + + if recurse { + if let Some(ancestor) = timeline.ancestor_timeline() { + let ctx = ctx.attached_child(); + let res = + ancestor.start_heatmap_layers_download(concurrency, recurse, &ctx); + if let Err(err) = res { + tracing::info!( + "Failed to start heatmap layers download for ancestor: {err}" + ); + } } } } @@ -140,14 +155,20 @@ impl HeatmapLayersDownloader { } impl Timeline { - pub(crate) async fn start_heatmap_layers_download( + pub(crate) fn start_heatmap_layers_download( self: &Arc, concurrency: usize, + recurse: bool, ctx: &RequestContext, ) -> Result<(), ApiError> { let mut locked = self.heatmap_layers_downloader.lock().unwrap(); if locked.as_ref().map(|dl| dl.is_complete()).unwrap_or(true) { - let dl = HeatmapLayersDownloader::new(self.clone(), concurrency, ctx.attached_child())?; + let dl = HeatmapLayersDownloader::new( + self.clone(), + concurrency, + recurse, + ctx.attached_child(), + )?; *locked = Some(dl); Ok(()) } else { diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index e552ea83de..1b489028dc 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -8,14 +8,14 @@ use tracing::trace; use utils::id::TimelineId; use utils::lsn::{AtomicLsn, Lsn}; -use super::TimelineWriterState; +use super::{ReadableLayer, TimelineWriterState}; use crate::config::PageServerConf; use crate::context::RequestContext; use crate::metrics::TimelineMetrics; use crate::tenant::layer_map::{BatchedUpdates, LayerMap}; use crate::tenant::storage_layer::{ AsLayerDesc, InMemoryLayer, Layer, LayerVisibilityHint, PersistentLayerDesc, - PersistentLayerKey, ResidentLayer, + PersistentLayerKey, ReadableLayerWeak, ResidentLayer, }; /// Provides semantic APIs to manipulate the layer map. 
@@ -37,6 +37,21 @@ impl Default for LayerManager { } impl LayerManager { + pub(crate) fn upgrade(&self, weak: ReadableLayerWeak) -> ReadableLayer { + match weak { + ReadableLayerWeak::PersistentLayer(desc) => { + ReadableLayer::PersistentLayer(self.get_from_desc(&desc)) + } + ReadableLayerWeak::InMemoryLayer(desc) => { + let inmem = self + .layer_map() + .expect("no concurrent shutdown") + .in_memory_layer(&desc); + ReadableLayer::InMemoryLayer(inmem) + } + } + } + pub(crate) fn get_from_key(&self, key: &PersistentLayerKey) -> Layer { // The assumption for the `expect()` is that all code maintains the following invariant: // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor. @@ -470,6 +485,25 @@ impl OpenLayerManager { mapping.remove(layer); layer.delete_on_drop(); } + + #[cfg(test)] + pub(crate) fn force_insert_in_memory_layer(&mut self, layer: Arc) { + use pageserver_api::models::InMemoryLayerInfo; + + match layer.info() { + InMemoryLayerInfo::Open { .. } => { + assert!(self.layer_map.open_layer.is_none()); + self.layer_map.open_layer = Some(layer); + } + InMemoryLayerInfo::Frozen { lsn_start, .. 
} => { + if let Some(last) = self.layer_map.frozen_layers.back() { + assert!(last.get_lsn_range().end <= lsn_start); + } + + self.layer_map.frozen_layers.push_back(layer); + } + } + } } pub(crate) struct LayerFileManager(HashMap); diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index f6a577abfc..9f0a877b07 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -1369,6 +1369,10 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS) if (lfc_ctl) value = lfc_ctl->limit; break; + case 8: + key = "file_cache_chunk_size_pages"; + value = BLOCKS_PER_CHUNK; + break; default: SRF_RETURN_DONE(funcctx); } diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index fe463fd4a6..0414661a5f 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -1026,6 +1026,19 @@ prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum, n if (!neon_prefetch_response_usable(&lsns[i], slot)) continue; + /* + * Ignore errors + */ + if (slot->response->tag != T_NeonGetPageResponse) + { + if (slot->response->tag != T_NeonErrorResponse) + { + NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, + "Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x", + T_NeonGetPageResponse, T_NeonErrorResponse, slot->response->tag); + } + continue; + } memcpy(buffers[i], ((NeonGetPageResponse*)slot->response)->page, BLCKSZ); prefetch_set_unused(ring_index); BITMAP_SET(mask, i); diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 356895aa82..7ec4ec99fc 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -83,6 +83,7 @@ static void AssertEventsOkForState(uint32 events, Safekeeper *sk); static char *FormatEvents(WalProposer *wp, uint32 events); static void UpdateDonorShmem(WalProposer *wp); static char *MembershipConfigurationToString(MembershipConfiguration *mconf); +static void MembershipConfigurationCopy(MembershipConfiguration *src, MembershipConfiguration *dst); static void 
MembershipConfigurationFree(MembershipConfiguration *mconf); WalProposer * @@ -97,7 +98,32 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) wp->config = config; wp->api = api; - for (host = wp->config->safekeepers_list; host != NULL && *host != '\0'; host = sep) + wp_log(LOG, "neon.safekeepers=%s", wp->config->safekeepers_list); + + /* + * If safekeepers list starts with g# parse generation number followed by + * : + */ + if (strncmp(wp->config->safekeepers_list, "g#", 2) == 0) + { + char *endptr; + + errno = 0; + wp->safekeepers_generation = strtoul(wp->config->safekeepers_list + 2, &endptr, 10); + if (errno != 0) + { + wp_log(FATAL, "failed to parse neon.safekeepers generation number: %m"); + } + /* Skip past : to the first hostname. */ + host = endptr + 1; + } + else + { + host = wp->config->safekeepers_list; + } + wp_log(LOG, "safekeepers_generation=%u", wp->safekeepers_generation); + + for (; host != NULL && *host != '\0'; host = sep) { port = strchr(host, ':'); if (port == NULL) @@ -183,6 +209,12 @@ WalProposerFree(WalProposer *wp) pfree(wp); } +static bool +WalProposerGenerationsEnabled(WalProposer *wp) +{ + return wp->safekeepers_generation != 0; +} + /* * Create new AppendRequest message and start sending it. This function is * called from walsender every time the new WAL is available. @@ -600,10 +632,14 @@ static void SendStartWALPush(Safekeeper *sk) { WalProposer *wp = sk->wp; + + /* Forbid implicit timeline creation if generations are enabled. */ + char *allow_timeline_creation = WalProposerGenerationsEnabled(wp) ? 
"false" : "true"; #define CMD_LEN 512 char cmd[CMD_LEN]; - snprintf(cmd, CMD_LEN, "START_WAL_PUSH (proto_version '%d')", wp->config->proto_version); + + snprintf(cmd, CMD_LEN, "START_WAL_PUSH (proto_version '%d', allow_timeline_creation '%s')", wp->config->proto_version, allow_timeline_creation); if (!wp->api.conn_send_query(sk, cmd)) { wp_log(WARNING, "failed to send '%s' query to safekeeper %s:%s: %s", @@ -705,6 +741,18 @@ RecvAcceptorGreeting(Safekeeper *sk) sk->host, sk->port, sk->greetResponse.nodeId, mconf_toml, sk->greetResponse.term); pfree(mconf_toml); + /* + * Adopt mconf of safekeepers if it is higher. TODO: mconf change should + * restart wp if it started voting. + */ + if (sk->greetResponse.mconf.generation > wp->mconf.generation) + { + MembershipConfigurationFree(&wp->mconf); + MembershipConfigurationCopy(&sk->greetResponse.mconf, &wp->mconf); + /* full conf was just logged above */ + wp_log(LOG, "changed mconf to generation %u", wp->mconf.generation); + } + /* Protocol is all good, move to voting. */ sk->state = SS_VOTING; @@ -1896,7 +1944,8 @@ PAMessageSerialize(WalProposer *wp, ProposerAcceptorMessage *msg, StringInfo buf pq_sendint64_le(buf, m->termHistory->entries[i].term); pq_sendint64_le(buf, m->termHistory->entries[i].lsn); } - /* + + /* * Removed timeline_start_lsn. Still send it as a valid * value until safekeepers taking it from term history are * deployed. 
@@ -2162,7 +2211,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) } } wp_log(FATAL, "unsupported proto_version %d", wp->config->proto_version); - return false; /* keep the compiler quiet */ + return false; /* keep the compiler quiet */ } /* @@ -2570,6 +2619,18 @@ MembershipConfigurationToString(MembershipConfiguration *mconf) return s.data; } +static void +MembershipConfigurationCopy(MembershipConfiguration *src, MembershipConfiguration *dst) +{ + dst->generation = src->generation; + dst->members.len = src->members.len; + dst->members.m = palloc0(sizeof(SafekeeperId) * dst->members.len); + memcpy(dst->members.m, src->members.m, sizeof(SafekeeperId) * dst->members.len); + dst->new_members.len = src->new_members.len; + dst->new_members.m = palloc0(sizeof(SafekeeperId) * dst->new_members.len); + memcpy(dst->new_members.m, src->new_members.m, sizeof(SafekeeperId) * dst->new_members.len); +} + static void MembershipConfigurationFree(MembershipConfiguration *mconf) { diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index eee55f924f..8d1ae26cac 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -160,7 +160,10 @@ typedef struct MemberSet SafekeeperId *m; /* ids themselves */ } MemberSet; -/* Timeline safekeeper membership configuration. */ +/* + * Timeline safekeeper membership configuration as sent in the + * protocol. + */ typedef struct MembershipConfiguration { Generation generation; @@ -761,8 +764,22 @@ typedef struct WalProposer /* (n_safekeepers / 2) + 1 */ int quorum; + /* + * Generation of the membership conf of which safekeepers[] are presumably + * members. To make cplane life a bit easier and have more control in + * tests with which sks walproposer gets connected neon.safekeepers GUC + * doesn't provide full mconf, only the list of endpoints to connect to. 
+ * We still would like to know generation associated with it because 1) we + * need some handle to enforce using generations in walproposer, and + * non-zero value of this serves the purpose; 2) currently we don't do + * that, but in theory walproposer can update list of safekeepers to + * connect to upon receiving mconf from safekeepers, and generation number + * must be checked to see which list is newer. + */ + Generation safekeepers_generation; /* Number of occupied slots in safekeepers[] */ int n_safekeepers; + /* Safekeepers walproposer is connecting to. */ Safekeeper safekeeper[MAX_SAFEKEEPERS]; /* WAL has been generated up to this point */ diff --git a/pgxn/neon_walredo/inmem_smgr.c b/pgxn/neon_walredo/inmem_smgr.c index ff2846a9e7..75b9ab4464 100644 --- a/pgxn/neon_walredo/inmem_smgr.c +++ b/pgxn/neon_walredo/inmem_smgr.c @@ -32,8 +32,8 @@ #include "inmem_smgr.h" -/* Size of the in-memory smgr: XLR_MAX_BLOCK_ID is 32, but we can update up to 3 forks for each block */ -#define MAX_PAGES 100 +/* Size of the in-memory smgr: XLR_MAX_BLOCK_ID is 32, so assume that 64 will be enough */ +#define MAX_PAGES 64 /* If more than WARN_PAGES are used, print a warning in the log */ #define WARN_PAGES 32 @@ -174,10 +174,7 @@ static void inmem_zeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync) { - char buffer[BLCKSZ] = {0}; - - for (int i = 0; i < nblocks; i++) - inmem_extend(reln, forknum, blocknum + i, buffer, skipFsync); + /* Do nothing: inmem_read will return zero page in any case */ } #endif diff --git a/poetry.lock b/poetry.lock index ba3b0535e4..03aa543b06 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1414,14 +1414,14 @@ files = [ [[package]] name = "jinja2" -version = "3.1.5" +version = "3.1.6" description = "A very fast and expressive template engine." 
optional = false python-versions = ">=3.7" groups = ["main"] files = [ - {file = "jinja2-3.1.5-py3-none-any.whl", hash = "sha256:aba0f4dc9ed8013c424088f68a5c226f7d6097ed89b246d7749c2ec4175c6adb"}, - {file = "jinja2-3.1.5.tar.gz", hash = "sha256:8fefff8dc3034e27bb80d67c671eb8a9bc424c0ef4c0826edbff304cceff43bb"}, + {file = "jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67"}, + {file = "jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d"}, ] [package.dependencies] @@ -3820,4 +3820,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" python-versions = "^3.11" -content-hash = "9711c5479c867fa614ce3d352f1bbc63dba1cb2376d347f96fbeda6f512ee308" +content-hash = "010ffce959bb256880ab5a267048c182e4612b3151f9a94e3bf5d3a7807962fe" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 5964b76ecf..b6e3f03a81 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -53,7 +53,7 @@ measured = { workspace = true, features = ["lasso"] } metrics.workspace = true once_cell.workspace = true opentelemetry = { workspace = true, features = ["trace"] } -papaya = "0.1.8" +papaya = "0.2.0" parking_lot.workspace = true parquet.workspace = true parquet_derive.workspace = true diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs index 9c3a3772cd..7a6dceb194 100644 --- a/proxy/src/auth/backend/local.rs +++ b/proxy/src/auth/backend/local.rs @@ -35,6 +35,7 @@ impl LocalBackend { endpoint_id: EndpointIdTag::get_interner().get_or_intern("local"), project_id: ProjectIdTag::get_interner().get_or_intern("local"), branch_id: BranchIdTag::get_interner().get_or_intern("local"), + compute_id: "local".into(), cold_start_info: ColdStartInfo::WarmCached, }, }, diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 5447a4a4c0..dfa6015b10 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,3 +1,4 @@ +use std::fmt::Debug; use std::io; use 
std::net::SocketAddr; use std::time::Duration; @@ -10,7 +11,7 @@ use postgres_protocol::message::backend::NoticeResponseBody; use pq_proto::StartupMessageParams; use rustls::pki_types::InvalidDnsNameError; use thiserror::Error; -use tokio::net::TcpStream; +use tokio::net::{TcpStream, lookup_host}; use tracing::{debug, error, info, warn}; use crate::auth::backend::ComputeUserInfo; @@ -180,21 +181,19 @@ impl ConnCfg { use postgres_client::config::Host; // wrap TcpStream::connect with timeout - let connect_with_timeout = |host, port| { - tokio::time::timeout(timeout, TcpStream::connect((host, port))).map( - move |res| match res { - Ok(tcpstream_connect_res) => tcpstream_connect_res, - Err(_) => Err(io::Error::new( - io::ErrorKind::TimedOut, - format!("exceeded connection timeout {timeout:?}"), - )), - }, - ) + let connect_with_timeout = |addrs| { + tokio::time::timeout(timeout, TcpStream::connect(addrs)).map(move |res| match res { + Ok(tcpstream_connect_res) => tcpstream_connect_res, + Err(_) => Err(io::Error::new( + io::ErrorKind::TimedOut, + format!("exceeded connection timeout {timeout:?}"), + )), + }) }; - let connect_once = |host, port| { - debug!("trying to connect to compute node at {host}:{port}"); - connect_with_timeout(host, port).and_then(|stream| async { + let connect_once = |addrs| { + debug!("trying to connect to compute node at {addrs:?}"); + connect_with_timeout(addrs).and_then(|stream| async { let socket_addr = stream.peer_addr()?; let socket = socket2::SockRef::from(&stream); // Disable Nagle's algorithm to not introduce latency between @@ -216,7 +215,12 @@ impl ConnCfg { Host::Tcp(host) => host.as_str(), }; - match connect_once(host, port).await { + let addrs = match self.0.get_host_addr() { + Some(addr) => vec![SocketAddr::new(addr, port)], + None => lookup_host((host, port)).await?.collect(), + }; + + match connect_once(&*addrs).await { Ok((sockaddr, stream)) => Ok((sockaddr, stream, host)), Err(err) => { warn!("couldn't connect to compute node at 
{host}:{port}: {err}"); @@ -277,13 +281,15 @@ impl ConnCfg { } = connection; tracing::Span::current().record("pid", tracing::field::display(process_id)); + tracing::Span::current().record("compute_id", tracing::field::display(&aux.compute_id)); let stream = stream.into_inner(); // TODO: lots of useful info but maybe we can move it elsewhere (eg traces?) info!( cold_start_info = ctx.cold_start_info().as_str(), - "connected to compute node at {host} ({socket_addr}) sslmode={:?}", - self.0.get_ssl_mode() + "connected to compute node at {host} ({socket_addr}) sslmode={:?}, latency={}", + self.0.get_ssl_mode(), + ctx.get_proxy_latency(), ); // NB: CancelToken is supposed to hold socket_addr, but we use connect_raw. diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index f87f4e9ef8..e10a04b4f1 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -17,7 +17,8 @@ use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::error::ErrorKind; use crate::intern::{BranchIdInt, ProjectIdInt}; use crate::metrics::{ - ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting, + ConnectOutcome, InvalidEndpointsGroup, LatencyAccumulated, LatencyTimer, Metrics, Protocol, + Waiting, }; use crate::protocol2::{ConnectionInfo, ConnectionInfoExtra}; use crate::types::{DbName, EndpointId, RoleName}; @@ -346,6 +347,14 @@ impl RequestContext { } } + pub(crate) fn get_proxy_latency(&self) -> LatencyAccumulated { + self.0 + .try_lock() + .expect("should not deadlock") + .latency_timer + .accumulated() + } + pub(crate) fn success(&self) { self.0 .try_lock() diff --git a/proxy/src/control_plane/client/cplane_proxy_v1.rs b/proxy/src/control_plane/client/cplane_proxy_v1.rs index 977fcf4727..2765aaa462 100644 --- a/proxy/src/control_plane/client/cplane_proxy_v1.rs +++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs @@ -1,5 +1,7 @@ //! Production console backend. 
+use std::net::IpAddr; +use std::str::FromStr; use std::sync::Arc; use std::time::Duration; @@ -274,11 +276,27 @@ impl NeonControlPlaneClient { Some(x) => x, }; + let host_addr = IpAddr::from_str(host).ok(); + + let ssl_mode = match &body.server_name { + Some(_) => SslMode::Require, + None => SslMode::Disable, + }; + let host_name = match body.server_name { + Some(host) => host, + None => host.to_owned(), + }; + // Don't set anything but host and port! This config will be cached. // We'll set username and such later using the startup message. // TODO: add more type safety (in progress). - let mut config = compute::ConnCfg::new(host.to_owned(), port); - config.ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes. + let mut config = compute::ConnCfg::new(host_name, port); + + if let Some(addr) = host_addr { + config.set_host_addr(addr); + } + + config.ssl_mode(ssl_mode); let node = NodeInfo { config, diff --git a/proxy/src/control_plane/client/mock.rs b/proxy/src/control_plane/client/mock.rs index 7da5464aa5..ee722e839e 100644 --- a/proxy/src/control_plane/client/mock.rs +++ b/proxy/src/control_plane/client/mock.rs @@ -1,5 +1,6 @@ //! Mock console backend which relies on a user-provided postgres instance. 
+use std::net::{IpAddr, Ipv4Addr}; use std::str::FromStr; use std::sync::Arc; @@ -167,10 +168,22 @@ impl MockControlPlane { } async fn do_wake_compute(&self) -> Result { - let mut config = compute::ConnCfg::new( - self.endpoint.host_str().unwrap_or("localhost").to_owned(), - self.endpoint.port().unwrap_or(5432), - ); + let port = self.endpoint.port().unwrap_or(5432); + let mut config = match self.endpoint.host_str() { + None => { + let mut config = compute::ConnCfg::new("localhost".to_string(), port); + config.set_host_addr(IpAddr::V4(Ipv4Addr::LOCALHOST)); + config + } + Some(host) => { + let mut config = compute::ConnCfg::new(host.to_string(), port); + if let Ok(addr) = IpAddr::from_str(host) { + config.set_host_addr(addr); + } + config + } + }; + config.ssl_mode(postgres_client::config::SslMode::Disable); let node = NodeInfo { @@ -179,6 +192,7 @@ impl MockControlPlane { endpoint_id: (&EndpointId::from("endpoint")).into(), project_id: (&ProjectId::from("project")).into(), branch_id: (&BranchId::from("branch")).into(), + compute_id: "compute".into(), cold_start_info: crate::control_plane::messages::ColdStartInfo::Warm, }, }; diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs index 8d6b2e96f5..ec4554eab5 100644 --- a/proxy/src/control_plane/messages.rs +++ b/proxy/src/control_plane/messages.rs @@ -2,6 +2,7 @@ use std::fmt::{self, Display}; use measured::FixedCardinalityLabel; use serde::{Deserialize, Serialize}; +use smol_str::SmolStr; use crate::auth::IpPattern; use crate::intern::{AccountIdInt, BranchIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt}; @@ -239,6 +240,7 @@ pub(crate) struct GetEndpointAccessControl { #[derive(Debug, Deserialize)] pub(crate) struct WakeCompute { pub(crate) address: Box, + pub(crate) server_name: Option, pub(crate) aux: MetricsAuxInfo, } @@ -312,6 +314,9 @@ pub(crate) struct MetricsAuxInfo { pub(crate) endpoint_id: EndpointIdInt, pub(crate) project_id: ProjectIdInt, pub(crate) branch_id: BranchIdInt, 
+ // note: we don't use interned strings for compute IDs. + // they churn too quickly and we have no way to clean up interned strings. + pub(crate) compute_id: SmolStr, #[serde(default)] pub(crate) cold_start_info: ColdStartInfo, } @@ -378,6 +383,7 @@ mod tests { "endpoint_id": "endpoint", "project_id": "project", "branch_id": "branch", + "compute_id": "compute", "cold_start_info": "unknown", }) } diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index 3c34918d84..6f9845fd6e 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -1,9 +1,11 @@ use std::cell::{Cell, RefCell}; use std::collections::HashMap; use std::hash::BuildHasher; -use std::{env, io}; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::{array, env, fmt, io}; use chrono::{DateTime, Utc}; +use indexmap::IndexSet; use opentelemetry::trace::TraceContextExt; use scopeguard::defer; use serde::ser::{SerializeMap, Serializer}; @@ -17,6 +19,7 @@ use tracing_subscriber::fmt::{FormatEvent, FormatFields}; use tracing_subscriber::layer::{Context, Layer}; use tracing_subscriber::prelude::*; use tracing_subscriber::registry::{LookupSpan, SpanRef}; +use try_lock::TryLock; /// Initialize logging and OpenTelemetry tracing and exporter. /// @@ -46,13 +49,13 @@ pub async fn init() -> anyhow::Result { let otlp_layer = tracing_utils::init_tracing("proxy").await; let json_log_layer = if logfmt == LogFormat::Json { - Some(JsonLoggingLayer { - clock: RealClock, - skipped_field_indices: papaya::HashMap::default(), - writer: StderrWriter { + Some(JsonLoggingLayer::new( + RealClock, + StderrWriter { stderr: std::io::stderr(), }, - }) + ["request_id", "session_id", "conn_id"], + )) } else { None }; @@ -191,13 +194,39 @@ thread_local! { } /// Implements tracing layer to handle events specific to logging. 
-struct JsonLoggingLayer { +struct JsonLoggingLayer { clock: C, skipped_field_indices: papaya::HashMap, + callsite_ids: papaya::HashMap, writer: W, + // We use a const generic and arrays to bypass one heap allocation. + extract_fields: IndexSet<&'static str>, + _marker: std::marker::PhantomData<[&'static str; F]>, } -impl Layer for JsonLoggingLayer +impl JsonLoggingLayer { + fn new(clock: C, writer: W, extract_fields: [&'static str; F]) -> Self { + JsonLoggingLayer { + clock, + skipped_field_indices: papaya::HashMap::default(), + callsite_ids: papaya::HashMap::default(), + writer, + extract_fields: IndexSet::from_iter(extract_fields), + _marker: std::marker::PhantomData, + } + } + + #[inline] + fn callsite_id(&self, cs: callsite::Identifier) -> CallsiteId { + *self + .callsite_ids + .pin() + .get_or_insert_with(cs, CallsiteId::next) + } +} + +impl Layer + for JsonLoggingLayer where S: Subscriber + for<'a> LookupSpan<'a>, { @@ -211,7 +240,14 @@ where let res: io::Result<()> = REENTRANCY_GUARD.with(move |entered| { if entered.get() { let mut formatter = EventFormatter::new(); - formatter.format(now, event, &ctx, &self.skipped_field_indices)?; + formatter.format::( + now, + event, + &ctx, + &self.skipped_field_indices, + &self.callsite_ids, + &self.extract_fields, + )?; self.writer.make_writer().write_all(formatter.buffer()) } else { entered.set(true); @@ -219,7 +255,14 @@ where EVENT_FORMATTER.with_borrow_mut(move |formatter| { formatter.reset(); - formatter.format(now, event, &ctx, &self.skipped_field_indices)?; + formatter.format::( + now, + event, + &ctx, + &self.skipped_field_indices, + &self.callsite_ids, + &self.extract_fields, + )?; self.writer.make_writer().write_all(formatter.buffer()) }) } @@ -246,10 +289,13 @@ where let span = ctx.span(id).expect("span must exist"); let fields = SpanFields::default(); fields.record_fields(attrs); + // This could deadlock when there's a panic somewhere in the tracing // event handling and a read or write guard is still 
held. This includes // the OTel subscriber. - span.extensions_mut().insert(fields); + let mut exts = span.extensions_mut(); + + exts.insert(fields); } fn on_record(&self, id: &span::Id, values: &span::Record<'_>, ctx: Context<'_, S>) { @@ -265,6 +311,7 @@ where /// wins. fn register_callsite(&self, metadata: &'static Metadata<'static>) -> Interest { if !metadata.is_event() { + self.callsite_id(metadata.callsite()); // Must not be never because we wouldn't get trace and span data. return Interest::always(); } @@ -297,6 +344,26 @@ where } } +#[derive(Copy, Clone, Debug, Default)] +#[repr(transparent)] +struct CallsiteId(u32); + +impl CallsiteId { + #[inline] + fn next() -> Self { + // Start at 1 to reserve 0 for default. + static COUNTER: AtomicU32 = AtomicU32::new(1); + CallsiteId(COUNTER.fetch_add(1, Ordering::Relaxed)) + } +} + +impl fmt::Display for CallsiteId { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.0.fmt(f) + } +} + /// Stores span field values recorded during the spans lifetime. #[derive(Default)] struct SpanFields { @@ -448,12 +515,14 @@ impl EventFormatter { self.logline_buffer.clear(); } - fn format( + fn format( &mut self, now: DateTime, event: &Event<'_>, ctx: &Context<'_, S>, skipped_field_indices: &papaya::HashMap, + callsite_ids: &papaya::HashMap, + extract_fields: &IndexSet<&'static str>, ) -> io::Result<()> where S: Subscriber + for<'a> LookupSpan<'a>, @@ -485,6 +554,7 @@ impl EventFormatter { event.record(&mut message_extractor); let mut serializer = message_extractor.into_serializer()?; + // Direct message fields. let mut fields_present = FieldsPresent(false, skipped_field_indices); event.record(&mut fields_present); if fields_present.0 { @@ -494,7 +564,16 @@ impl EventFormatter { )?; } + let spans = SerializableSpans { + ctx, + callsite_ids, + extract: ExtractedSpanFields::<'_, F>::new(extract_fields), + }; + serializer.serialize_entry("spans", &spans)?; + + // TODO: thread-local cache? 
let pid = std::process::id(); + // Skip adding pid 1 to reduce noise for services running in containers. if pid != 1 { serializer.serialize_entry("process_id", &pid)?; } @@ -514,6 +593,7 @@ impl EventFormatter { serializer.serialize_entry("target", meta.target())?; + // Skip adding module if it's the same as target. if let Some(module) = meta.module_path() { if module != meta.target() { serializer.serialize_entry("module", module)?; @@ -540,7 +620,10 @@ impl EventFormatter { } } - serializer.serialize_entry("spans", &SerializableSpanStack(ctx))?; + if spans.extract.has_values() { + // TODO: add fields from event, too? + serializer.serialize_entry("extract", &spans.extract)?; + } serializer.end() }; @@ -818,15 +901,20 @@ impl tracing::field::Visit for MessageFieldSkipper< } } -/// Serializes the span stack from root to leaf (parent of event) enumerated -/// inside an object where the keys are just the number padded with zeroes -/// to retain sorting order. -// The object is necessary because Loki cannot flatten arrays. -struct SerializableSpanStack<'a, 'b, Span>(&'b Context<'a, Span>) +/// Serializes the span stack from root to leaf (parent of event) as object +/// with the span names as keys. To prevent collision we append a numberic value +/// to the name. Also, collects any span fields we're interested in. Last one +/// wins. 
+struct SerializableSpans<'a, 'ctx, Span, const F: usize> where - Span: Subscriber + for<'lookup> LookupSpan<'lookup>; + Span: Subscriber + for<'lookup> LookupSpan<'lookup>, +{ + ctx: &'a Context<'ctx, Span>, + callsite_ids: &'a papaya::HashMap, + extract: ExtractedSpanFields<'a, F>, +} -impl serde::ser::Serialize for SerializableSpanStack<'_, '_, Span> +impl serde::ser::Serialize for SerializableSpans<'_, '_, Span, F> where Span: Subscriber + for<'lookup> LookupSpan<'lookup>, { @@ -836,9 +924,24 @@ where { let mut serializer = serializer.serialize_map(None)?; - if let Some(leaf_span) = self.0.lookup_current() { - for (i, span) in leaf_span.scope().from_root().enumerate() { - serializer.serialize_entry(&format_args!("{i:02}"), &SerializableSpan(&span))?; + if let Some(leaf_span) = self.ctx.lookup_current() { + for span in leaf_span.scope().from_root() { + // Append a numeric callsite ID to the span name to keep the name unique + // in the JSON object. + let cid = self + .callsite_ids + .pin() + .get(&span.metadata().callsite()) + .copied() + .unwrap_or_default(); + + // Loki turns the # into an underscore during field name concatenation. + serializer.serialize_key(&format_args!("{}#{}", span.metadata().name(), &cid))?; + + serializer.serialize_value(&SerializableSpanFields { + span: &span, + extract: &self.extract, + })?; } } @@ -846,28 +949,79 @@ where } } -/// Serializes a single span. Include the span ID, name and its fields as -/// recorded up to this point. -struct SerializableSpan<'a, 'b, Span>(&'b SpanRef<'a, Span>) -where - Span: for<'lookup> LookupSpan<'lookup>; - -impl serde::ser::Serialize for SerializableSpan<'_, '_, Span> +/// Serializes the span fields as object. 
+struct SerializableSpanFields<'a, 'span, Span, const F: usize> where Span: for<'lookup> LookupSpan<'lookup>, { - fn serialize(&self, serializer: Ser) -> Result + span: &'a SpanRef<'span, Span>, + extract: &'a ExtractedSpanFields<'a, F>, +} + +impl serde::ser::Serialize for SerializableSpanFields<'_, '_, Span, F> +where + Span: for<'lookup> LookupSpan<'lookup>, +{ + fn serialize(&self, serializer: S) -> Result where - Ser: serde::ser::Serializer, + S: serde::ser::Serializer, { let mut serializer = serializer.serialize_map(None)?; - // TODO: the span ID is probably only useful for debugging tracing. - serializer.serialize_entry("span_id", &format_args!("{:016x}", self.0.id().into_u64()))?; - serializer.serialize_entry("span_name", self.0.metadata().name())?; - let ext = self.0.extensions(); + let ext = self.span.extensions(); if let Some(data) = ext.get::() { - for (key, value) in &data.fields.pin() { + for (name, value) in &data.fields.pin() { + serializer.serialize_entry(name, value)?; + // TODO: replace clone with reference, if possible. + self.extract.set(name, value.clone()); + } + } + + serializer.end() + } +} + +struct ExtractedSpanFields<'a, const F: usize> { + names: &'a IndexSet<&'static str>, + // TODO: replace TryLock with something local thread and interior mutability. + // serde API doesn't let us use `mut`. 
+ values: TryLock<([Option; F], bool)>, +} + +impl<'a, const F: usize> ExtractedSpanFields<'a, F> { + fn new(names: &'a IndexSet<&'static str>) -> Self { + ExtractedSpanFields { + names, + values: TryLock::new((array::from_fn(|_| Option::default()), false)), + } + } + + #[inline] + fn set(&self, name: &'static str, value: serde_json::Value) { + if let Some((index, _)) = self.names.get_full(name) { + let mut fields = self.values.try_lock().expect("thread-local use"); + fields.0[index] = Some(value); + fields.1 = true; + } + } + + #[inline] + fn has_values(&self) -> bool { + self.values.try_lock().expect("thread-local use").1 + } +} + +impl serde::ser::Serialize for ExtractedSpanFields<'_, F> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::ser::Serializer, + { + let mut serializer = serializer.serialize_map(None)?; + + let values = self.values.try_lock().expect("thread-local use"); + for (i, value) in values.0.iter().enumerate() { + if let Some(value) = value { + let key = self.names[i]; serializer.serialize_entry(key, value)?; } } @@ -879,6 +1033,7 @@ where #[cfg(test)] #[allow(clippy::unwrap_used)] mod tests { + use std::marker::PhantomData; use std::sync::{Arc, Mutex, MutexGuard}; use assert_json_diff::assert_json_eq; @@ -927,14 +1082,17 @@ mod tests { let log_layer = JsonLoggingLayer { clock: clock.clone(), skipped_field_indices: papaya::HashMap::default(), + callsite_ids: papaya::HashMap::default(), writer: buffer.clone(), + extract_fields: IndexSet::from_iter(["x"]), + _marker: PhantomData::<[&'static str; 1]>, }; let registry = tracing_subscriber::Registry::default().with(log_layer); tracing::subscriber::with_default(registry, || { - info_span!("span1", x = 40, x = 41, x = 42).in_scope(|| { - info_span!("span2").in_scope(|| { + info_span!("some_span", x = 24).in_scope(|| { + info_span!("some_span", x = 40, x = 41, x = 42).in_scope(|| { tracing::error!( a = 1, a = 2, @@ -960,16 +1118,16 @@ mod tests { "a": 3, }, "spans": { - "00":{ - 
"span_id": "0000000000000001", - "span_name": "span1", - "x": 42, + "some_span#1":{ + "x": 24, }, - "01": { - "span_id": "0000000000000002", - "span_name": "span2", + "some_span#2": { + "x": 42, } }, + "extract": { + "x": 42, + }, "src": actual.as_object().unwrap().get("src").unwrap().as_str().unwrap(), "target": "proxy::logging::tests", "process_id": actual.as_object().unwrap().get("process_id").unwrap().as_number().unwrap(), diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index db1f096de1..b6a2a059ea 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -394,21 +394,31 @@ pub enum RedisMsgKind { HDel, } -#[derive(Default)] -struct Accumulated { +#[derive(Default, Clone)] +pub struct LatencyAccumulated { cplane: time::Duration, client: time::Duration, compute: time::Duration, retry: time::Duration, } +impl std::fmt::Display for LatencyAccumulated { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "client: {:?}, cplane: {:?}, compute: {:?}, retry: {:?}", + self.client, self.cplane, self.compute, self.retry + ) + } +} + pub struct LatencyTimer { // time since the stopwatch was started start: time::Instant, // time since the stopwatch was stopped stop: Option, // accumulated time on the stopwatch - accumulated: Accumulated, + accumulated: LatencyAccumulated, // label data protocol: Protocol, cold_start_info: ColdStartInfo, @@ -422,7 +432,7 @@ impl LatencyTimer { Self { start: time::Instant::now(), stop: None, - accumulated: Accumulated::default(), + accumulated: LatencyAccumulated::default(), protocol, cold_start_info: ColdStartInfo::Unknown, // assume failed unless otherwise specified @@ -435,7 +445,7 @@ impl LatencyTimer { Self { start: time::Instant::now(), stop: None, - accumulated: Accumulated::default(), + accumulated: LatencyAccumulated::default(), protocol, cold_start_info: ColdStartInfo::Unknown, // assume failed unless otherwise specified @@ -465,6 +475,10 @@ impl LatencyTimer { // success 
self.outcome = ConnectOutcome::Success; } + + pub fn accumulated(&self) -> LatencyAccumulated { + self.accumulated.clone() + } } #[derive(FixedCardinalityLabel, Clone, Copy, Debug)] @@ -511,7 +525,7 @@ impl Drop for LatencyTimer { duration.saturating_sub(accumulated_total).as_secs_f64(), ); - // Exclude client cplane, compue communication from the accumulated time. + // Exclude client, cplane, compute communication from the accumulated time. let accumulated_total = self.accumulated.client + self.accumulated.cplane + self.accumulated.compute; metric.observe( @@ -524,7 +538,7 @@ impl Drop for LatencyTimer { duration.saturating_sub(accumulated_total).as_secs_f64(), ); - // Exclude client cplane, compue, retry communication from the accumulated time. + // Exclude client, cplane, compute, retry communication from the accumulated time. let accumulated_total = self.accumulated.client + self.accumulated.cplane + self.accumulated.compute diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index b8b39fa121..e013fbbe2e 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -81,7 +81,10 @@ impl ConnectMechanism for TcpMechanism<'_> { type ConnectError = compute::ConnectionError; type Error = compute::ConnectionError; - #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] + #[tracing::instrument(skip_all, fields( + pid = tracing::field::Empty, + compute_id = tracing::field::Empty + ))] async fn connect_once( &self, ctx: &RequestContext, diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 171f539b1e..e0b7539538 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -555,6 +555,7 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn endpoint_id: (&EndpointId::from("endpoint")).into(), project_id: (&ProjectId::from("project")).into(), branch_id: (&BranchId::from("branch")).into(), + compute_id: "compute".into(), 
cold_start_info: crate::control_plane::messages::ColdStartInfo::Warm, }, }; diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 72029102e0..b55661cec8 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -1,4 +1,5 @@ use std::io; +use std::net::{IpAddr, SocketAddr}; use std::sync::Arc; use std::time::Duration; @@ -6,11 +7,15 @@ use async_trait::async_trait; use ed25519_dalek::SigningKey; use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer}; use jose_jwk::jose_b64; +use postgres_client::config::SslMode; use rand::rngs::OsRng; +use rustls::pki_types::{DnsName, ServerName}; use tokio::net::{TcpStream, lookup_host}; +use tokio_rustls::TlsConnector; use tracing::field::display; use tracing::{debug, info}; +use super::AsyncRW; use super::conn_pool::poll_client; use super::conn_pool_lib::{Client, ConnInfo, EndpointConnPool, GlobalConnPool}; use super::http_conn_pool::{self, HttpConnPool, Send, poll_http2_client}; @@ -190,7 +195,11 @@ impl PoolingBackend { // Wake up the destination if needed. Code here is a bit involved because // we reuse the code from the usual proxy and we need to prepare few structures // that this code expects. - #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] + #[tracing::instrument(skip_all, fields( + pid = tracing::field::Empty, + compute_id = tracing::field::Empty, + conn_id = tracing::field::Empty, + ))] pub(crate) async fn connect_to_compute( &self, ctx: &RequestContext, @@ -229,7 +238,10 @@ impl PoolingBackend { } // Wake up the destination if needed - #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] + #[tracing::instrument(skip_all, fields( + compute_id = tracing::field::Empty, + conn_id = tracing::field::Empty, + ))] pub(crate) async fn connect_to_local_proxy( &self, ctx: &RequestContext, @@ -276,7 +288,10 @@ impl PoolingBackend { /// # Panics /// /// Panics if called with a non-local_proxy backend. 
- #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] + #[tracing::instrument(skip_all, fields( + pid = tracing::field::Empty, + conn_id = tracing::field::Empty, + ))] pub(crate) async fn connect_to_local_postgres( &self, ctx: &RequestContext, @@ -552,6 +567,10 @@ impl ConnectMechanism for TokioMechanism { let (client, connection) = permit.release_result(res)?; tracing::Span::current().record("pid", tracing::field::display(client.get_process_id())); + tracing::Span::current().record( + "compute_id", + tracing::field::display(&node_info.aux.compute_id), + ); Ok(poll_client( self.pool.clone(), ctx, @@ -587,16 +606,28 @@ impl ConnectMechanism for HyperMechanism { node_info: &CachedNodeInfo, config: &ComputeConfig, ) -> Result { + let host_addr = node_info.config.get_host_addr(); let host = node_info.config.get_host(); let permit = self.locks.get_permit(&host).await?; let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); + let tls = if node_info.config.get_ssl_mode() == SslMode::Disable { + None + } else { + Some(&config.tls) + }; + let port = node_info.config.get_port(); - let res = connect_http2(&host, port, config.timeout).await; + let res = connect_http2(host_addr, &host, port, config.timeout, tls).await; drop(pause); let (client, connection) = permit.release_result(res)?; + tracing::Span::current().record( + "compute_id", + tracing::field::display(&node_info.aux.compute_id), + ); + Ok(poll_http2_client( self.pool.clone(), ctx, @@ -612,18 +643,22 @@ impl ConnectMechanism for HyperMechanism { } async fn connect_http2( + host_addr: Option, host: &str, port: u16, timeout: Duration, + tls: Option<&Arc>, ) -> Result<(http_conn_pool::Send, http_conn_pool::Connect), LocalProxyConnError> { - // assumption: host is an ip address so this should not actually perform any requests. - // todo: add that assumption as a guarantee in the control-plane API. 
- let mut addrs = lookup_host((host, port)) - .await - .map_err(LocalProxyConnError::Io)?; - + let addrs = match host_addr { + Some(addr) => vec![SocketAddr::new(addr, port)], + None => lookup_host((host, port)) + .await + .map_err(LocalProxyConnError::Io)? + .collect(), + }; let mut last_err = None; + let mut addrs = addrs.into_iter(); let stream = loop { let Some(addr) = addrs.next() else { return Err(last_err.unwrap_or_else(|| { @@ -651,6 +686,20 @@ async fn connect_http2( } }; + let stream = if let Some(tls) = tls { + let host = DnsName::try_from(host) + .map_err(io::Error::other) + .map_err(LocalProxyConnError::Io)? + .to_owned(); + let stream = TlsConnector::from(tls.clone()) + .connect(ServerName::DnsName(host), stream) + .await + .map_err(LocalProxyConnError::Io)?; + Box::pin(stream) as AsyncRW + } else { + Box::pin(stream) as AsyncRW + }; + let (client, connection) = hyper::client::conn::http2::Builder::new(TokioExecutor::new()) .timer(TokioTimer::new()) .keep_alive_interval(Duration::from_secs(20)) diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 6a9089fc2a..516d474a11 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -221,6 +221,7 @@ mod tests { endpoint_id: (&EndpointId::from("endpoint")).into(), project_id: (&ProjectId::from("project")).into(), branch_id: (&BranchId::from("branch")).into(), + compute_id: "compute".into(), cold_start_info: crate::control_plane::messages::ColdStartInfo::Warm, }, conn_id: uuid::Uuid::new_v4(), diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs index 338a79b4b3..bca2d4c165 100644 --- a/proxy/src/serverless/http_conn_pool.rs +++ b/proxy/src/serverless/http_conn_pool.rs @@ -6,9 +6,9 @@ use hyper::client::conn::http2; use hyper_util::rt::{TokioExecutor, TokioIo}; use parking_lot::RwLock; use smol_str::ToSmolStr; -use tokio::net::TcpStream; use tracing::{Instrument, debug, error, info, info_span}; +use 
super::AsyncRW; use super::backend::HttpConnError; use super::conn_pool_lib::{ ClientDataEnum, ClientInnerCommon, ClientInnerExt, ConnInfo, ConnPoolEntry, @@ -22,8 +22,7 @@ use crate::types::EndpointCacheKey; use crate::usage_metrics::{Ids, MetricCounter, TrafficDirection, USAGE_METRICS}; pub(crate) type Send = http2::SendRequest; -pub(crate) type Connect = - http2::Connection, hyper::body::Incoming, TokioExecutor>; +pub(crate) type Connect = http2::Connection, hyper::body::Incoming, TokioExecutor>; #[derive(Clone)] pub(crate) struct ClientDataHttp(); diff --git a/proxy/src/tls/client_config.rs b/proxy/src/tls/client_config.rs index a2d695aae1..ce873e678e 100644 --- a/proxy/src/tls/client_config.rs +++ b/proxy/src/tls/client_config.rs @@ -1,17 +1,49 @@ +use std::env; +use std::io::Cursor; +use std::path::PathBuf; use std::sync::Arc; -use anyhow::bail; +use anyhow::{Context, bail}; use rustls::crypto::ring; -pub(crate) fn load_certs() -> anyhow::Result> { +/// We use an internal certificate authority when establishing a TLS connection with compute. +fn load_internal_certs(store: &mut rustls::RootCertStore) -> anyhow::Result<()> { + let Some(ca_file) = env::var_os("NEON_INTERNAL_CA_FILE") else { + return Ok(()); + }; + let ca_file = PathBuf::from(ca_file); + + let ca = std::fs::read(&ca_file) + .with_context(|| format!("could not read CA from {}", ca_file.display()))?; + + for cert in rustls_pemfile::certs(&mut Cursor::new(&*ca)) { + store + .add(cert.context("could not parse internal CA certificate")?) + .context("could not parse internal CA certificate")?; + } + + Ok(()) +} + +/// For console redirect proxy, we need to establish a connection to compute via pg-sni-router. +/// pg-sni-router needs TLS and uses a Let's Encrypt signed certificate, so we +/// load certificates from our native store. 
+fn load_native_certs(store: &mut rustls::RootCertStore) -> anyhow::Result<()> { let der_certs = rustls_native_certs::load_native_certs(); if !der_certs.errors.is_empty() { bail!("could not parse certificates: {:?}", der_certs.errors); } - let mut store = rustls::RootCertStore::empty(); store.add_parsable_certificates(der_certs.certs); + + Ok(()) +} + +fn load_compute_certs() -> anyhow::Result> { + let mut store = rustls::RootCertStore::empty(); + load_native_certs(&mut store)?; + load_internal_certs(&mut store)?; Ok(Arc::new(store)) } @@ -22,7 +54,7 @@ pub fn compute_client_config_with_root_certs() -> anyhow::Result { + critical!("failed to decode WAL record: {err:?}"); + } + err => error!("failed to read WAL record: {err}"), + }) } .instrument(info_span!("interpreted wal reader")), ); @@ -347,10 +363,12 @@ impl InterpretedWalReader { metric.dec(); } - if let Err(err) = self.run_impl(start_pos).await { - critical!("failed to read WAL record: {err:?}"); - } else { - info!("interpreted wal reader exiting"); + match self.run_impl(start_pos).await { + Err(err @ InterpretedWalReaderError::Decode(_)) => { + critical!("failed to decode WAL record: {err:?}"); + } + Err(err) => error!("failed to read WAL record: {err}"), + Ok(()) => info!("interpreted wal reader exiting"), } Err(CopyStreamHandlerEnd::Other(anyhow!( @@ -412,7 +430,10 @@ impl InterpretedWalReader { .with_context(|| "Failed to interpret WAL")?; for (shard, record) in interpreted { - if record.is_empty() { + // Shard zero needs to track the start LSN of the latest record + // in addition to the LSN of the next record to ingest. The former + // is included in basebackup persisted by the compute in WAL.
+ if !shard.is_shard_zero() && record.is_empty() { continue; } @@ -722,7 +743,7 @@ mod tests { .unwrap(); let resident_tli = tli.wal_residence_guard().await.unwrap(); - let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, None) + let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, c"neon-file:", None) .await .unwrap(); let end_pos = end_watch.get(); @@ -865,10 +886,16 @@ mod tests { let resident_tli = tli.wal_residence_guard().await.unwrap(); let mut next_record_lsns = Vec::default(); - let end_watch = - Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, Some(&mut next_record_lsns)) - .await - .unwrap(); + let end_watch = Env::write_wal( + tli, + start_lsn, + SIZE, + MSG_COUNT, + c"neon-file:", + Some(&mut next_record_lsns), + ) + .await + .unwrap(); let end_pos = end_watch.get(); let streaming_wal_reader = StreamingWalReader::new( @@ -1009,10 +1036,16 @@ mod tests { .unwrap(); let resident_tli = tli.wal_residence_guard().await.unwrap(); - let end_watch = - Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, Some(&mut next_record_lsns)) - .await - .unwrap(); + let end_watch = Env::write_wal( + tli, + start_lsn, + SIZE, + MSG_COUNT, + c"neon-file:", + Some(&mut next_record_lsns), + ) + .await + .unwrap(); assert!(next_record_lsns.len() > 3); let shard_0_start_lsn = next_record_lsns[3]; @@ -1106,4 +1139,88 @@ mod tests { } } } + + #[tokio::test] + async fn test_shard_zero_does_not_skip_empty_records() { + let _ = env_logger::builder().is_test(true).try_init(); + + const SIZE: usize = 8 * 1024; + const MSG_COUNT: usize = 10; + const PG_VERSION: u32 = 17; + + let start_lsn = Lsn::from_str("0/149FD18").unwrap(); + let env = Env::new(true).unwrap(); + let tli = env + .make_timeline(NodeId(1), TenantTimelineId::generate(), start_lsn) + .await + .unwrap(); + + let resident_tli = tli.wal_residence_guard().await.unwrap(); + let mut next_record_lsns = Vec::new(); + let end_watch = Env::write_wal( + tli, + start_lsn, + SIZE, + MSG_COUNT, + // This is a logical 
message prefix that is not persisted to key value storage. + // We use it in order to validate that shard zero receives empty interpreted records. + c"test:", + Some(&mut next_record_lsns), + ) + .await + .unwrap(); + let end_pos = end_watch.get(); + + let streaming_wal_reader = StreamingWalReader::new( + resident_tli, + None, + start_lsn, + end_pos, + end_watch, + MAX_SEND_SIZE, + ); + + let shard = ShardIdentity::unsharded(); + let (records_tx, mut records_rx) = tokio::sync::mpsc::channel::(MSG_COUNT * 2); + + let handle = InterpretedWalReader::spawn( + streaming_wal_reader, + start_lsn, + records_tx, + shard, + PG_VERSION, + &Some("pageserver".to_string()), + ); + + let mut interpreted_records = Vec::new(); + while let Some(batch) = records_rx.recv().await { + interpreted_records.push(batch.records); + if batch.wal_end_lsn == batch.available_wal_end_lsn { + break; + } + } + + let received_next_record_lsns = interpreted_records + .into_iter() + .flat_map(|b| b.records) + .map(|rec| rec.next_record_lsn) + .collect::>(); + + // By default this also includes the start LSN. Trim it since it shouldn't be received.
+ let next_record_lsns = next_record_lsns.into_iter().skip(1).collect::>(); + + assert_eq!(received_next_record_lsns, next_record_lsns); + + handle.abort(); + let mut done = false; + for _ in 0..5 { + if handle.current_position().is_none() { + done = true; + break; + } + tokio::time::sleep(Duration::from_millis(1)).await; + } + + assert!(done); + } } diff --git a/safekeeper/src/test_utils.rs b/safekeeper/src/test_utils.rs index e6f74185c1..618e2b59d2 100644 --- a/safekeeper/src/test_utils.rs +++ b/safekeeper/src/test_utils.rs @@ -1,3 +1,4 @@ +use std::ffi::CStr; use std::sync::Arc; use camino_tempfile::Utf8TempDir; @@ -124,6 +125,7 @@ impl Env { start_lsn: Lsn, msg_size: usize, msg_count: usize, + prefix: &CStr, mut next_record_lsns: Option<&mut Vec>, ) -> anyhow::Result { let (msg_tx, msg_rx) = tokio::sync::mpsc::channel(receive_wal::MSG_QUEUE_SIZE); @@ -133,7 +135,6 @@ impl Env { WalAcceptor::spawn(tli.wal_residence_guard().await?, msg_rx, reply_tx, Some(0)); - let prefix = c"neon-file:"; let prefixlen = prefix.to_bytes_with_nul().len(); assert!(msg_size >= prefixlen); let message = vec![0; msg_size - prefixlen]; diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 930f66a207..d3c841ec09 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -415,6 +415,9 @@ impl From for ApiError { } } +/// We run remote deletion in a background task, this is how it sends its results back. +type RemoteDeletionReceiver = tokio::sync::watch::Receiver>>; + /// Timeline struct manages lifecycle (creation, deletion, restore) of a safekeeper timeline. /// It also holds SharedState and provides mutually exclusive access to it. pub struct Timeline { @@ -446,6 +449,8 @@ pub struct Timeline { manager_ctl: ManagerCtl, conf: Arc, + remote_deletion: std::sync::Mutex>, + /// Hold this gate from code that depends on the Timeline's non-shut-down state. 
While holding /// this gate, you must respect [`Timeline::cancel`] pub(crate) gate: Gate, @@ -494,6 +499,7 @@ impl Timeline { walreceivers, gate: Default::default(), cancel: CancellationToken::default(), + remote_deletion: std::sync::Mutex::new(None), manager_ctl: ManagerCtl::new(), conf, broker_active: AtomicBool::new(false), @@ -598,15 +604,95 @@ impl Timeline { shared_state.sk.close_wal_store(); if !only_local && self.conf.is_wal_backup_enabled() { - // Note: we concurrently delete remote storage data from multiple - // safekeepers. That's ok, s3 replies 200 if object doesn't exist and we - // do some retries anyway. - wal_backup::delete_timeline(&self.ttid).await?; + self.remote_delete().await?; } let dir_existed = delete_dir(&self.timeline_dir).await?; Ok(dir_existed) } + /// Delete timeline content from remote storage. If the returned future is dropped, + /// deletion will continue in the background. + /// + /// This function ordinarily spawns a task and stashes a result receiver into [`Self::remote_deletion`]. If + /// deletion is already happening, it may simply wait for an existing task's result. + /// + /// Note: we concurrently delete remote storage data from multiple + /// safekeepers. That's ok, s3 replies 200 if object doesn't exist and we + /// do some retries anyway. + async fn remote_delete(&self) -> Result<()> { + // We will start a background task to do the deletion, so that it proceeds even if our + // API request is dropped. Future requests will see the existing deletion task and wait + // for it to complete. 
+ let mut result_rx = { + let mut remote_deletion_state = self.remote_deletion.lock().unwrap(); + let result_rx = if let Some(result_rx) = remote_deletion_state.as_ref() { + if let Some(result) = result_rx.borrow().as_ref() { + if let Err(e) = result { + // A previous remote deletion failed: we will start a new one + tracing::error!("remote deletion failed, will retry ({e})"); + None + } else { + // A previous remote deletion call already succeeded + return Ok(()); + } + } else { + // Remote deletion is still in flight + Some(result_rx.clone()) + } + } else { + // Remote deletion was not attempted yet, start it now. + None + }; + + match result_rx { + Some(result_rx) => result_rx, + None => self.start_remote_delete(&mut remote_deletion_state), + } + }; + + // Wait for a result + let Ok(result) = result_rx.wait_for(|v| v.is_some()).await else { + // Unexpected: sender should always send a result before dropping the channel, even if it has an error + return Err(anyhow::anyhow!( + "remote deletion task future was dropped without sending a result" + )); + }; + + result + .as_ref() + .expect("We did a wait_for on this being Some above") + .as_ref() + .map(|_| ()) + .map_err(|e| anyhow::anyhow!("remote deletion failed: {e}")) + } + + /// Spawn background task to do remote deletion, return a receiver for its outcome + fn start_remote_delete( + &self, + guard: &mut std::sync::MutexGuard>, + ) -> RemoteDeletionReceiver { + tracing::info!("starting remote deletion"); + let (result_tx, result_rx) = tokio::sync::watch::channel(None); + let ttid = self.ttid; + tokio::task::spawn( + async move { + let r = wal_backup::delete_timeline(&ttid).await; + if let Err(e) = &r { + // Log error here in case nobody ever listens for our result (e.g. dropped API request) + tracing::error!("remote deletion failed: {e}"); + } + + // Ignore send results: it's legal for the Timeline to give up waiting for us. 
+ let _ = result_tx.send(Some(r)); + } + .instrument(info_span!("remote_delete", timeline = %self.ttid)), + ); + + **guard = Some(result_rx.clone()); + + result_rx + } + /// Returns if timeline is cancelled. pub fn is_cancelled(&self) -> bool { self.cancel.is_cancelled() diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 6176e64698..56f4a2faf9 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -21,9 +21,9 @@ use tokio::sync::{OnceCell, watch}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::backoff; use utils::id::{NodeId, TenantTimelineId}; use utils::lsn::Lsn; +use utils::{backoff, pausable_failpoint}; use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS, WAL_BACKUP_TASKS}; use crate::timeline::WalResidentTimeline; @@ -564,6 +564,12 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { // We don't currently have http requests timeout cancellation, but if/once // we have listing should get streaming interface to make progress. 
+ pausable_failpoint!("sk-delete-timeline-remote-pause"); + + fail::fail_point!("sk-delete-timeline-remote", |_| { + Err(anyhow::anyhow!("failpoint: sk-delete-timeline-remote")) + }); + let cancel = CancellationToken::new(); // not really used backoff::retry( || async { diff --git a/safekeeper/src/wal_reader_stream.rs b/safekeeper/src/wal_reader_stream.rs index cc9d4e6e3b..aab82fedb5 100644 --- a/safekeeper/src/wal_reader_stream.rs +++ b/safekeeper/src/wal_reader_stream.rs @@ -246,7 +246,7 @@ mod tests { .unwrap(); let resident_tli = tli.wal_residence_guard().await.unwrap(); - let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, None) + let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, c"neon-file:", None) .await .unwrap(); let end_pos = end_watch.get(); diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index b63ba154da..6b657b5ea0 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -21,6 +21,7 @@ clap.workspace = true cron.workspace = true fail.workspace = true futures.workspace = true +governor.workspace = true hex.workspace = true hyper0.workspace = true humantime.workspace = true diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 5b5ae80eaf..3e448d7013 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -1,5 +1,5 @@ use std::str::FromStr; -use std::sync::Arc; +use std::sync::{Arc, LazyLock}; use std::time::{Duration, Instant}; use anyhow::Context; @@ -33,6 +33,7 @@ use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest}; use pageserver_client::{BlockUnblock, mgmt_api}; use routerify::Middleware; use tokio_util::sync::CancellationToken; +use tracing::warn; use utils::auth::{Scope, SwappableJwtAuth}; use utils::id::{NodeId, TenantId, TimelineId}; @@ -49,6 +50,7 @@ use crate::service::{LeadershipStatus, RECONCILE_TIMEOUT, STARTUP_RECONCILE_TIME pub struct HttpState { service: Arc, auth: Option>, + rate_limiter: 
governor::DefaultKeyedRateLimiter, neon_metrics: NeonMetrics, allowlist_routes: &'static [&'static str], } @@ -59,9 +61,11 @@ impl HttpState { auth: Option>, build_info: BuildInfo, ) -> Self { + let quota = governor::Quota::per_second(service.get_config().tenant_rate_limit); Self { service, auth, + rate_limiter: governor::RateLimiter::keyed(quota), neon_metrics: NeonMetrics::new(build_info), allowlist_routes: &[ "/status", @@ -82,6 +86,40 @@ fn get_state(request: &Request) -> &HttpState { .as_ref() } +/// Rate limits tenant requests. +/// +/// TODO: this should be a request middleware, but requires us to extract the tenant ID from +/// different URLs in a systematic way. +/// +/// TODO: consider returning a 429 response if these start piling up. +async fn maybe_rate_limit(request: &Request, tenant_id: TenantId) { + // Check if the tenant should be rate-limited. + let rate_limiter = &get_state(request).rate_limiter; + if rate_limiter.check_key(&tenant_id).is_ok() { + return; + } + + // Measure the rate limiting delay. + let _timer = METRICS_REGISTRY + .metrics_group + .storage_controller_http_request_rate_limited + .start_timer(); + + // Log rate limited tenants once every 10 seconds. + static LOG_RATE_LIMITER: LazyLock> = + LazyLock::new(|| { + let quota = governor::Quota::with_period(Duration::from_secs(10)).unwrap(); + governor::RateLimiter::keyed(quota) + }); + + if LOG_RATE_LIMITER.check_key(&tenant_id).is_ok() { + warn!("tenant {tenant_id} is rate limited") + } + + // Wait for quota. 
+ rate_limiter.until_key_ready(&tenant_id).await; +} + /// Pageserver calls into this on startup, to learn which tenants it should attach async fn handle_re_attach(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::GenerationsApi)?; @@ -247,6 +285,7 @@ async fn handle_tenant_config_get( ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -264,6 +303,7 @@ async fn handle_tenant_time_travel_remote_storage( ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; let mut req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -311,6 +351,7 @@ async fn handle_tenant_secondary_download( ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; let wait = parse_query_param(&req, "wait_ms")?.map(Duration::from_millis); + maybe_rate_limit(&req, tenant_id).await; match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -329,6 +370,7 @@ async fn handle_tenant_delete( ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -356,6 +398,7 @@ async fn handle_tenant_timeline_create( ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; let mut req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -381,6 +424,7 @@ async fn handle_tenant_timeline_delete( let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; 
check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -457,6 +501,7 @@ async fn handle_tenant_timeline_archival_config( let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; let mut req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -482,6 +527,7 @@ async fn handle_tenant_timeline_detach_ancestor( let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -504,6 +550,7 @@ async fn handle_tenant_timeline_block_unblock_gc( ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; @@ -521,12 +568,14 @@ async fn handle_tenant_timeline_download_heatmap_layers( let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_shard_id.tenant_id).await; let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; let concurrency: Option = parse_query_param(&req, "concurrency")?; + let recurse = parse_query_param(&req, "recurse")?.unwrap_or(false); service - .tenant_timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency) + .tenant_timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency, recurse) .await?; json_response(StatusCode::OK, ()) @@ -547,8 +596,9 @@ async fn handle_tenant_timeline_passthrough( service: Arc, req: Request, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&req, 
"tenant_id")?; + let tenant_or_shard_id: TenantShardId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_or_shard_id.tenant_id).await; let req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -562,15 +612,28 @@ async fn handle_tenant_timeline_passthrough( return Err(ApiError::BadRequest(anyhow::anyhow!("Missing path"))); }; - tracing::info!("Proxying request for tenant {} ({})", tenant_id, path); + tracing::info!( + "Proxying request for tenant {} ({})", + tenant_or_shard_id.tenant_id, + path + ); // Find the node that holds shard zero - let (node, tenant_shard_id) = service.tenant_shard0_node(tenant_id).await?; + let (node, tenant_shard_id) = if tenant_or_shard_id.is_unsharded() { + service + .tenant_shard0_node(tenant_or_shard_id.tenant_id) + .await? + } else { + ( + service.tenant_shard_node(tenant_or_shard_id).await?, + tenant_or_shard_id, + ) + }; // Callers will always pass an unsharded tenant ID. Before proxying, we must // rewrite this to a shard-aware shard zero ID. let path = format!("{}", path); - let tenant_str = tenant_id.to_string(); + let tenant_str = tenant_or_shard_id.tenant_id.to_string(); let tenant_shard_str = format!("{}", tenant_shard_id); let path = path.replace(&tenant_str, &tenant_shard_str); @@ -610,7 +673,7 @@ async fn handle_tenant_timeline_passthrough( // Transform 404 into 503 if we raced with a migration if resp.status() == reqwest::StatusCode::NOT_FOUND { // Look up node again: if we migrated it will be different - let (new_node, _tenant_shard_id) = service.tenant_shard0_node(tenant_id).await?; + let new_node = service.tenant_shard_node(tenant_shard_id).await?; if new_node.get_id() != node.get_id() { // Rather than retry here, send the client a 503 to prompt a retry: this matches // the pageserver's use of 503, and all clients calling this API should retry on 503. 
@@ -640,6 +703,7 @@ async fn handle_tenant_locate( let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::Admin)?; + // NB: don't rate limit: admin operation. match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -655,9 +719,9 @@ async fn handle_tenant_describe( service: Arc, req: Request, ) -> Result, ApiError> { - check_permissions(&req, Scope::Scrubber)?; - let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::Scrubber)?; + // NB: don't rate limit: scrubber operation. match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -992,6 +1056,7 @@ async fn handle_tenant_shard_split( req: Request, ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + // NB: don't rate limit: admin operation. let mut req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -1014,6 +1079,7 @@ async fn handle_tenant_shard_migrate( req: Request, ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + // NB: don't rate limit: admin operation. let mut req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -1037,6 +1103,7 @@ async fn handle_tenant_shard_migrate_secondary( req: Request, ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + // NB: don't rate limit: admin operation. let mut req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -1060,6 +1127,7 @@ async fn handle_tenant_shard_cancel_reconcile( req: Request, ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + // NB: don't rate limit: admin operation. let req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -1079,6 +1147,7 @@ async fn handle_tenant_shard_cancel_reconcile( async fn handle_tenant_update_policy(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + // NB: don't rate limit: admin operation. 
let mut req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -1134,9 +1203,9 @@ async fn handle_step_down(req: Request) -> Result, ApiError } async fn handle_tenant_drop(req: Request) -> Result, ApiError> { - check_permissions(&req, Scope::PageServerApi)?; - let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; let req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -1151,9 +1220,9 @@ async fn handle_tenant_drop(req: Request) -> Result, ApiErr } async fn handle_tenant_import(req: Request) -> Result, ApiError> { - check_permissions(&req, Scope::PageServerApi)?; - let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; let req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 380ffeb9b7..6ef17c0007 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -1,3 +1,4 @@ +use std::num::NonZeroU32; use std::path::PathBuf; use std::sync::Arc; use std::time::Duration; @@ -98,6 +99,10 @@ struct Cli { #[arg(long)] priority_reconciler_concurrency: Option, + /// Tenant API rate limit, as requests per second per tenant. + #[arg(long, default_value = "10")] + tenant_rate_limit: NonZeroU32, + /// How long to wait for the initial database connection to be available. 
#[arg(long, default_value = "5s")] db_connect_timeout: humantime::Duration, @@ -339,6 +344,7 @@ async fn async_main() -> anyhow::Result<()> { priority_reconciler_concurrency: args .priority_reconciler_concurrency .unwrap_or(PRIORITY_RECONCILER_CONCURRENCY_DEFAULT), + tenant_rate_limit: args.tenant_rate_limit, split_threshold: args.split_threshold, neon_local_repo_dir: args.neon_local_repo_dir, max_secondary_lag_bytes: args.max_secondary_lag_bytes, diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index f490edb68f..ea390df726 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -76,6 +76,10 @@ pub(crate) struct StorageControllerMetricGroup { pub(crate) storage_controller_http_request_latency: measured::HistogramVec, + /// HTTP rate limiting latency across all tenants and endpoints + #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 10.0))] + pub(crate) storage_controller_http_request_rate_limited: measured::Histogram<10>, + /// Count of HTTP requests to the pageserver that resulted in an error, /// broken down by the pageserver node id, request name and method pub(crate) storage_controller_pageserver_request_error: diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index e9c54414a3..d6127c355a 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -281,13 +281,19 @@ impl PageserverClient { tenant_shard_id: TenantShardId, timeline_id: TimelineId, concurrency: Option, + recurse: bool, ) -> Result<()> { measured_request!( "download_heatmap_layers", crate::metrics::Method::Post, &self.node_id_label, self.inner - .timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency) + .timeline_download_heatmap_layers( + tenant_shard_id, + timeline_id, + concurrency, + recurse + ) .await ) } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs 
index d53b3d6598..8fc7f7a0c5 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -5,6 +5,7 @@ use std::borrow::Cow; use std::cmp::Ordering; use std::collections::{BTreeMap, HashMap, HashSet}; use std::error::Error; +use std::num::NonZeroU32; use std::ops::Deref; use std::path::PathBuf; use std::str::FromStr; @@ -365,6 +366,10 @@ pub struct Config { /// How many high-priority Reconcilers may be spawned concurrently pub priority_reconciler_concurrency: usize, + /// How many API requests per second to allow per tenant, across all + /// tenant-scoped API endpoints. Further API requests queue until ready. + pub tenant_rate_limit: NonZeroU32, + /// How large must a shard grow in bytes before we split it? /// None disables auto-splitting. pub split_threshold: Option, @@ -3781,6 +3786,7 @@ impl Service { tenant_shard_id: TenantShardId, timeline_id: TimelineId, concurrency: Option, + recurse: bool, ) -> Result<(), ApiError> { let _tenant_lock = trace_shared_lock( &self.tenant_op_locks, @@ -3818,7 +3824,12 @@ impl Service { targets, |tenant_shard_id, client| async move { client - .timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency) + .timeline_download_heatmap_layers( + tenant_shard_id, + timeline_id, + concurrency, + recurse, + ) .await }, 1, @@ -4165,16 +4176,14 @@ impl Service { }).await? } - /// When you need to send an HTTP request to the pageserver that holds shard0 of a tenant, this - /// function looks up and returns node. If the tenant isn't found, returns Err(ApiError::NotFound) + /// When you know the TenantId but not a specific shard, and would like to get the node holding shard 0. pub(crate) async fn tenant_shard0_node( &self, tenant_id: TenantId, ) -> Result<(Node, TenantShardId), ApiError> { - // Look up in-memory state and maybe use the node from there. 
- { + let tenant_shard_id = { let locked = self.inner.read().unwrap(); - let Some((tenant_shard_id, shard)) = locked + let Some((tenant_shard_id, _shard)) = locked .tenants .range(TenantShardId::tenant_range(tenant_id)) .next() @@ -4184,6 +4193,29 @@ impl Service { )); }; + *tenant_shard_id + }; + + self.tenant_shard_node(tenant_shard_id) + .await + .map(|node| (node, tenant_shard_id)) + } + + /// When you need to send an HTTP request to the pageserver that holds a shard of a tenant, this + /// function looks up and returns node. If the shard isn't found, returns Err(ApiError::NotFound) + pub(crate) async fn tenant_shard_node( + &self, + tenant_shard_id: TenantShardId, + ) -> Result { + // Look up in-memory state and maybe use the node from there. + { + let locked = self.inner.read().unwrap(); + let Some(shard) = locked.tenants.get(&tenant_shard_id) else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant shard {tenant_shard_id} not found").into(), + )); + }; + let Some(intent_node_id) = shard.intent.get_attached() else { tracing::warn!( tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), @@ -4204,7 +4236,7 @@ impl Service { "Shard refers to nonexistent node" ))); }; - return Ok((node.clone(), *tenant_shard_id)); + return Ok(node.clone()); } }; @@ -4212,29 +4244,34 @@ impl Service { // generation state: this will reflect the progress of any ongoing migration. // Note that it is not guaranteed to _stay_ here, our caller must still handle // the case where they call through to the pageserver and get a 404. 
- let db_result = self.persistence.tenant_generations(tenant_id).await?; + let db_result = self + .persistence + .tenant_generations(tenant_shard_id.tenant_id) + .await?; let Some(ShardGenerationState { - tenant_shard_id, + tenant_shard_id: _, generation: _, generation_pageserver: Some(node_id), - }) = db_result.first() + }) = db_result + .into_iter() + .find(|s| s.tenant_shard_id == tenant_shard_id) else { // This can happen if we raced with a tenant deletion or a shard split. On a retry // the caller will either succeed (shard split case), get a proper 404 (deletion case), // or a conflict response (case where tenant was detached in background) return Err(ApiError::ResourceUnavailable( - "Shard {} not found in database, or is not attached".into(), + format!("Shard {tenant_shard_id} not found in database, or is not attached").into(), )); }; let locked = self.inner.read().unwrap(); - let Some(node) = locked.nodes.get(node_id) else { + let Some(node) = locked.nodes.get(&node_id) else { // This should never happen return Err(ApiError::InternalServerError(anyhow::anyhow!( "Shard refers to nonexistent node" ))); }; - Ok((node.clone(), *tenant_shard_id)) + Ok(node.clone()) } pub(crate) fn tenant_locate( diff --git a/storage_controller/src/service/chaos_injector.rs b/storage_controller/src/service/chaos_injector.rs index 2ff68d7037..a0419e0205 100644 --- a/storage_controller/src/service/chaos_injector.rs +++ b/storage_controller/src/service/chaos_injector.rs @@ -46,48 +46,51 @@ impl ChaosInjector { } } + fn get_cron_interval_sleep_future(&self) -> Option { + if let Some(ref chaos_exit_crontab) = self.chaos_exit_crontab { + match cron_to_next_duration(chaos_exit_crontab) { + Ok(interval_exit) => Some(interval_exit), + Err(e) => { + tracing::error!("Error processing the cron schedule: {e}"); + None + } + } + } else { + None + } + } + pub async fn run(&mut self, cancel: CancellationToken) { let mut interval = tokio::time::interval(self.interval); - let cron_interval = { - 
if let Some(ref chaos_exit_crontab) = self.chaos_exit_crontab { - match cron_to_next_duration(chaos_exit_crontab) { - Ok(interval_exit) => Some(interval_exit), - Err(e) => { - tracing::error!("Error processing the cron schedule: {e}"); - None - } - } - } else { - None - } - }; + #[derive(Debug)] enum ChaosEvent { ShuffleTenant, ForceKill, } - let chaos_type = tokio::select! { - _ = interval.tick() => { - ChaosEvent::ShuffleTenant - } - Some(_) = maybe_sleep(cron_interval) => { - ChaosEvent::ForceKill - } - _ = cancel.cancelled() => { - tracing::info!("Shutting down"); - return; - } - }; - - match chaos_type { - ChaosEvent::ShuffleTenant => { - self.inject_chaos().await; - } - ChaosEvent::ForceKill => { - self.force_kill().await; + loop { + let cron_interval = self.get_cron_interval_sleep_future(); + let chaos_type = tokio::select! { + _ = interval.tick() => { + ChaosEvent::ShuffleTenant + } + Some(_) = maybe_sleep(cron_interval) => { + ChaosEvent::ForceKill + } + _ = cancel.cancelled() => { + tracing::info!("Shutting down"); + return; + } + }; + tracing::info!("Chaos iteration: {chaos_type:?}..."); + match chaos_type { + ChaosEvent::ShuffleTenant => { + self.inject_chaos().await; + } + ChaosEvent::ForceKill => { + self.force_kill().await; + } } } - - tracing::info!("Chaos iteration..."); } /// If a shard has a secondary and attached location, then re-assign the secondary to be diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index 97a5a36814..6e53987e7c 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -525,12 +525,14 @@ class NeonLocalCli(AbstractNeonCli): def endpoint_start( self, endpoint_id: str, + safekeepers_generation: int | None = None, safekeepers: list[int] | None = None, remote_ext_config: str | None = None, pageserver_id: int | None = None, allow_multiple: bool = False, create_test_user: bool = False, basebackup_request_tries: int | None = None, + timeout: str | None = None, env: 
dict[str, str] | None = None, ) -> subprocess.CompletedProcess[str]: args = [ @@ -543,6 +545,8 @@ class NeonLocalCli(AbstractNeonCli): if remote_ext_config is not None: args.extend(["--remote-ext-config", remote_ext_config]) + if safekeepers_generation is not None: + args.extend(["--safekeepers-generation", str(safekeepers_generation)]) if safekeepers is not None: args.extend(["--safekeepers", (",".join(map(str, safekeepers)))]) if endpoint_id is not None: @@ -553,6 +557,8 @@ class NeonLocalCli(AbstractNeonCli): args.extend(["--allow-multiple"]) if create_test_user: args.extend(["--create-test-user"]) + if timeout is not None: + args.extend(["--start-timeout", str(timeout)]) res = self.raw_cli(args, extra_env_vars) res.check_returncode() diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index cd197d8e77..0065a8a3fa 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -253,10 +253,15 @@ class PgProtocol: # enough for our tests, but if you need a longer, you can # change it by calling "SET statement_timeout" after # connecting. 
+ # pooler does not support statement_timeout + # Check if the hostname contains the string 'pooler' + hostname = result.get("host", "") + log.info(f"Hostname: {hostname}") options = result.get("options", "") - if "statement_timeout" not in options: + if "statement_timeout" not in options and "pooler" not in hostname: options = f"-cstatement_timeout=120s {options}" result["options"] = options + return result # autocommit=True here by default because that's what we need most of the time @@ -1176,15 +1181,6 @@ class NeonEnv: "max_batch_size": 32, } - if config.test_may_use_compatibility_snapshot_binaries: - log.info( - "Skipping prev heatmap settings to avoid forward-compatibility related test failures" - ) - else: - # Look for gaps in WAL received from safekeepeers - ps_cfg["load_previous_heatmap"] = True - ps_cfg["generate_unarchival_heatmap"] = True - get_vectored_concurrent_io = self.pageserver_get_vectored_concurrent_io if get_vectored_concurrent_io is not None: ps_cfg["get_vectored_concurrent_io"] = { @@ -1199,6 +1195,9 @@ class NeonEnv: config.pageserver_default_tenant_config_compaction_algorithm ) + tenant_config = ps_cfg.setdefault("tenant_config", {}) + tenant_config["rel_size_v2_enabled"] = True # Enable relsize_v2 by default in tests + if self.pageserver_remote_storage is not None: ps_cfg["remote_storage"] = remote_storage_to_toml_dict( self.pageserver_remote_storage @@ -2479,12 +2478,21 @@ class NeonStorageController(MetricsGetter, LogUtils): response.raise_for_status() return [TenantShardId.parse(tid) for tid in response.json()["updated"]] - def download_heatmap_layers(self, tenant_shard_id: TenantShardId, timeline_id: TimelineId): + def download_heatmap_layers( + self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, recurse: bool | None = None + ): + url = ( + f"{self.api}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers" + ) + if recurse is not None: + url = url + f"?recurse={str(recurse).lower()}" + response = 
self.request( "POST", - f"{self.api}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers", + url, headers=self.headers(TokenScope.ADMIN), ) + response.raise_for_status() def __enter__(self) -> Self: @@ -3602,6 +3610,7 @@ class NeonProxy(PgProtocol): "project_id": "test_project_id", "endpoint_id": "test_endpoint_id", "branch_id": "test_branch_id", + "compute_id": "test_compute_id", }, } }, @@ -3827,6 +3836,7 @@ def static_auth_broker( { "address": local_proxy_addr, "aux": { + "compute_id": "compute-foo-bar-1234-5678", "endpoint_id": "ep-foo-bar-1234", "branch_id": "br-foo-bar", "project_id": "foo-bar", @@ -3997,10 +4007,12 @@ class Endpoint(PgProtocol, LogUtils): self, remote_ext_config: str | None = None, pageserver_id: int | None = None, + safekeeper_generation: int | None = None, safekeepers: list[int] | None = None, allow_multiple: bool = False, create_test_user: bool = False, basebackup_request_tries: int | None = None, + timeout: str | None = None, env: dict[str, str] | None = None, ) -> Self: """ @@ -4010,19 +4022,21 @@ class Endpoint(PgProtocol, LogUtils): assert self.endpoint_id is not None - # If `safekeepers` is not None, they are remember them as active and use - # in the following commands. + # If `safekeepers` is not None, remember them as active and use in the + # following commands. 
if safekeepers is not None: self.active_safekeepers = safekeepers self.env.neon_cli.endpoint_start( self.endpoint_id, + safekeepers_generation=safekeeper_generation, safekeepers=self.active_safekeepers, remote_ext_config=remote_ext_config, pageserver_id=pageserver_id, allow_multiple=allow_multiple, create_test_user=create_test_user, basebackup_request_tries=basebackup_request_tries, + timeout=timeout, env=env, ) self._running.release(1) diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 4fce558840..abddfa2768 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -124,6 +124,8 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [ # controller's attempts to notify the endpoint). ".*reconciler.*neon_local notification hook failed.*", ".*reconciler.*neon_local error.*", + # Tenant rate limits may fire in tests that submit lots of API requests. + ".*tenant \\S+ is rate limited.*", ] diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 364aff325d..0efe0b9575 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -375,6 +375,19 @@ class PageserverHttpClient(requests.Session, MetricsGetter): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/reset", params=params) self.verbose_error(res) + def timeline_patch_index_part( + self, + tenant_id: TenantId | TenantShardId, + timeline_id: TimelineId, + data: dict[str, Any], + ): + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/patch_index_part", + json=data, + ) + self.verbose_error(res) + return res.json() + def tenant_location_conf( self, tenant_id: TenantId | TenantShardId, diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index 4df2b2df2b..cac84c07e7 100644 --- 
a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -282,6 +282,17 @@ class S3Storage: def timeline_path(self, tenant_id: TenantShardId | TenantId, timeline_id: TimelineId) -> str: return f"{self.tenant_path(tenant_id)}/timelines/{timeline_id}" + def safekeeper_tenants_path(self) -> str: + return f"{self.prefix_in_bucket}" + + def safekeeper_tenant_path(self, tenant_id: TenantShardId | TenantId) -> str: + return f"{self.safekeeper_tenants_path()}/{tenant_id}" + + def safekeeper_timeline_path( + self, tenant_id: TenantShardId | TenantId, timeline_id: TimelineId + ) -> str: + return f"{self.safekeeper_tenant_path(tenant_id)}/{timeline_id}" + def get_latest_generation_key(self, prefix: str, suffix: str, keys: list[str]) -> str: """ Gets the latest generation key from a list of keys. diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index 7038d87aba..e409151b76 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -229,13 +229,14 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): # only_local doesn't remove segments in the remote storage. 
def timeline_delete( - self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False + self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False, **kwargs ) -> dict[Any, Any]: res = self.delete( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", params={ "only_local": str(only_local).lower(), }, + **kwargs, ) res.raise_for_status() res_json = res.json() diff --git a/test_runner/fixtures/safekeeper_utils.py b/test_runner/fixtures/safekeeper_utils.py new file mode 100644 index 0000000000..158baf7bb6 --- /dev/null +++ b/test_runner/fixtures/safekeeper_utils.py @@ -0,0 +1,92 @@ +from fixtures.common_types import Lsn, TenantId, TimelineId +from fixtures.log_helper import log +from fixtures.neon_fixtures import Endpoint, NeonPageserver, Safekeeper +from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload +from fixtures.utils import get_dir_size + + +def is_segment_offloaded( + sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, seg_end: Lsn +): + http_cli = sk.http_client() + tli_status = http_cli.timeline_status(tenant_id, timeline_id) + log.info(f"sk status is {tli_status}") + return tli_status.backup_lsn >= seg_end + + +def is_flush_lsn_caught_up(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn): + http_cli = sk.http_client() + tli_status = http_cli.timeline_status(tenant_id, timeline_id) + log.info(f"sk status is {tli_status}") + return tli_status.flush_lsn >= lsn + + +def is_wal_trimmed(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, target_size_mb): + http_cli = sk.http_client() + tli_status = http_cli.timeline_status(tenant_id, timeline_id) + sk_wal_size = get_dir_size(sk.timeline_dir(tenant_id, timeline_id)) + sk_wal_size_mb = sk_wal_size / 1024 / 1024 + log.info(f"Safekeeper id={sk.id} wal_size={sk_wal_size_mb:.2f}MB status={tli_status}") + return sk_wal_size_mb <= target_size_mb + + +def wait_lsn_force_checkpoint( + tenant_id: 
TenantId, + timeline_id: TimelineId, + endpoint: Endpoint, + ps: NeonPageserver, + pageserver_conn_options=None, +): + pageserver_conn_options = pageserver_conn_options or {} + lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + log.info(f"pg_current_wal_flush_lsn is {lsn}, waiting for it on pageserver") + + wait_lsn_force_checkpoint_at(lsn, tenant_id, timeline_id, ps, pageserver_conn_options) + + +def wait_lsn_force_checkpoint_at_sk( + safekeeper: Safekeeper, + tenant_id: TenantId, + timeline_id: TimelineId, + ps: NeonPageserver, + pageserver_conn_options=None, +): + sk_flush_lsn = safekeeper.get_flush_lsn(tenant_id, timeline_id) + wait_lsn_force_checkpoint_at(sk_flush_lsn, tenant_id, timeline_id, ps, pageserver_conn_options) + + +def wait_lsn_force_checkpoint_at( + lsn: Lsn, + tenant_id: TenantId, + timeline_id: TimelineId, + ps: NeonPageserver, + pageserver_conn_options=None, +): + """ + Wait until pageserver receives given lsn, force checkpoint and wait for + upload, i.e. remote_consistent_lsn advancement. 
+ """ + pageserver_conn_options = pageserver_conn_options or {} + + auth_token = None + if "password" in pageserver_conn_options: + auth_token = pageserver_conn_options["password"] + + # wait for the pageserver to catch up + wait_for_last_record_lsn( + ps.http_client(auth_token=auth_token), + tenant_id, + timeline_id, + lsn, + ) + + # force checkpoint to advance remote_consistent_lsn + ps.http_client(auth_token).timeline_checkpoint(tenant_id, timeline_id) + + # ensure that remote_consistent_lsn is advanced + wait_for_upload( + ps.http_client(auth_token=auth_token), + tenant_id, + timeline_id, + lsn, + ) diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 84d62fb877..d1b2a5a400 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -337,6 +337,8 @@ def allure_add_grafana_link(host: str, timeline_id: TimelineId, start_ms: int, e """ # We expect host to be in format like ep-holy-mouse-w2u462gi.us-east-2.aws.neon.build endpoint_id, region_id, _ = host.split(".", 2) + # Remove "-pooler" suffix if present + endpoint_id = endpoint_id.removesuffix("-pooler") params = { "orgId": 1, diff --git a/test_runner/performance/large_synthetic_oltp/insert_webhooks.sql b/test_runner/performance/large_synthetic_oltp/insert_webhooks.sql new file mode 100644 index 0000000000..69e6366a53 --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/insert_webhooks.sql @@ -0,0 +1,47 @@ +\set event_type random(1,10) +\set service_key random(1, 3) + +INSERT INTO webhook.incoming_webhooks ( + created_at, + delivery_id, + upstream_emitted_at, + service_key, + event_id, + source, + body, + json, + additional_data, + is_body_encrypted, + event_type +) VALUES ( + now(), + gen_random_uuid(), + now() - interval '10 minutes', + CASE :service_key::int + WHEN 1 THEN 'shopify' + WHEN 2 THEN 'stripe' + WHEN 3 THEN 'github' + END, + 'evt_' || gen_random_uuid(), -- Ensures uniqueness + CASE :service_key::int + WHEN 1 THEN 'Shopify' + WHEN 2 THEN 
'Stripe' + WHEN 3 THEN 'GitHub' + END, + '{"order_id": 987654, "customer": {"name": "John Doe", "email": "john.doe@example.com"}, "items": [{"product_id": 12345, "quantity": 2}, {"product_id": 67890, "quantity": 1}], "total": 199.99}', + '{"order_id": 987654, "customer": {"name": "John Doe", "email": "john.doe@example.com"}, "items": [{"product_id": 12345, "quantity": 2}, {"product_id": 67890, "quantity": 1}], "total": 199.99}'::jsonb, + '{"metadata": {"user_agent": "Mozilla/5.0", "ip_address": "203.0.113.42"}}'::jsonb, + false, + CASE :event_type::int + WHEN 1 THEN 'ORDER_PLACED' + WHEN 2 THEN 'ORDER_CANCELLED' + WHEN 3 THEN 'PAYMENT_SUCCESSFUL' + WHEN 4 THEN 'PAYMENT_FAILED' + WHEN 5 THEN 'CUSTOMER_CREATED' + WHEN 6 THEN 'CUSTOMER_UPDATED' + WHEN 7 THEN 'PRODUCT_UPDATED' + WHEN 8 THEN 'INVENTORY_LOW' + WHEN 9 THEN 'SHIPPING_DISPATCHED' + WHEN 10 THEN 'REFUND_ISSUED' + END +); \ No newline at end of file diff --git a/test_runner/performance/large_synthetic_oltp/select_any_webhook_with_skew.sql b/test_runner/performance/large_synthetic_oltp/select_any_webhook_with_skew.sql new file mode 100644 index 0000000000..b2f173f011 --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/select_any_webhook_with_skew.sql @@ -0,0 +1,15 @@ +-- Zipfian distributions model real-world access patterns where: +-- A few values (popular IDs) are accessed frequently. +-- Many values are accessed rarely. +-- This is useful for simulating realistic workloads, like webhook processing where recent events are more frequently accessed. 
+ +\set alpha 1.2 +\set min_id 1 +\set max_id 135000000 + +\set zipf_random_id random_zipfian(:min_id, :max_id, :alpha) + +SELECT * +FROM webhook.incoming_webhooks +WHERE id = (:zipf_random_id)::bigint +LIMIT 1; \ No newline at end of file diff --git a/test_runner/performance/large_synthetic_oltp/select_recent_webhook.sql b/test_runner/performance/large_synthetic_oltp/select_recent_webhook.sql new file mode 100644 index 0000000000..78a843bf0f --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/select_recent_webhook.sql @@ -0,0 +1,9 @@ +-- select one of the most recent webhook records (created in the branch timeline during the bench run) +SELECT * +FROM webhook.incoming_webhooks +WHERE id = ( + SELECT (floor(random() * ( + (SELECT last_value FROM webhook.incoming_webhooks_id_seq) - 1350000001 + 1 + ) + 1350000001))::bigint +) +LIMIT 1; \ No newline at end of file diff --git a/test_runner/performance/test_perf_many_relations.py b/test_runner/performance/test_perf_many_relations.py index 2570c55f6c..e2f0a79018 100644 --- a/test_runner/performance/test_perf_many_relations.py +++ b/test_runner/performance/test_perf_many_relations.py @@ -83,6 +83,13 @@ def test_perf_simple_many_relations_reldir_v2( ], ) + assert ( + env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[ + "rel_size_migration" + ] + != "legacy" + ) + n = 100000 step = 5000 # Create many relations diff --git a/test_runner/performance/test_perf_oltp_large_tenant.py b/test_runner/performance/test_perf_oltp_large_tenant.py new file mode 100644 index 0000000000..ae00dbb3b5 --- /dev/null +++ b/test_runner/performance/test_perf_oltp_large_tenant.py @@ -0,0 +1,90 @@ +from __future__ import annotations + +import os +import timeit +from pathlib import Path + +import pytest +from fixtures.benchmark_fixture import PgBenchRunResult +from fixtures.compare_fixtures import PgCompare + +from performance.test_perf_pgbench import get_durations_matrix, utc_now_timestamp + + +def 
get_custom_scripts( + default: str = "insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4", +) -> list[str]: + # We parametrize each run with the custom scripts to run and their weights. + # The custom scripts and their weights are passed through TEST_PGBENCH_CUSTOM_SCRIPTS env variable. + # Delimit the custom scripts for one run by spaces and for different runs by commas, for example: + # "insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4,insert_webhooks.sql@8 select_any_webhook_with_skew.sql@2" + # Databases/branches are pre-created and passed through BENCHMARK_CONNSTR env variable. + scripts = os.getenv("TEST_PGBENCH_CUSTOM_SCRIPTS", default=str(default)) + rv = [] + for s in scripts.split(","): + rv.append(s) + return rv + + +def run_test_pgbench(env: PgCompare, custom_scripts: str, duration: int): + password = env.pg.default_options.get("password", None) + options = env.pg.default_options.get("options", "") + # drop password from the connection string by passing password=None and set password separately + connstr = env.pg.connstr(password=None, options=options) + # if connstr does not contain pooler we can set statement_timeout to 0 + if "pooler" not in connstr: + options = "-cstatement_timeout=0 " + env.pg.default_options.get("options", "") + connstr = env.pg.connstr(password=None, options=options) + + script_args = [ + "pgbench", + "-n", # no explicit vacuum before the test - we want to rely on auto-vacuum + "-M", + "prepared", + "--client=500", + "--jobs=100", + f"-T{duration}", + "-P60", # progress every minute + "--progress-timestamp", + ] + for script in custom_scripts.split(): + script_args.extend(["-f", f"test_runner/performance/large_synthetic_oltp/{script}"]) + script_args.append(connstr) + + run_pgbench( + env, + "custom-scripts", + script_args, + password=password, + ) + + +def run_pgbench(env: PgCompare, prefix: str, cmdline, password: None): + environ: dict[str, str] = {} + if password is not None: + 
environ["PGPASSWORD"] = password + + run_start_timestamp = utc_now_timestamp() + t0 = timeit.default_timer() + out = env.pg_bin.run_capture(cmdline, env=environ) + run_duration = timeit.default_timer() - t0 + run_end_timestamp = utc_now_timestamp() + env.flush() + + stdout = Path(f"{out}.stdout").read_text() + + res = PgBenchRunResult.parse_from_stdout( + stdout=stdout, + run_duration=run_duration, + run_start_timestamp=run_start_timestamp, + run_end_timestamp=run_end_timestamp, + ) + env.zenbenchmark.record_pg_bench_result(prefix, res) + + +@pytest.mark.parametrize("custom_scripts", get_custom_scripts()) +@pytest.mark.parametrize("duration", get_durations_matrix()) +@pytest.mark.remote_cluster +def test_perf_oltp_large_tenant(remote_compare: PgCompare, custom_scripts: str, duration: int): + run_test_pgbench(remote_compare, custom_scripts, duration) + # todo: run re-index, analyze, vacuum, etc. after the test and measure and report its duration diff --git a/test_runner/regress/data/test_signed_char.out b/test_runner/regress/data/test_signed_char.out new file mode 100644 index 0000000000..a68876e383 --- /dev/null +++ b/test_runner/regress/data/test_signed_char.out @@ -0,0 +1 @@ 
+0000000094010815f81f042000000000b89f8000909f5000689f5000489f4000309f3000189f3000009f3000e89e3000d09e3000b89e3000a09e3000889e3000709e3000309e8000189e3000009e3000e89d3000d09d3000b89d3000a09d3000889d3000709d3000589d3000409d3000289d3000109d3000f89c3000e09c3000c89c3000b09c3000989c3000809c3000689c3000509c3000389c3000209c3000089c3000f09b3000d89b3000c09b3000a89b3000909b3000789b3000609b3000489b3000309b3000189b3000009b3000e89a3000d09a3000b89a3000a09a3000889a3000489a8000309a3000189a3000009a3000e8993000d0993000b8993000a09930008899300070993000589930004099300000998000e8983000d0983000b8983000a0983000889830007098300058983000409830002898300010983000f8973000b8978000a09730008897300070973000589730004097300028973000e8968000a89680006896800028968000e8958000a895800090953000509580003895300020953000089530000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000801000010018004c198900000000000000000029000000008010000100180049787f000000000000000000290000000080100001001800727c700000000000000000002900000000801000280040002076620000000000000000000100270001010101010101010101010101010101010101010101010101010101010101010101010101010100008010000100180020726200000000000000000029000000008010002800400076623900000000000000000001002700010101010101010101010101010101010101010101010101010101010101010101010101010101000080100028004000623938000000000000000000010027000101010101010101010101010101010101010101010101010101010101010101010101010101010000801000280040003938370000000000000000000100270001010101010101010101010101010101010101010101010101010101010101010101010101010100008010002800400038373600000000000000000001002700010101010101010101010101010101010101010101010101010101010101010101010101010101000080100028004000373635000000000000000000010027000101010101010101010101010101010101010101010101010101010101010101010101010101010000801000280040003635340000000000000000000100270001010101010101010101010101010101010101010101010101010101010101010101010101010100008010000100180020303400000000000000000028000000008010000100180020393300000000000000000027000000008010000100180020383300000000000000000026000000008010000100180
0203733000000000000000000250000000080100001001800203633000000000000000000240000000080100001001800203533000000000000000000230000000080100028004000353433000000000000000000010027000101010101010101010101010101010101010101010101010101010101010101010101010101010000801000010018002034330000000000000000002200000000801000010018002033330000000000000000002100000000801000010018002032330000000000000000002000000000801000010018002031330000000000000000001f00000000801000010018002030330000000000000000001e00000000801000010018002039320000000000000000001d00000000801000010018002038320000000000000000001c00000000801000010018002037320000000000000000001b00000000801000010018002036320000000000000000001a0000000080100001001800203532000000000000000000190000000080100001001800203432000000000000000000180000000080100028004000343332000000000000000000010027000101010101010101010101010101010101010101010101010101010101010101010101010101010000801000010018002033320000000000000000001700000000801000010018002032320000000000000000001600000000801000010018002031320000000000000000001500000000801000010018002030320000000000000000001400000000801000010018002039310000000000000000001300000000801000010018002038310000000000000000001200000000801000010018002037310000000000000000001100000000801000010018002036310000000000000000001000000000801000010018002035310000000000000000000f00000000801000010018002034310000000000000000000e00000000801000010018002033310000000000000000000d0000000080100028004000333231000000000000000000010027000101010101010101010101010101010101010101010101010101010101010101010101010101010000801000010018002032310000000000000000000c00000000801000010018002031310000000000000000000b00000000801000010018002030310000000000000000000a00000000801000010018002039200000000000000000000900000000801000010018002038200000000000000000000800000000801000010018002037200000000000000000000700000000801000010018002036200000000000000000000600000000801000010018002035200000000000000000000500000000801000010018003034200000000000000000002800000
000801000010018002034200000000000000000000400000000801000010018003933200000000000000000002700000000801000010018003833200000000000000000002600000000801000010018003733200000000000000000002500000000801000010018003633200000000000000000002400000000801000010018003533200000000000000000002300000000801000010018003433200000000000000000002200000000801000010018003333200000000000000000002100000000801000010018003233200000000000000000002000000000801000010018003133200000000000000000001f00000000801000010018003033200000000000000000001e00000000801000010018002033200000000000000000000300000000801000010018003932200000000000000000001d00000000801000010018003832200000000000000000001c00000000801000010018003732200000000000000000001b00000000801000010018003632200000000000000000001a00000000801000010018003532200000000000000000001900000000801000010018003432200000000000000000001800000000801000010018003332200000000000000000001700000000801000010018003232200000000000000000001600000000801000010018003132200000000000000000001500000000801000010018003032200000000000000000001400000000801000010018002032200000000000000000000200000000801000010018003931200000000000000000001300000000801000010018003831200000000000000000001200000000801000010018003731200000000000000000001100000000801000010018003631200000000000000000001000000000801000010018003531200000000000000000000f00000000801000010018003431200000000000000000000e00000000801000010018003331200000000000000000000d0000000080100028004000323120000000000000000000010027000101010101010101010101010101010101010101010101010101010101010101010101010101010000801000010018003131200000000000000000000b00000000801000010018003031200000000000000000000a00000000801000010018002031200000000000000000000100000000801000010018006220200000000000000000002900000000801000010018003920200000000000000000000900000000801000010018003820200000000000000000000800000000801000010018003720200000000000000000000700000000801000010018003620200000000000000000000600000000801000010018003520200000000000000000000500000
00080100002002000342020000000000000000000040001002400000000000000008010000b00280033202000000000000000000003000a001b010101010101010101000000000000008010000b00280032202000000000000000000002000a001201010101010101010100000000000000801000280040003120200000000000000000000100270001010101010101010101010101010101010101010101010101010101010101010101010101010100ffffffff00000200 \ No newline at end of file diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py index 3a08671bbf..ce655d22b5 100644 --- a/test_runner/regress/test_compute_catalog.py +++ b/test_runner/regress/test_compute_catalog.py @@ -5,34 +5,59 @@ import logging import requests from fixtures.neon_fixtures import NeonEnv, logical_replication_sync +TEST_ROLE_NAMES = [ + {"name": "neondb_owner"}, + {"name": "role with spaces"}, + {"name": "role with%20spaces "}, + {"name": "role with whitespaces "}, + {"name": "injective role with spaces'; SELECT pg_sleep(1000);"}, + {"name": "role with #pound-sign and &ersands=true"}, + {"name": "role with emoji 🌍"}, + {"name": "role \";with ';injections $$ $x$ $ %I !/\\&#@"}, + {"name": '"role in double quotes"'}, + {"name": "'role in single quotes'"}, +] + TEST_DB_NAMES = [ { "name": "neondb", - "owner": "cloud_admin", + "owner": "neondb_owner", }, { "name": "db with spaces", - "owner": "cloud_admin", + "owner": "role with spaces", }, { "name": "db with%20spaces ", - "owner": "cloud_admin", + "owner": "role with%20spaces ", }, { "name": "db with whitespaces ", - "owner": "cloud_admin", + "owner": "role with whitespaces ", }, { - "name": "injective db with spaces'; SELECT pg_sleep(10);", - "owner": "cloud_admin", + "name": "injective db with spaces'; SELECT pg_sleep(1000);", + "owner": "injective role with spaces'; SELECT pg_sleep(1000);", }, { "name": "db with #pound-sign and &ersands=true", - "owner": "cloud_admin", + "owner": "role with #pound-sign and &ersands=true", }, { "name": "db with emoji 🌍", - "owner": "cloud_admin", + 
"owner": "role with emoji 🌍", + }, + { + "name": "db \";with ';injections $$ $x$ $ %I !/\\&#@", + "owner": "role \";with ';injections $$ $x$ $ %I !/\\&#@", + }, + { + "name": '"db in double quotes"', + "owner": '"role in double quotes"', + }, + { + "name": "'db in single quotes'", + "owner": "'role in single quotes'", }, ] @@ -52,6 +77,7 @@ def test_compute_catalog(neon_simple_env: NeonEnv): **{ "skip_pg_catalog_updates": False, "cluster": { + "roles": TEST_ROLE_NAMES, "databases": TEST_DB_NAMES, }, } @@ -99,10 +125,10 @@ def test_compute_catalog(neon_simple_env: NeonEnv): ), f"Expected 404 status code, but got {e.response.status_code}" -def test_compute_create_databases(neon_simple_env: NeonEnv): +def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv): """ - Test that compute_ctl can create and work with databases with special - characters (whitespaces, %, tabs, etc.) in the name. + Test that compute_ctl can create and work with databases and roles + with special characters (whitespaces, %, tabs, etc.) in the name. 
""" env = neon_simple_env @@ -116,6 +142,7 @@ def test_compute_create_databases(neon_simple_env: NeonEnv): **{ "skip_pg_catalog_updates": False, "cluster": { + "roles": TEST_ROLE_NAMES, "databases": TEST_DB_NAMES, }, } @@ -139,6 +166,43 @@ def test_compute_create_databases(neon_simple_env: NeonEnv): assert len(curr_db) == 1 assert curr_db[0] == db["name"] + for role in TEST_ROLE_NAMES: + with endpoint.cursor() as cursor: + cursor.execute("SELECT rolname FROM pg_roles WHERE rolname = %s", (role["name"],)) + catalog_role = cursor.fetchone() + assert catalog_role is not None + assert catalog_role[0] == role["name"] + + delta_operations = [] + for db in TEST_DB_NAMES: + delta_operations.append({"action": "delete_db", "name": db["name"]}) + for role in TEST_ROLE_NAMES: + delta_operations.append({"action": "delete_role", "name": role["name"]}) + + endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": False, + "cluster": { + "roles": [], + "databases": [], + }, + "delta_operations": delta_operations, + } + ) + endpoint.reconfigure() + + for db in TEST_DB_NAMES: + with endpoint.cursor() as cursor: + cursor.execute("SELECT datname FROM pg_database WHERE datname = %s", (db["name"],)) + catalog_db = cursor.fetchone() + assert catalog_db is None + + for role in TEST_ROLE_NAMES: + with endpoint.cursor() as cursor: + cursor.execute("SELECT rolname FROM pg_roles WHERE rolname = %s", (role["name"],)) + catalog_role = cursor.fetchone() + assert catalog_role is None + def test_dropdb_with_subscription(neon_simple_env: NeonEnv): """ @@ -150,17 +214,19 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv): # stuff into the spec.json file. 
endpoint = env.endpoints.create_start("main") + SUB_DB_NAME = "';subscriber_db $$ $x$ $;" + PUB_DB_NAME = "publisher_db" TEST_DB_NAMES = [ { "name": "neondb", "owner": "cloud_admin", }, { - "name": "subscriber_db", + "name": SUB_DB_NAME, "owner": "cloud_admin", }, { - "name": "publisher_db", + "name": PUB_DB_NAME, "owner": "cloud_admin", }, ] @@ -177,47 +243,47 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv): ) endpoint.reconfigure() - # connect to the publisher_db and create a publication - with endpoint.cursor(dbname="publisher_db") as cursor: + # Connect to the PUB_DB_NAME and create a publication + with endpoint.cursor(dbname=PUB_DB_NAME) as cursor: cursor.execute("CREATE PUBLICATION mypub FOR ALL TABLES") cursor.execute("select pg_catalog.pg_create_logical_replication_slot('mysub', 'pgoutput');") cursor.execute("CREATE TABLE t(a int)") cursor.execute("INSERT INTO t VALUES (1)") cursor.execute("CHECKPOINT") - # connect to the subscriber_db and create a subscription - # Note that we need to create subscription with - connstr = endpoint.connstr(dbname="publisher_db").replace("'", "''") - with endpoint.cursor(dbname="subscriber_db") as cursor: + # Connect to the SUB_DB_NAME and create a subscription + # Note that we need to create subscription with the following connstr: + connstr = endpoint.connstr(dbname=PUB_DB_NAME).replace("'", "''") + with endpoint.cursor(dbname=SUB_DB_NAME) as cursor: cursor.execute("CREATE TABLE t(a int)") cursor.execute( - f"CREATE SUBSCRIPTION mysub CONNECTION '{connstr}' PUBLICATION mypub WITH (create_slot = false) " + f"CREATE SUBSCRIPTION mysub CONNECTION '{connstr}' PUBLICATION mypub WITH (create_slot = false) " ) - # wait for the subscription to be active + # Wait for the subscription to be active logical_replication_sync( endpoint, endpoint, "mysub", - sub_dbname="subscriber_db", - pub_dbname="publisher_db", + sub_dbname=SUB_DB_NAME, + pub_dbname=PUB_DB_NAME, ) # Check that replication is working - with 
endpoint.cursor(dbname="subscriber_db") as cursor: + with endpoint.cursor(dbname=SUB_DB_NAME) as cursor: cursor.execute("SELECT * FROM t") rows = cursor.fetchall() assert len(rows) == 1 assert rows[0][0] == 1 - # drop the subscriber_db from the list + # Drop the SUB_DB_NAME from the list TEST_DB_NAMES_NEW = [ { "name": "neondb", "owner": "cloud_admin", }, { - "name": "publisher_db", + "name": PUB_DB_NAME, "owner": "cloud_admin", }, ] @@ -230,7 +296,7 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv): "databases": TEST_DB_NAMES_NEW, }, "delta_operations": [ - {"action": "delete_db", "name": "subscriber_db"}, + {"action": "delete_db", "name": SUB_DB_NAME}, # also test the case when we try to delete a non-existent database # shouldn't happen in normal operation, # but can occur when failed operations are retried @@ -239,22 +305,22 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv): } ) - logging.info("Reconfiguring the endpoint to drop the subscriber_db") + logging.info(f"Reconfiguring the endpoint to drop the {SUB_DB_NAME} database") endpoint.reconfigure() - # Check that the subscriber_db is dropped + # Check that the SUB_DB_NAME is dropped with endpoint.cursor() as cursor: - cursor.execute("SELECT datname FROM pg_database WHERE datname = %s", ("subscriber_db",)) + cursor.execute("SELECT datname FROM pg_database WHERE datname = %s", (SUB_DB_NAME,)) catalog_db = cursor.fetchone() assert catalog_db is None - # Check that we can still connect to the publisher_db - with endpoint.cursor(dbname="publisher_db") as cursor: + # Check that we can still connect to the PUB_DB_NAME + with endpoint.cursor(dbname=PUB_DB_NAME) as cursor: cursor.execute("SELECT * FROM current_database()") curr_db = cursor.fetchone() assert curr_db is not None assert len(curr_db) == 1 - assert curr_db[0] == "publisher_db" + assert curr_db[0] == PUB_DB_NAME def test_compute_drop_role(neon_simple_env: NeonEnv): @@ -265,6 +331,7 @@ def test_compute_drop_role(neon_simple_env: 
NeonEnv): """ env = neon_simple_env TEST_DB_NAME = "db_with_permissions" + TEST_GRANTEE = "'); MALFORMED SQL $$ $x$ $/;5%$ %I" endpoint = env.endpoints.create_start("main") @@ -301,16 +368,18 @@ def test_compute_drop_role(neon_simple_env: NeonEnv): cursor.execute("create view test_view as select * from test_table") with endpoint.cursor(dbname=TEST_DB_NAME, user="neon") as cursor: - cursor.execute("create role readonly") + cursor.execute(f'create role "{TEST_GRANTEE}"') # We (`compute_ctl`) make 'neon' the owner of schema 'public' in the owned database. # Postgres has all sorts of permissions and grants that we may not handle well, # but this is the shortest repro grant for the issue # https://github.com/neondatabase/cloud/issues/13582 - cursor.execute("grant select on all tables in schema public to readonly") + cursor.execute(f'grant select on all tables in schema public to "{TEST_GRANTEE}"') # Check that role was created with endpoint.cursor() as cursor: - cursor.execute("SELECT rolname FROM pg_roles WHERE rolname = 'readonly'") + cursor.execute( + "SELECT rolname FROM pg_roles WHERE rolname = %(role)s", {"role": TEST_GRANTEE} + ) role = cursor.fetchone() assert role is not None @@ -318,7 +387,8 @@ def test_compute_drop_role(neon_simple_env: NeonEnv): # that may block our ability to drop the role. 
with endpoint.cursor(dbname=TEST_DB_NAME) as cursor: cursor.execute( - "select grantor from information_schema.role_table_grants where grantee = 'readonly'" + "select grantor from information_schema.role_table_grants where grantee = %(grantee)s", + {"grantee": TEST_GRANTEE}, ) res = cursor.fetchall() assert len(res) == 2, f"Expected 2 table grants, got {len(res)}" @@ -332,7 +402,7 @@ def test_compute_drop_role(neon_simple_env: NeonEnv): "delta_operations": [ { "action": "delete_role", - "name": "readonly", + "name": TEST_GRANTEE, }, ], } @@ -341,7 +411,9 @@ def test_compute_drop_role(neon_simple_env: NeonEnv): # Check that role is dropped with endpoint.cursor() as cursor: - cursor.execute("SELECT rolname FROM pg_roles WHERE rolname = 'readonly'") + cursor.execute( + "SELECT rolname FROM pg_roles WHERE rolname = %(role)s", {"role": TEST_GRANTEE} + ) role = cursor.fetchone() assert role is None diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index 7f12c14073..2ff525464d 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -137,6 +137,8 @@ def test_remote_extensions( metrics = parse_metrics(raw_metrics) remote_ext_requests = metrics.query_all( "compute_ctl_remote_ext_requests_total", + # Check that we properly report the filename in the metrics + {"filename": "anon.tar.zst"}, ) assert len(remote_ext_requests) == 1 for sample in remote_ext_requests: diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index 55fd7a8608..17ffeca23b 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -7,7 +7,7 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, ) from fixtures.pageserver.http import PageserverHttpClient -from fixtures.utils import wait_until +from fixtures.utils import run_only_on_default_postgres, wait_until def check_client(env: NeonEnv, 
client: PageserverHttpClient): @@ -138,3 +138,25 @@ def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilde with env.pageserver.http_client(auth_token=pageserver_token) as client: check_client(env, client) + + +@run_only_on_default_postgres("it does not use any postgres functionality") +def test_pageserver_http_index_part_force_patch(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + with env.pageserver.http_client() as client: + client.timeline_patch_index_part( + tenant_id, + timeline_id, + {"rel_size_migration": "migrating"}, + ) + assert client.timeline_detail(tenant_id, timeline_id)["rel_size_migration"] == "migrating" + # This is invalid in practice: we should never rollback the migrating state to legacy. + # But we do it here to test the API. + client.timeline_patch_index_part( + tenant_id, + timeline_id, + {"rel_size_migration": "legacy"}, + ) + assert client.timeline_detail(tenant_id, timeline_id)["rel_size_migration"] == "legacy" diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index a9b897b741..b9e2934505 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -938,9 +938,12 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): # Expect lots of layers assert len(ps_attached.list_layers(tenant_id, timeline_id)) > 10 - # Simulate large data by making layer downloads artifically slow for ps in env.pageservers: + # Simulate large data by making layer downloads artifically slow ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "return(1000)")]) + # Make the initial logical size calculation lie. Otherwise it on demand downloads + # layers and makes accounting difficult. 
+ ps.http_client().configure_failpoints(("skip-logical-size-calculation", "return")) def timeline_heatmap(tlid): assert env.pageserver_remote_storage is not None @@ -952,20 +955,16 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): raise RuntimeError(f"No heatmap for timeline: {tlid}") - # Upload a heatmap, so that secondaries have something to download - ps_attached.http_client().tenant_heatmap_upload(tenant_id) - heatmap_before_migration = timeline_heatmap(timeline_id) + def count_timeline_heatmap_layers(tlid) -> tuple[int, int]: + cold, hot = 0, 0 + layers = timeline_heatmap(tlid)["layers"] + for layer in layers: + if layer["cold"]: + cold += 1 + else: + hot += 1 - # This has no chance to succeed: we have lots of layers and each one takes at least 1000ms. - # However, it pulls the heatmap, which will be important later. - http_client = env.storage_controller.pageserver_api() - (status, progress) = http_client.tenant_secondary_download(tenant_id, wait_ms=4000) - assert status == 202 - assert progress["heatmap_mtime"] is not None - assert progress["layers_downloaded"] > 0 - assert progress["bytes_downloaded"] > 0 - assert progress["layers_total"] > progress["layers_downloaded"] - assert progress["bytes_total"] > progress["bytes_downloaded"] + return cold, hot env.storage_controller.allowed_errors.extend( [ @@ -975,6 +974,7 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): # Use a custom configuration that gives up earlier than usual. # We can't hydrate everything anyway because of the failpoints. + # Implicitly, this also uploads a heatmap from the current attached location. 
config = StorageControllerMigrationConfig( secondary_warmup_timeout="5s", secondary_download_request_timeout="2s" ) @@ -988,31 +988,33 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): ps_secondary.http_client().tenant_heatmap_upload(tenant_id) heatmap_after_migration = timeline_heatmap(timeline_id) - assert len(heatmap_before_migration["layers"]) > 0 + local_layers = ps_secondary.list_layers(tenant_id, timeline_id) + # We download 1 layer per second and give up within 5 seconds. + assert len(local_layers) < 10 after_migration_heatmap_layers_count = len(heatmap_after_migration["layers"]) - assert len(heatmap_before_migration["layers"]) <= after_migration_heatmap_layers_count - log.info(f"Heatmap size after cold migration is {after_migration_heatmap_layers_count}") env.storage_controller.download_heatmap_layers( TenantShardId(tenant_id, shard_number=0, shard_count=0), timeline_id ) - # Now simulate the case where a child timeline is archived, parent layers - # are evicted and the child is unarchived. When the child is unarchived, - # itself and the parent update their heatmaps to contain layers needed by the - # child. One can warm up the timeline hierarchy since the heatmaps are ready. 
- - def all_layers_downloaded(expected_layer_count: int): - local_layers_count = len(ps_secondary.list_layers(tenant_id, timeline_id)) + def all_layers_downloaded(node, expected_layer_count: int): + local_layers_count = len(node.list_layers(tenant_id, timeline_id)) log.info(f"{local_layers_count=} {after_migration_heatmap_layers_count=}") assert local_layers_count >= expected_layer_count - wait_until(lambda: all_layers_downloaded(after_migration_heatmap_layers_count)) - ps_secondary.http_client().tenant_heatmap_upload(tenant_id) + def no_layers_downloaded(node): + local_layers_count = len(node.list_layers(tenant_id, timeline_id)) + log.info(f"{local_layers_count=} {after_migration_heatmap_layers_count=}") + assert local_layers_count == 0 + + wait_until(lambda: all_layers_downloaded(ps_secondary, after_migration_heatmap_layers_count)) + + # Read everything and make sure that we're not downloading anything extra. + # All hot layers should be available locally now. before = ( ps_secondary.http_client() .get_metrics() @@ -1030,6 +1032,11 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): workload.stop() assert before == after + # Now simulate the case where a child timeline is archived, parent layers + # are evicted and the child is unarchived. When the child is unarchived, + # itself and the parent update their heatmaps to contain layers needed by the + # child. One can warm up the timeline hierarchy since the heatmaps are ready. 
+ def check_archival_state(state: TimelineArchivalState, tline): timelines = ( timeline["timeline_id"] @@ -1057,13 +1064,35 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): wait_until(lambda: check_archival_state(TimelineArchivalState.UNARCHIVED, child_timeline_id)) ps_secondary.http_client().tenant_heatmap_upload(tenant_id) - log.info(f"Parent timeline heatmap size: {len(timeline_heatmap(timeline_id)['layers'])}") - log.info(f"Child timeline heatmap size: {len(timeline_heatmap(child_timeline_id)['layers'])}") - expected_locally = len(timeline_heatmap(timeline_id)["layers"]) - assert expected_locally > 0 + parent_cold, parent_hot = count_timeline_heatmap_layers(timeline_id) + child_cold, child_hot = count_timeline_heatmap_layers(child_timeline_id) + + log.info(f"Parent timeline heatmap size: cold={parent_cold}, hot={parent_hot}") + log.info(f"Child timeline heatmap size: cold={child_cold}, hot={child_hot}") + + # All layers in the heatmap should come from the generation on unarchival. + # Hence, they should be cold. + assert parent_cold > 0 + assert parent_hot == 0 + + expected_locally = parent_cold env.storage_controller.download_heatmap_layers( - TenantShardId(tenant_id, shard_number=0, shard_count=0), timeline_id + TenantShardId(tenant_id, shard_number=0, shard_count=0), child_timeline_id, recurse=True ) - wait_until(lambda: all_layers_downloaded(expected_locally)) + wait_until(lambda: all_layers_downloaded(ps_secondary, expected_locally)) + + for ps in env.pageservers: + ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "off")]) + + # The uploaded heatmap is still empty. Clean up all layers on the secondary. + ps_attached.http_client().tenant_secondary_download(tenant_id, wait_ms=100) + wait_until(lambda: no_layers_downloaded(ps_attached)) + + # Upload a new heatmap. The previously cold layers become hot since they're now resident. 
+ ps_secondary.http_client().tenant_heatmap_upload(tenant_id) + + # Warm up the current secondary. + ps_attached.http_client().tenant_secondary_download(tenant_id, wait_ms=100) + wait_until(lambda: all_layers_downloaded(ps_secondary, expected_locally)) diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 6a76ad5ca8..1d9f385358 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -5,7 +5,7 @@ from __future__ import annotations from concurrent.futures import ThreadPoolExecutor from pathlib import Path -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING, Any, cast import pytest from fixtures.log_helper import log @@ -118,10 +118,20 @@ def post_checks(env: NeonEnv, test_output_dir: Path, db_name: str, endpoint: End pageserver.http_client().timeline_gc(shard, env.initial_timeline, None) +def patch_tenant_conf(tenant_conf: dict[str, Any], reldir_type: str) -> dict[str, Any]: + tenant_conf = tenant_conf.copy() + if reldir_type == "v2": + tenant_conf["rel_size_v2_enabled"] = "true" + else: + tenant_conf["rel_size_v2_enabled"] = "false" + return tenant_conf + + # Run the main PostgreSQL regression tests, in src/test/regress. 
# @pytest.mark.timeout(3000) # Contains many sub-tests, is slow in debug builds @pytest.mark.parametrize("shard_count", [None, 4]) +@pytest.mark.parametrize("reldir_type", ["v1", "v2"]) def test_pg_regress( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, @@ -130,6 +140,7 @@ def test_pg_regress( base_dir: Path, pg_distrib_dir: Path, shard_count: int | None, + reldir_type: str, ): DBNAME = "regression" @@ -142,7 +153,7 @@ def test_pg_regress( neon_env_builder.enable_pageserver_remote_storage(s3_storage()) env = neon_env_builder.init_start( - initial_tenant_conf=TENANT_CONF, + initial_tenant_conf=patch_tenant_conf(TENANT_CONF, reldir_type), initial_tenant_shard_count=shard_count, ) @@ -196,6 +207,7 @@ def test_pg_regress( # @pytest.mark.timeout(1500) # Contains many sub-tests, is slow in debug builds @pytest.mark.parametrize("shard_count", [None, 4]) +@pytest.mark.parametrize("reldir_type", ["v1", "v2"]) def test_isolation( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, @@ -204,6 +216,7 @@ def test_isolation( base_dir: Path, pg_distrib_dir: Path, shard_count: int | None, + reldir_type: str, ): DBNAME = "isolation_regression" @@ -211,7 +224,8 @@ def test_isolation( neon_env_builder.num_pageservers = shard_count neon_env_builder.enable_pageserver_remote_storage(s3_storage()) env = neon_env_builder.init_start( - initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count + initial_tenant_conf=patch_tenant_conf(TENANT_CONF, reldir_type), + initial_tenant_shard_count=shard_count, ) # Connect to postgres and create a database called "regression". @@ -267,6 +281,7 @@ def test_isolation( # Run extra Neon-specific pg_regress-based tests. The tests and their # schedule file are in the sql_regress/ directory. 
@pytest.mark.parametrize("shard_count", [None, 4]) +@pytest.mark.parametrize("reldir_type", ["v1", "v2"]) def test_sql_regress( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, @@ -275,6 +290,7 @@ def test_sql_regress( base_dir: Path, pg_distrib_dir: Path, shard_count: int | None, + reldir_type: str, ): DBNAME = "regression" @@ -282,7 +298,8 @@ def test_sql_regress( neon_env_builder.num_pageservers = shard_count neon_env_builder.enable_pageserver_remote_storage(s3_storage()) env = neon_env_builder.init_start( - initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count + initial_tenant_conf=patch_tenant_conf(TENANT_CONF, reldir_type), + initial_tenant_shard_count=shard_count, ) # Connect to postgres and create a database called "regression". @@ -345,9 +362,7 @@ def test_tx_abort_with_many_relations( """ env = neon_env_builder.init_start( - initial_tenant_conf={ - "rel_size_v2_enabled": "true" if reldir_type == "v2" else "false", - } + initial_tenant_conf=patch_tenant_conf({}, reldir_type), ) ep = env.endpoints.create_start( "main", @@ -358,14 +373,25 @@ def test_tx_abort_with_many_relations( ], ) + if reldir_type == "v1": + assert ( + env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[ + "rel_size_migration" + ] + == "legacy" + ) + else: + assert ( + env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[ + "rel_size_migration" + ] + != "legacy" + ) + # How many relations: this number is tuned to be long enough to take tens of seconds # if the rollback code path is buggy, tripping the test's timeout. 
- if reldir_type == "v1": - n = 4000 - step = 4000 - else: - n = 20000 - step = 5000 + n = 5000 + step = 2500 def create(): # Create many relations diff --git a/test_runner/regress/test_relations.py b/test_runner/regress/test_relations.py index 3e29c92a96..07eacfc775 100644 --- a/test_runner/regress/test_relations.py +++ b/test_runner/regress/test_relations.py @@ -19,6 +19,17 @@ def test_pageserver_reldir_v2( endpoint.safe_psql("CREATE TABLE foo1 (id INTEGER PRIMARY KEY, val text)") endpoint.safe_psql("CREATE TABLE foo2 (id INTEGER PRIMARY KEY, val text)") + assert ( + env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[ + "rel_size_migration" + ] + == "legacy" + ) + + # Ensure the pageserver accepts the table creation SQLs before the migration. In theory, we can also do + # a "wait_flush_lsn" here, but it's easier to just do a restart. + env.pageserver.restart() + # Switch to v2 env.pageserver.http_client().update_tenant_config( env.initial_tenant, @@ -27,6 +38,13 @@ def test_pageserver_reldir_v2( }, ) + assert ( + env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[ + "rel_size_migration" + ] + == "legacy" + ) + # Check if both relations are still accessible endpoint.safe_psql("SELECT * FROM foo1") endpoint.safe_psql("SELECT * FROM foo2") @@ -41,12 +59,14 @@ def test_pageserver_reldir_v2( # Create a relation in v2 endpoint.safe_psql("CREATE TABLE foo3 (id INTEGER PRIMARY KEY, val text)") + endpoint.safe_psql("CREATE TABLE foo4 (id INTEGER PRIMARY KEY, val text)") # Delete a relation in v1 endpoint.safe_psql("DROP TABLE foo1") # Check if both relations are still accessible endpoint.safe_psql("SELECT * FROM foo2") endpoint.safe_psql("SELECT * FROM foo3") + endpoint.safe_psql("SELECT * FROM foo4") # Restart the endpoint endpoint.stop() @@ -57,7 +77,7 @@ def test_pageserver_reldir_v2( endpoint.safe_psql("DROP TABLE IF EXISTS foo1") endpoint.safe_psql("SELECT * FROM foo2") endpoint.safe_psql("SELECT 
* FROM foo3") - + endpoint.safe_psql("SELECT * FROM foo4") endpoint.safe_psql("DROP TABLE foo3") endpoint.stop() endpoint.start() @@ -66,3 +86,25 @@ def test_pageserver_reldir_v2( endpoint.safe_psql("DROP TABLE IF EXISTS foo1") endpoint.safe_psql("SELECT * FROM foo2") endpoint.safe_psql("DROP TABLE IF EXISTS foo3") + endpoint.safe_psql("SELECT * FROM foo4") + + # Set the config to false to emulate the case where the config is not persisted when the tenant gets detached/attached. + env.pageserver.http_client().update_tenant_config( + env.initial_tenant, + { + "rel_size_v2_enabled": False, + }, + ) + + # Check if the relation is still accessible + endpoint.safe_psql("SELECT * FROM foo2") + endpoint.safe_psql("SELECT * FROM foo4") + + env.pageserver.restart() + + assert ( + env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[ + "rel_size_migration" + ] + == "migrating" + ) diff --git a/test_runner/regress/test_safekeeper_deletion.py b/test_runner/regress/test_safekeeper_deletion.py new file mode 100644 index 0000000000..b46095d583 --- /dev/null +++ b/test_runner/regress/test_safekeeper_deletion.py @@ -0,0 +1,331 @@ +from __future__ import annotations + +import threading +import time +from contextlib import closing +from enum import StrEnum + +import pytest +import requests +from fixtures.common_types import Lsn, TimelineId +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + Endpoint, + NeonEnvBuilder, +) +from fixtures.remote_storage import S3Storage, s3_storage +from fixtures.safekeeper_utils import is_segment_offloaded +from fixtures.utils import wait_until + + +@pytest.mark.parametrize("auth_enabled", [False, True]) +def test_safekeeper_delete_timeline(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): + neon_env_builder.auth_enabled = auth_enabled + env = neon_env_builder.init_start() + + # FIXME: are these expected? 
+ env.pageserver.allowed_errors.extend( + [ + ".*Timeline .* was not found in global map.*", + ".*Timeline .* was cancelled and cannot be used anymore.*", + ] + ) + + # Create two tenants: one will be deleted, other should be preserved. + tenant_id = env.initial_tenant + timeline_id_1 = env.create_branch("br1") # Active, delete explicitly + timeline_id_2 = env.create_branch("br2") # Inactive, delete explicitly + timeline_id_3 = env.create_branch("br3") # Active, delete with the tenant + timeline_id_4 = env.create_branch("br4") # Inactive, delete with the tenant + + tenant_id_other, timeline_id_other = env.create_tenant() + + # Populate branches + endpoint_1 = env.endpoints.create_start("br1") + endpoint_2 = env.endpoints.create_start("br2") + endpoint_3 = env.endpoints.create_start("br3") + endpoint_4 = env.endpoints.create_start("br4") + endpoint_other = env.endpoints.create_start("main", tenant_id=tenant_id_other) + for endpoint in [endpoint_1, endpoint_2, endpoint_3, endpoint_4, endpoint_other]: + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CREATE TABLE t(key int primary key)") + sk = env.safekeepers[0] + sk_data_dir = sk.data_dir + if not auth_enabled: + sk_http = sk.http_client() + sk_http_other = sk_http + else: + sk_http = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) + sk_http_other = sk.http_client( + auth_token=env.auth_keys.generate_tenant_token(tenant_id_other) + ) + sk_http_noauth = sk.http_client(gen_sk_wide_token=False) + assert (sk_data_dir / str(tenant_id) / str(timeline_id_1)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Stop branches which should be inactive and restart Safekeeper to drop its in-memory state. 
+ endpoint_2.stop_and_destroy() + endpoint_4.stop_and_destroy() + sk.stop() + sk.start() + + # Ensure connections to Safekeeper are established + for endpoint in [endpoint_1, endpoint_3, endpoint_other]: + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("INSERT INTO t (key) VALUES (1)") + + # Stop all computes gracefully before safekeepers stop responding to them + endpoint_1.stop_and_destroy() + endpoint_3.stop_and_destroy() + + # Remove initial tenant's br1 (active) + assert sk_http.timeline_delete(tenant_id, timeline_id_1)["dir_existed"] + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Ensure repeated deletion succeeds + assert not sk_http.timeline_delete(tenant_id, timeline_id_1)["dir_existed"] + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + if auth_enabled: + # Ensure we cannot delete the other tenant + for sk_h in [sk_http, sk_http_noauth]: + with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"): + assert sk_h.timeline_delete(tenant_id_other, timeline_id_other) + with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"): + assert sk_h.tenant_delete_force(tenant_id_other) + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Remove initial tenant's br2 (inactive) + assert sk_http.timeline_delete(tenant_id, timeline_id_2)["dir_existed"] + 
assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Remove non-existing branch, should succeed + assert not sk_http.timeline_delete(tenant_id, TimelineId("00" * 16))["dir_existed"] + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Remove initial tenant fully (two branches are active) + response = sk_http.tenant_delete_force(tenant_id) + assert response[str(timeline_id_3)]["dir_existed"] + assert not (sk_data_dir / str(tenant_id)).exists() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Remove initial tenant again. + response = sk_http.tenant_delete_force(tenant_id) + # assert response == {} + assert not (sk_data_dir / str(tenant_id)).exists() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Ensure the other tenant still works + sk_http_other.timeline_status(tenant_id_other, timeline_id_other) + with closing(endpoint_other.connect()) as conn: + with conn.cursor() as cur: + cur.execute("INSERT INTO t (key) VALUES (123)") + + +def test_safekeeper_delete_timeline_under_load(neon_env_builder: NeonEnvBuilder): + """ + Test deleting timelines on a safekeeper while they're under load. 
+ + This should not happen under normal operation, but it can happen if + there is some rogue compute/pageserver that is writing/reading to a + safekeeper that we're migrating a timeline away from, or if the timeline + is being deleted while such a rogue client is running. + """ + neon_env_builder.auth_enabled = True + env = neon_env_builder.init_start() + + # Create two endpoints that will generate load + timeline_id_a = env.create_branch("deleteme_a") + timeline_id_b = env.create_branch("deleteme_b") + + endpoint_a = env.endpoints.create("deleteme_a") + endpoint_a.start() + endpoint_b = env.endpoints.create("deleteme_b") + endpoint_b.start() + + # Get tenant and timeline IDs + tenant_id = env.initial_tenant + + # Start generating load on both timelines + def generate_load(endpoint: Endpoint): + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CREATE TABLE IF NOT EXISTS t(key int, value text)") + while True: + try: + cur.execute("INSERT INTO t SELECT generate_series(1,1000), 'data'") + except: # noqa + # Ignore errors since timeline may be deleted + break + + t_a = threading.Thread(target=generate_load, args=(endpoint_a,)) + t_b = threading.Thread(target=generate_load, args=(endpoint_b,)) + try: + t_a.start() + t_b.start() + + # Let the load run for a bit + log.info("Warming up...") + time.sleep(2) + + # Safekeeper errors will propagate to the pageserver: it is correct that these are + # logged at error severity because they indicate the pageserver is trying to read + # a timeline that it shouldn't. 
+ env.pageserver.allowed_errors.extend( + [ + ".*Timeline.*was cancelled.*", + ".*Timeline.*was not found.*", + ] + ) + + # Try deleting timelines while under load + sk = env.safekeepers[0] + sk_http = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) + + # Delete first timeline + log.info(f"Deleting {timeline_id_a}...") + assert sk_http.timeline_delete(tenant_id, timeline_id_a, only_local=True)["dir_existed"] + + # Delete second timeline + log.info(f"Deleting {timeline_id_b}...") + assert sk_http.timeline_delete(tenant_id, timeline_id_b, only_local=True)["dir_existed"] + + # Verify timelines are gone from disk + sk_data_dir = sk.data_dir + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_a)).exists() + # assert not (sk_data_dir / str(tenant_id) / str(timeline_id_b)).exists() + + finally: + log.info("Stopping endpoints...") + # Stop endpoints with immediate mode because we deleted the timeline out from under the compute, which may cause it to hang + endpoint_a.stop(mode="immediate") + endpoint_b.stop(mode="immediate") + log.info("Joining threads...") + t_a.join() + t_b.join() + + +class RemoteDeleteFailpoint(StrEnum): + PAUSE = "sk-delete-timeline-remote-pause" + FAIL = "sk-delete-timeline-remote" + + +@pytest.mark.parametrize("failpoint", [RemoteDeleteFailpoint.PAUSE, RemoteDeleteFailpoint.FAIL]) +def test_safekeeper_delete_remote_errors( + neon_env_builder: NeonEnvBuilder, failpoint: RemoteDeleteFailpoint +): + """ + Test that errors and delays during remote deletion are handled correctly. 
+ """ + + # Configure safekeepers with ultra-fast eviction policy + neon_env_builder.safekeeper_extra_opts = [ + "--enable-offload", + "--delete-offloaded-wal", + "--control-file-save-interval", + "1s", + ] + neon_env_builder.enable_safekeeper_remote_storage(s3_storage()) + env = neon_env_builder.init_start() + + # FIXME: pageserver is intermittently emitting this + env.pageserver.allowed_errors.extend( + [ + ".*unsupported command START_WAL_PUSH in START_WAL_PUSH.*", + ] + ) + + timeline_id_a = env.create_branch("deleteme_a") + endpoint_a = env.endpoints.create("deleteme_a") + endpoint_a.start() + with closing(endpoint_a.connect()) as conn: + with conn.cursor() as cur: + # roughly fills one segment + cur.execute("create table t(key int, value text)") + cur.execute("insert into t select generate_series(1,250000), 'payload'") + endpoint_a.stop() + + # Ensure something is uploaded to remote storage + def assert_is_uploaded(): + assert is_segment_offloaded( + env.safekeepers[0], env.initial_tenant, timeline_id_a, Lsn("0/2000000") + ) + + wait_until(assert_is_uploaded) + + def list_timeline_remote(): + assert isinstance(env.safekeepers_remote_storage, S3Storage) + prefix = f"{env.safekeepers_remote_storage.safekeeper_timeline_path(env.initial_tenant, timeline_id_a)}/" + + listing = env.safekeepers_remote_storage.client.list_objects_v2( + Bucket=env.safekeepers_remote_storage.bucket_name, + Prefix=prefix, + ) + return listing.get("Contents", []) + + assert list_timeline_remote() != [] + + sk_http = env.safekeepers[0].http_client() + env.pageserver.http_client().timeline_delete(env.initial_tenant, timeline_id_a) + + # Set up failpoint + if failpoint == RemoteDeleteFailpoint.PAUSE: + sk_http.configure_failpoints((failpoint, "pause")) + elif failpoint == RemoteDeleteFailpoint.FAIL: + sk_http.configure_failpoints((failpoint, "return")) + else: + raise NotImplementedError(f"Unknown failpoint: {failpoint}") + + # Delete the timeline - this should hit the configured failpoint 
+ if failpoint == RemoteDeleteFailpoint.PAUSE: + # Expect time out + with pytest.raises(requests.exceptions.ReadTimeout, match="timed out"): + sk_http.timeline_delete(env.initial_tenant, timeline_id_a, timeout=5) + + # Assert deletion didn't happen yet + assert list_timeline_remote() != [] + + # Unblock the background task that should still be running + sk_http.configure_failpoints((failpoint, "off")) + + # Expect that after unblocking, remote deletion proceeds + def assert_remote_deleted(): + assert list_timeline_remote() == [] + + wait_until(assert_remote_deleted) + + elif failpoint == RemoteDeleteFailpoint.FAIL: + # Expect immediate failure + with pytest.raises(sk_http.HTTPError, match="Internal Server Error"): + sk_http.timeline_delete(env.initial_tenant, timeline_id_a) + + sk_http.configure_failpoints((failpoint, "off")) + else: + raise NotImplementedError(f"Unknown failpoint: {failpoint}") + + # Retry should succeed + sk_http.timeline_delete(env.initial_tenant, timeline_id_a) + + # Remote storage should be empty + assert list_timeline_remote() == [] diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index f58bbcd3c0..cb28f5b12d 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1814,14 +1814,3 @@ def test_sharding_gc( shard_gc_cutoff_lsn = Lsn(shard_index["metadata_bytes"]["latest_gc_cutoff_lsn"]) log.info(f"Shard {shard_number} cutoff LSN: {shard_gc_cutoff_lsn}") assert shard_gc_cutoff_lsn == shard_0_gc_cutoff_lsn - - for ps in env.pageservers: - # This is not okay, but it's not a scrubber bug: it's a pageserver issue that is exposed by - # the specific pattern of aggressive checkpointing+image layer generation + GC that this test does.
- # TODO: remove when https://github.com/neondatabase/neon/issues/10720 is fixed - ps.allowed_errors.extend( - [ - ".*could not find data for key.*", - ".*could not ingest record.*", - ] - ) diff --git a/test_runner/regress/test_signed_char.py b/test_runner/regress/test_signed_char.py new file mode 100644 index 0000000000..8752a1ff3f --- /dev/null +++ b/test_runner/regress/test_signed_char.py @@ -0,0 +1,64 @@ +from pathlib import Path + +from fixtures.neon_fixtures import NeonEnv + +SIGNED_CHAR_EXTRACT = """ + WITH + -- Generates an intermediate table with block numbers of the index + pagenumbers AS ( + SELECT num FROM generate_series(0, (pg_relation_size('test_payload_idx') / 8192) - 1) it(num) + ) + SELECT num, + -- Gets the data of the page, skipping the first 8 bytes which is the LSN + substr(page, 9, 8192-8), + -- Returns information about the GIN index opaque area + (gin_page_opaque_info(page)).* + FROM pagenumbers, + -- Gets a page from the respective blocks of the table + LATERAL (SELECT get_raw_page('test_payload_idx', num)) AS p(page) + -- Filters to only return leaf pages from the GIN Index + WHERE ARRAY['leaf'] = ((gin_page_opaque_info(page)).flags); + """ + + +def test_signed_char(neon_simple_env: NeonEnv): + """ + Test that postgres was compiled with -fsigned-char. + --- + In multi-character keys, the GIN index creates a CRC Hash of the first 3 bytes of the key. + The hash can have the first bit to be set or unset, needing to have a consistent representation + of char across architectures for consistent results. GIN stores these keys by their hashes + which determines the order in which the keys are obtained from the GIN index. + Using -fsigned-char enforces this order across platforms making this consistent. + The following query gets all the data present in the leaf page of a GIN index, + which is ordered by the CRC hash and is consistent across platforms. 
+ """ + env = neon_simple_env + endpoint = env.endpoints.create_start("main") + + with endpoint.connect().cursor() as ses1: + # Add the required extensions + ses1.execute("CREATE EXTENSION pg_trgm;") + ses1.execute("CREATE EXTENSION pageinspect;") + # Create a test table + ses1.execute("CREATE TABLE test (payload text);") + # Create a GIN based index + ses1.execute( + "CREATE INDEX test_payload_idx ON test USING gin (payload gin_trgm_ops) WITH (gin_pending_list_limit = 64);" + ) + # insert a multibyte character to trigger order-dependent hashing + ses1.execute( + "INSERT INTO test SELECT '123456789BV' || CHR(127153) /* ace of spades, a multibyte character */ || i::text from generate_series(1, 40) as i(i);" + ) + ses1.execute("INSERT INTO test SELECT 'Bóbr';") + # Clean pending list to flush data to pages + ses1.execute("select gin_clean_pending_list('test_payload_idx'::regclass);") + ses1.execute(SIGNED_CHAR_EXTRACT) + pages = ses1.fetchall() + # Compare expected output + page1 = pages[0] + data = bytes(page1[1]).hex() + with open(Path(__file__).parent / "data" / "test_signed_char.out", encoding="utf-8") as f: + expected = f.read().rstrip() + + assert data == expected diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index d44c176b35..0f4e5688a9 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -312,17 +312,6 @@ def test_scrubber_physical_gc_ancestors(neon_env_builder: NeonEnvBuilder, shard_ drop_local_state(env, tenant_id) workload.validate() - for ps in env.pageservers: - # This is not okay, but it's not a scrubber bug: it's a pageserver issue that is exposed by - # the specific pattern of aggressive checkpointing+image layer generation + GC that this test does. 
- # TODO: remove when https://github.com/neondatabase/neon/issues/10720 is fixed - ps.allowed_errors.extend( - [ - ".*could not find data for key.*", - ".*could not ingest record.*", - ] - ) - def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder): """ diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index 4865178ca8..b30c02e0e4 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -327,9 +327,9 @@ def test_check_visibility_map(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): log.info(f"pgbench run {i+1}/{PGBENCH_RUNS}") endpoint.safe_psql(f"create database {dbname}") connstr = endpoint.connstr(dbname=dbname) - # pgbench -i will automatically vacuum the tables. This creates the visibility map. - pg_bin.run(["pgbench", "-i", "-s", "10", connstr]) - # Freeze the tuples to set the initial frozen bit. + # Initialize the data set, but don't vacuum yet. + pg_bin.run(["pgbench", "-i", "-s", "8", "-n", connstr]) + # Vacuum to create the visibility map, and freeze the tuples to set the frozen bit. endpoint.safe_psql("vacuum freeze", dbname=dbname) # Run pgbench. pg_bin.run(["pgbench", "-c", "32", "-j", "8", "-T", "10", connstr]) @@ -354,19 +354,3 @@ def test_check_visibility_map(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): row = cur.fetchone() assert row is not None assert row[0] == 0, f"{row[0]} inconsistent VM pages (frozen)" - - # Vacuum and freeze the tables, and check that the visibility map is still accurate. 
- for dbname in dbnames: - log.info(f"Vacuuming and checking visibility map for {dbname}") - with endpoint.cursor(dbname=dbname) as cur: - cur.execute("vacuum freeze") - - cur.execute("select count(*) from pg_check_visible('pgbench_accounts')") - row = cur.fetchone() - assert row is not None - assert row[0] == 0, f"{row[0]} inconsistent VM pages (visible)" - - cur.execute("select count(*) from pg_check_frozen('pgbench_accounts')") - row = cur.fetchone() - assert row is not None - assert row[0] == 0, f"{row[0]} inconsistent VM pages (frozen)" diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 0a05189bfb..0366e88389 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -27,7 +27,6 @@ from fixtures.metrics import parse_metrics from fixtures.neon_fixtures import ( Endpoint, NeonEnvBuilder, - NeonPageserver, PgBin, PgProtocol, Safekeeper, @@ -38,8 +37,6 @@ from fixtures.pageserver.utils import ( assert_prefix_empty, assert_prefix_not_empty, timeline_delete_wait_completed, - wait_for_last_record_lsn, - wait_for_upload, ) from fixtures.pg_version import PgVersion from fixtures.port_distributor import PortDistributor @@ -55,9 +52,16 @@ from fixtures.safekeeper.http import ( TimelineCreateRequest, ) from fixtures.safekeeper.utils import wait_walreceivers_absent +from fixtures.safekeeper_utils import ( + is_flush_lsn_caught_up, + is_segment_offloaded, + is_wal_trimmed, + wait_lsn_force_checkpoint, + wait_lsn_force_checkpoint_at, + wait_lsn_force_checkpoint_at_sk, +) from fixtures.utils import ( PropagatingThread, - get_dir_size, query_scalar, run_only_on_default_postgres, skip_in_debug_build, @@ -69,68 +73,6 @@ if TYPE_CHECKING: from typing import Any, Self -def wait_lsn_force_checkpoint( - tenant_id: TenantId, - timeline_id: TimelineId, - endpoint: Endpoint, - ps: NeonPageserver, - pageserver_conn_options=None, -): - pageserver_conn_options = pageserver_conn_options or {} - 
lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) - log.info(f"pg_current_wal_flush_lsn is {lsn}, waiting for it on pageserver") - - wait_lsn_force_checkpoint_at(lsn, tenant_id, timeline_id, ps, pageserver_conn_options) - - -def wait_lsn_force_checkpoint_at_sk( - safekeeper: Safekeeper, - tenant_id: TenantId, - timeline_id: TimelineId, - ps: NeonPageserver, - pageserver_conn_options=None, -): - sk_flush_lsn = safekeeper.get_flush_lsn(tenant_id, timeline_id) - wait_lsn_force_checkpoint_at(sk_flush_lsn, tenant_id, timeline_id, ps, pageserver_conn_options) - - -def wait_lsn_force_checkpoint_at( - lsn: Lsn, - tenant_id: TenantId, - timeline_id: TimelineId, - ps: NeonPageserver, - pageserver_conn_options=None, -): - """ - Wait until pageserver receives given lsn, force checkpoint and wait for - upload, i.e. remote_consistent_lsn advancement. - """ - pageserver_conn_options = pageserver_conn_options or {} - - auth_token = None - if "password" in pageserver_conn_options: - auth_token = pageserver_conn_options["password"] - - # wait for the pageserver to catch up - wait_for_last_record_lsn( - ps.http_client(auth_token=auth_token), - tenant_id, - timeline_id, - lsn, - ) - - # force checkpoint to advance remote_consistent_lsn - ps.http_client(auth_token).timeline_checkpoint(tenant_id, timeline_id) - - # ensure that remote_consistent_lsn is advanced - wait_for_upload( - ps.http_client(auth_token=auth_token), - tenant_id, - timeline_id, - lsn, - ) - - @dataclass class TimelineMetrics: timeline_id: TimelineId @@ -475,31 +417,6 @@ def wait(f, desc, timeout=30, wait_f=None): wait_f() -def is_segment_offloaded( - sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, seg_end: Lsn -): - http_cli = sk.http_client() - tli_status = http_cli.timeline_status(tenant_id, timeline_id) - log.info(f"sk status is {tli_status}") - return tli_status.backup_lsn >= seg_end - - -def is_flush_lsn_caught_up(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, 
lsn: Lsn): - http_cli = sk.http_client() - tli_status = http_cli.timeline_status(tenant_id, timeline_id) - log.info(f"sk status is {tli_status}") - return tli_status.flush_lsn >= lsn - - -def is_wal_trimmed(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, target_size_mb): - http_cli = sk.http_client() - tli_status = http_cli.timeline_status(tenant_id, timeline_id) - sk_wal_size = get_dir_size(sk.timeline_dir(tenant_id, timeline_id)) - sk_wal_size_mb = sk_wal_size / 1024 / 1024 - log.info(f"Safekeeper id={sk.id} wal_size={sk_wal_size_mb:.2f}MB status={tli_status}") - return sk_wal_size_mb <= target_size_mb - - def test_wal_backup(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 remote_storage_kind = s3_storage() @@ -1685,214 +1602,6 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): show_statuses(env.safekeepers, tenant_id, timeline_id) -@pytest.mark.parametrize("auth_enabled", [False, True]) -def test_delete(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): - neon_env_builder.auth_enabled = auth_enabled - env = neon_env_builder.init_start() - - # FIXME: are these expected? - env.pageserver.allowed_errors.extend( - [ - ".*Timeline .* was not found in global map.*", - ".*Timeline .* was cancelled and cannot be used anymore.*", - ] - ) - - # Create two tenants: one will be deleted, other should be preserved. 
- tenant_id = env.initial_tenant - timeline_id_1 = env.create_branch("br1") # Active, delete explicitly - timeline_id_2 = env.create_branch("br2") # Inactive, delete explicitly - timeline_id_3 = env.create_branch("br3") # Active, delete with the tenant - timeline_id_4 = env.create_branch("br4") # Inactive, delete with the tenant - - tenant_id_other, timeline_id_other = env.create_tenant() - - # Populate branches - endpoint_1 = env.endpoints.create_start("br1") - endpoint_2 = env.endpoints.create_start("br2") - endpoint_3 = env.endpoints.create_start("br3") - endpoint_4 = env.endpoints.create_start("br4") - endpoint_other = env.endpoints.create_start("main", tenant_id=tenant_id_other) - for endpoint in [endpoint_1, endpoint_2, endpoint_3, endpoint_4, endpoint_other]: - with closing(endpoint.connect()) as conn: - with conn.cursor() as cur: - cur.execute("CREATE TABLE t(key int primary key)") - sk = env.safekeepers[0] - sk_data_dir = sk.data_dir - if not auth_enabled: - sk_http = sk.http_client() - sk_http_other = sk_http - else: - sk_http = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) - sk_http_other = sk.http_client( - auth_token=env.auth_keys.generate_tenant_token(tenant_id_other) - ) - sk_http_noauth = sk.http_client(gen_sk_wide_token=False) - assert (sk_data_dir / str(tenant_id) / str(timeline_id_1)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - # Stop branches which should be inactive and restart Safekeeper to drop its in-memory state. 
- endpoint_2.stop_and_destroy() - endpoint_4.stop_and_destroy() - sk.stop() - sk.start() - - # Ensure connections to Safekeeper are established - for endpoint in [endpoint_1, endpoint_3, endpoint_other]: - with closing(endpoint.connect()) as conn: - with conn.cursor() as cur: - cur.execute("INSERT INTO t (key) VALUES (1)") - - # Stop all computes gracefully before safekeepers stop responding to them - endpoint_1.stop_and_destroy() - endpoint_3.stop_and_destroy() - - # Remove initial tenant's br1 (active) - assert sk_http.timeline_delete(tenant_id, timeline_id_1)["dir_existed"] - assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - # Ensure repeated deletion succeeds - assert not sk_http.timeline_delete(tenant_id, timeline_id_1)["dir_existed"] - assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - if auth_enabled: - # Ensure we cannot delete the other tenant - for sk_h in [sk_http, sk_http_noauth]: - with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"): - assert sk_h.timeline_delete(tenant_id_other, timeline_id_other) - with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"): - assert sk_h.tenant_delete_force(tenant_id_other) - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - # Remove initial tenant's br2 (inactive) - assert sk_http.timeline_delete(tenant_id, timeline_id_2)["dir_existed"] - 
assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() - assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - # Remove non-existing branch, should succeed - assert not sk_http.timeline_delete(tenant_id, TimelineId("00" * 16))["dir_existed"] - assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() - assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).exists() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - # Remove initial tenant fully (two branches are active) - response = sk_http.tenant_delete_force(tenant_id) - assert response[str(timeline_id_3)]["dir_existed"] - assert not (sk_data_dir / str(tenant_id)).exists() - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - # Remove initial tenant again. - response = sk_http.tenant_delete_force(tenant_id) - # assert response == {} - assert not (sk_data_dir / str(tenant_id)).exists() - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - # Ensure the other tenant still works - sk_http_other.timeline_status(tenant_id_other, timeline_id_other) - with closing(endpoint_other.connect()) as conn: - with conn.cursor() as cur: - cur.execute("INSERT INTO t (key) VALUES (123)") - - -def test_delete_timeline_under_load(neon_env_builder: NeonEnvBuilder): - """ - Test deleting timelines on a safekeeper while they're under load. 
- - This should not happen under normal operation, but it can happen if - there is some rogue compute/pageserver that is writing/reading to a - safekeeper that we're migrating a timeline away from, or if the timeline - is being deleted while such a rogue client is running. - """ - neon_env_builder.auth_enabled = True - env = neon_env_builder.init_start() - - # Create two endpoints that will generate load - timeline_id_a = env.create_branch("deleteme_a") - timeline_id_b = env.create_branch("deleteme_b") - - endpoint_a = env.endpoints.create("deleteme_a") - endpoint_a.start() - endpoint_b = env.endpoints.create("deleteme_b") - endpoint_b.start() - - # Get tenant and timeline IDs - tenant_id = env.initial_tenant - - # Start generating load on both timelines - def generate_load(endpoint: Endpoint): - with closing(endpoint.connect()) as conn: - with conn.cursor() as cur: - cur.execute("CREATE TABLE IF NOT EXISTS t(key int, value text)") - while True: - try: - cur.execute("INSERT INTO t SELECT generate_series(1,1000), 'data'") - except: # noqa - # Ignore errors since timeline may be deleted - break - - t_a = threading.Thread(target=generate_load, args=(endpoint_a,)) - t_b = threading.Thread(target=generate_load, args=(endpoint_b,)) - try: - t_a.start() - t_b.start() - - # Let the load run for a bit - log.info("Warming up...") - time.sleep(2) - - # Safekeeper errors will propagate to the pageserver: it is correct that these are - # logged at error severity because they indicate the pageserver is trying to read - # a timeline that it shouldn't. 
- env.pageserver.allowed_errors.extend( - [ - ".*Timeline.*was cancelled.*", - ".*Timeline.*was not found.*", - ] - ) - - # Try deleting timelines while under load - sk = env.safekeepers[0] - sk_http = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) - - # Delete first timeline - log.info(f"Deleting {timeline_id_a}...") - assert sk_http.timeline_delete(tenant_id, timeline_id_a, only_local=True)["dir_existed"] - - # Delete second timeline - log.info(f"Deleting {timeline_id_b}...") - assert sk_http.timeline_delete(tenant_id, timeline_id_b, only_local=True)["dir_existed"] - - # Verify timelines are gone from disk - sk_data_dir = sk.data_dir - assert not (sk_data_dir / str(tenant_id) / str(timeline_id_a)).exists() - # assert not (sk_data_dir / str(tenant_id) / str(timeline_id_b)).exists() - - finally: - log.info("Stopping endpoints...") - # Stop endpoints with immediate mode because we deleted the timeline out from under the compute, which may cause it to hang - endpoint_a.stop(mode="immediate") - endpoint_b.stop(mode="immediate") - log.info("Joining threads...") - t_a.join() - t_b.join() - - # Basic pull_timeline test. # When live_sk_change is False, compute is restarted to change set of # safekeepers; otherwise it is live reload. @@ -2281,6 +1990,54 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder): http_cli.timeline_status(tenant_id, timeline_id) +def test_explicit_timeline_creation(neon_env_builder: NeonEnvBuilder): + """ + Test that having neon.safekeepers starting with g#n: with non zero n enables + generations, which as a side effect disables automatic timeline creation. + + This is kind of bootstrapping test: here membership conf & timeline is + created manually, later storcon will do that. 
+ """ + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + ps = env.pageservers[0] + ps_http_cli = ps.http_client() + + http_clis = [sk.http_client() for sk in env.safekeepers] + + config_lines = [ + "neon.safekeeper_proto_version = 3", + ] + ep = env.endpoints.create("main", config_lines=config_lines) + + # expected to fail because timeline is not created on safekeepers + with pytest.raises(Exception, match=r".*timed out.*"): + ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3], timeout="2s") + # figure out initial LSN. + ps_timeline_detail = ps_http_cli.timeline_detail(tenant_id, timeline_id) + init_lsn = ps_timeline_detail["last_record_lsn"] + log.info(f"initial LSN: {init_lsn}") + # sk timeline creation request expects minor version + pg_version = ps_timeline_detail["pg_version"] * 10000 + # create inital mconf + sk_ids = [SafekeeperId(sk.id, "localhost", sk.port.pg_tenant_only) for sk in env.safekeepers] + mconf = Configuration(generation=1, members=sk_ids, new_members=None) + create_r = TimelineCreateRequest( + tenant_id, timeline_id, mconf, pg_version, Lsn(init_lsn), commit_lsn=None + ) + log.info(f"sending timeline create: {create_r.to_json()}") + + for sk_http_cli in http_clis: + sk_http_cli.timeline_create(create_r) + # Once timeline created endpoint should start. + ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3]) + ep.safe_psql("CREATE TABLE IF NOT EXISTS t(key int, value text)") + + # In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries # when compute is active, but there are no writes to the timeline. 
In that case # pageserver should maintain a single connection to safekeeper and don't attempt diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 6254ab9b44..b1425505c6 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 6254ab9b4496c3e481bc037ae69d859bbc2bdd7d +Subproject commit b1425505c6f9a622a5aadf3ee362740519993310 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 9b118b1cff..533be42f7d 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 9b118b1cffa6e4ca0d63389b57b54d11e207e9a8 +Subproject commit 533be42f7da97e614ce1c494fafe3e49f53991b1 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 799e7a08dd..78050f965f 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 799e7a08dd171aa06a7395dd326f4243aaeb9f93 +Subproject commit 78050f965f2e550fd6e58f837394cb3d080d7d42 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 517b8dc244..780efda2ef 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 517b8dc244abf3e56f0089849e464af76f70b94e +Subproject commit 780efda2ef8d629495cc289624534ba8cde40779 diff --git a/vendor/revisions.json b/vendor/revisions.json index 8dde46a01e..1a811cfa3d 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.4", - "517b8dc244abf3e56f0089849e464af76f70b94e" + "780efda2ef8d629495cc289624534ba8cde40779" ], "v16": [ "16.8", - "799e7a08dd171aa06a7395dd326f4243aaeb9f93" + "78050f965f2e550fd6e58f837394cb3d080d7d42" ], "v15": [ "15.12", - "9b118b1cffa6e4ca0d63389b57b54d11e207e9a8" + "533be42f7da97e614ce1c494fafe3e49f53991b1" ], "v14": [ "14.17", - "6254ab9b4496c3e481bc037ae69d859bbc2bdd7d" + "b1425505c6f9a622a5aadf3ee362740519993310" ] } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 1b7c376560..183cc66ab9 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -18,7 +18,7 @@ 
license.workspace = true ahash = { version = "0.8" } anyhow = { version = "1", features = ["backtrace"] } base64-594e8ee84c453af0 = { package = "base64", version = "0.13", features = ["alloc"] } -base64-647d43efb71741da = { package = "base64", version = "0.21", features = ["alloc"] } +base64-647d43efb71741da = { package = "base64", version = "0.21" } base64ct = { version = "1", default-features = false, features = ["std"] } bytes = { version = "1", features = ["serde"] } camino = { version = "1", default-features = false, features = ["serde1"] }